diff --git a/.gitattributes b/.gitattributes
index a5905e27279b18bd791be2190749cbaa579ae920..4a2228877876ecc93631f751e80f64e2ffbd901d 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -38,3 +38,9 @@ checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-10650/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-6000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-7000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-8000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-9000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
index 9296ccc9d50720d62d591e1ba43165033a5c8819..d314d80b1172dc083bc91587b90829a50c241b6f 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ print(output["generated_text"])
## Training procedure
-[
](https://wandb.ai/ahmed-heakl/huggingface/runs/ma76091v)
+[
](https://wandb.ai/ahmed-heakl/huggingface/runs/8iigi6ha)
This model was trained with SFT.
diff --git a/checkpoint-1000/model-00002-of-00002.safetensors b/checkpoint-1000/model-00002-of-00002.safetensors
index 2020cf600ecec2842d0feaff8c9e558bf58124db..7bafbefb10bbb5e23f7a86a86f8c0211292420d6 100644
--- a/checkpoint-1000/model-00002-of-00002.safetensors
+++ b/checkpoint-1000/model-00002-of-00002.safetensors
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:a7a9e6dc11d15833075e5e64306d68f8d615232768ab82e58d82d3867d872b08
+oid sha256:625c065241e49b903540eb6942c2c6fa3f781a3f8f221e7296e0dc0d0ad81a06
size 1481790520
diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt
index 4961bdb2863a4f446133b24ecdb2aceb1ce82a9e..2a9bfa45bdb2517fe106b3fe388c992484f9ebf3 100644
--- a/checkpoint-1000/optimizer.pt
+++ b/checkpoint-1000/optimizer.pt
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:b2550bac9fbfaea5f705116e90c64524a127406e9a64746d3d4b7f3ceff064aa
+oid sha256:028ef400bd2e3463a5e55ce90dac80de32802b1d03b9b8071169baa5eb4412a4
size 44191162
diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json
index df491ed1b26b682fae6d6b145e331669dde9f359..f72e44e98692c041300d1ddc1ace3acf35a9e913 100644
--- a/checkpoint-1000/trainer_state.json
+++ b/checkpoint-1000/trainer_state.json
@@ -12,18 +12,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 31.0,
+ "avg_layers": 25.0,
"epoch": 0.009392427355444672,
- "f1_execute": 0.4864864945411682,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.40625,
+ "grad_norm": 2.25,
"learning_rate": 2e-06,
- "loss": 0.5484,
- "macro_f1": 0.1621621698141098,
+ "loss": 0.4974,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 3175.0,
"repeat_count": 0.0,
- "routers_loss": 0.503563642501831,
+ "routers_loss": 0.4339469373226166,
"skip_count": 0.0,
"step": 2,
"text_loss": 0.3330848515033722
@@ -31,18 +31,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 23.0,
"epoch": 0.018784854710889344,
- "f1_execute": 0.4864864945411682,
+ "f1_execute": 0.7272726893424988,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.9140625,
+ "grad_norm": 1.8359375,
"learning_rate": 6e-06,
- "loss": 0.536,
- "macro_f1": 0.1621621698141098,
+ "loss": 0.4988,
+ "macro_f1": 0.24242423474788666,
"num_tokens": 5816.0,
"repeat_count": 0.0,
- "routers_loss": 0.4589468538761139,
+ "routers_loss": 0.4511934816837311,
"skip_count": 1.0,
"step": 4,
"text_loss": 0.4571273922920227
@@ -50,37 +50,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 32.0,
+ "avg_layers": 28.0,
"epoch": 0.02817728206633402,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.6666666865348816,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.375,
+ "grad_norm": 2.234375,
"learning_rate": 1e-05,
- "loss": 0.5469,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.5113,
+ "macro_f1": 0.222222238779068,
"num_tokens": 9739.0,
"repeat_count": 0.0,
- "routers_loss": 0.5736724138259888,
+ "routers_loss": 0.49306994676589966,
"skip_count": 0.0,
"step": 6,
"text_loss": 0.41060560941696167
},
{
- "acc_repeat": 1.0,
- "acc_skip": 0.5,
- "avg_layers": 33.0,
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 0.03756970942177869,
- "f1_execute": 0.47058823704719543,
- "f1_repeat": 0.1538461595773697,
- "f1_skip": 0.222222238779068,
- "grad_norm": 1.8515625,
+ "f1_execute": 0.5641025900840759,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.7265625,
"learning_rate": 1.4e-05,
- "loss": 0.5291,
- "macro_f1": 0.28221890330314636,
+ "loss": 0.4766,
+ "macro_f1": 0.18803420662879944,
"num_tokens": 12869.0,
"repeat_count": 1.0,
- "routers_loss": 0.49970296025276184,
+ "routers_loss": 0.48872503638267517,
"skip_count": 2.0,
"step": 8,
"text_loss": 0.36678561568260193
@@ -88,37 +88,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 27.0,
"epoch": 0.046962136777223364,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.953125,
+ "grad_norm": 1.78125,
"learning_rate": 1.8e-05,
- "loss": 0.5316,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4806,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 15845.0,
"repeat_count": 0.0,
- "routers_loss": 0.5153562426567078,
+ "routers_loss": 0.45077216625213623,
"skip_count": 0.0,
"step": 10,
"text_loss": 0.5597779154777527
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 0.5,
"acc_skip": 0.3333333432674408,
- "avg_layers": 34.0,
+ "avg_layers": 26.0,
"epoch": 0.05635456413266804,
- "f1_execute": 0.5714285373687744,
- "f1_repeat": 0.0,
- "f1_skip": 0.25,
- "grad_norm": 1.6328125,
+ "f1_execute": 0.7179487347602844,
+ "f1_repeat": 0.2857142984867096,
+ "f1_skip": 0.20000000298023224,
+ "grad_norm": 1.5390625,
"learning_rate": 2.2e-05,
- "loss": 0.5051,
- "macro_f1": 0.2738095223903656,
+ "loss": 0.4557,
+ "macro_f1": 0.40122103691101074,
"num_tokens": 19353.0,
"repeat_count": 2.0,
- "routers_loss": 0.46214747428894043,
+ "routers_loss": 0.4130440056324005,
"skip_count": 3.0,
"step": 12,
"text_loss": 0.2056603729724884
@@ -126,37 +126,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 27.0,
"epoch": 0.06574699148811271,
- "f1_execute": 0.5263157486915588,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.671875,
+ "grad_norm": 2.4375,
"learning_rate": 2.6e-05,
- "loss": 0.5653,
- "macro_f1": 0.17543858289718628,
+ "loss": 0.5129,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 22675.0,
"repeat_count": 0.0,
- "routers_loss": 0.5300976634025574,
+ "routers_loss": 0.4582902193069458,
"skip_count": 0.0,
"step": 14,
"text_loss": 0.32989829778671265
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 34.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 0.07513941884355738,
- "f1_execute": 0.6153846383094788,
+ "f1_execute": 0.6829268336296082,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 1.8828125,
+ "f1_skip": 0.2222222238779068,
+ "grad_norm": 1.7421875,
"learning_rate": 3e-05,
- "loss": 0.5225,
- "macro_f1": 0.20512822270393372,
+ "loss": 0.4729,
+ "macro_f1": 0.3017163574695587,
"num_tokens": 26022.0,
"repeat_count": 0.0,
- "routers_loss": 0.473240464925766,
+ "routers_loss": 0.42910993099212646,
"skip_count": 1.0,
"step": 16,
"text_loss": 0.1353905349969864
@@ -164,18 +164,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 38.0,
+ "avg_layers": 27.0,
"epoch": 0.08453184619900206,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.7555555105209351,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.6015625,
+ "grad_norm": 1.4765625,
"learning_rate": 3.4000000000000007e-05,
- "loss": 0.4867,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4274,
+ "macro_f1": 0.2518518567085266,
"num_tokens": 29251.0,
"repeat_count": 0.0,
- "routers_loss": 0.4795944094657898,
+ "routers_loss": 0.3990713059902191,
"skip_count": 0.0,
"step": 18,
"text_loss": 0.3806765377521515
@@ -183,18 +183,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 36.0,
+ "avg_layers": 26.0,
"epoch": 0.09392427355444673,
- "f1_execute": 0.6153846383094788,
- "f1_repeat": 0.1538461595773697,
+ "f1_execute": 0.6829268336296082,
+ "f1_repeat": 0.2857142984867096,
"f1_skip": 0.0,
- "grad_norm": 1.3984375,
+ "grad_norm": 1.3125,
"learning_rate": 3.8e-05,
- "loss": 0.4718,
- "macro_f1": 0.25641027092933655,
+ "loss": 0.4261,
+ "macro_f1": 0.3228803873062134,
"num_tokens": 32545.0,
"repeat_count": 1.0,
- "routers_loss": 0.41872408986091614,
+ "routers_loss": 0.40146592259407043,
"skip_count": 0.0,
"step": 20,
"text_loss": 0.25648367404937744
@@ -202,18 +202,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 26.0,
"epoch": 0.1033167009098914,
- "f1_execute": 0.6341463327407837,
+ "f1_execute": 0.7272727489471436,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.7734375,
+ "grad_norm": 1.625,
"learning_rate": 4.2000000000000004e-05,
- "loss": 0.4472,
- "macro_f1": 0.21138212084770203,
+ "loss": 0.404,
+ "macro_f1": 0.24242424964904785,
"num_tokens": 36560.0,
"repeat_count": 0.0,
- "routers_loss": 0.4152105450630188,
+ "routers_loss": 0.372715026140213,
"skip_count": 0.0,
"step": 22,
"text_loss": 0.2799522578716278
@@ -221,18 +221,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 32.0,
+ "avg_layers": 27.0,
"epoch": 0.11270912826533608,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.7555555105209351,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.8046875,
+ "grad_norm": 1.6328125,
"learning_rate": 4.6e-05,
- "loss": 0.4554,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4218,
+ "macro_f1": 0.2518518567085266,
"num_tokens": 39597.0,
"repeat_count": 0.0,
- "routers_loss": 0.47541096806526184,
+ "routers_loss": 0.4504941403865814,
"skip_count": 0.0,
"step": 24,
"text_loss": 0.6635695695877075
@@ -240,18 +240,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 34.0,
+ "avg_layers": 27.0,
"epoch": 0.12210155562078075,
- "f1_execute": 0.7826087474822998,
+ "f1_execute": 0.8085106015205383,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.875,
+ "grad_norm": 1.7109375,
"learning_rate": 5e-05,
- "loss": 0.4182,
- "macro_f1": 0.2608695924282074,
+ "loss": 0.3886,
+ "macro_f1": 0.26950353384017944,
"num_tokens": 43080.0,
"repeat_count": 0.0,
- "routers_loss": 0.37319275736808777,
+ "routers_loss": 0.3498791456222534,
"skip_count": 0.0,
"step": 26,
"text_loss": 0.7035041451454163
@@ -259,18 +259,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 0.13149398297622542,
- "f1_execute": 0.7826087474822998,
+ "f1_execute": 0.8085106015205383,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.4375,
+ "grad_norm": 1.34375,
"learning_rate": 5.4e-05,
- "loss": 0.3991,
- "macro_f1": 0.2608695924282074,
+ "loss": 0.3724,
+ "macro_f1": 0.26950353384017944,
"num_tokens": 46406.0,
"repeat_count": 0.0,
- "routers_loss": 0.3604123294353485,
+ "routers_loss": 0.31265875697135925,
"skip_count": 0.0,
"step": 28,
"text_loss": 0.6388277411460876
@@ -280,16 +280,16 @@
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.1408864103316701,
- "f1_execute": 0.8979591727256775,
+ "f1_execute": 0.8571428060531616,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.421875,
+ "grad_norm": 1.2578125,
"learning_rate": 5.800000000000001e-05,
- "loss": 0.3827,
- "macro_f1": 0.2993197441101074,
+ "loss": 0.341,
+ "macro_f1": 0.2857142686843872,
"num_tokens": 49966.0,
"repeat_count": 0.0,
- "routers_loss": 0.35880225896835327,
+ "routers_loss": 0.3200918138027191,
"skip_count": 2.0,
"step": 30,
"text_loss": 0.17372547090053558
@@ -297,18 +297,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 24.0,
+ "avg_layers": 25.0,
"epoch": 0.15027883768711475,
- "f1_execute": 0.9200000166893005,
+ "f1_execute": 0.8571428060531616,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.4609375,
+ "grad_norm": 1.4140625,
"learning_rate": 6.2e-05,
- "loss": 0.3452,
- "macro_f1": 0.30666667222976685,
+ "loss": 0.3207,
+ "macro_f1": 0.2857142686843872,
"num_tokens": 53378.0,
"repeat_count": 1.0,
- "routers_loss": 0.31086465716362,
+ "routers_loss": 0.32304447889328003,
"skip_count": 1.0,
"step": 32,
"text_loss": 0.18196581304073334
@@ -316,18 +316,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 25.0,
"epoch": 0.15967126504255943,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.3671875,
+ "grad_norm": 1.46875,
"learning_rate": 6.6e-05,
- "loss": 0.3283,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.3304,
+ "macro_f1": 0.3006536364555359,
"num_tokens": 56933.0,
"repeat_count": 0.0,
- "routers_loss": 0.2674171030521393,
+ "routers_loss": 0.24814388155937195,
"skip_count": 0.0,
"step": 34,
"text_loss": 0.28823015093803406
@@ -335,18 +335,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.16906369239800412,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.1015625,
+ "grad_norm": 1.1171875,
"learning_rate": 7.000000000000001e-05,
- "loss": 0.2849,
- "macro_f1": 0.3205128312110901,
+ "loss": 0.2778,
+ "macro_f1": 0.3006536066532135,
"num_tokens": 60744.0,
"repeat_count": 1.0,
- "routers_loss": 0.24587315320968628,
+ "routers_loss": 0.22411039471626282,
"skip_count": 0.0,
"step": 36,
"text_loss": 0.5260357856750488
@@ -354,18 +354,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 31.0,
+ "avg_layers": 27.0,
"epoch": 0.17845611975344877,
- "f1_execute": 0.8085106015205383,
+ "f1_execute": 0.8571428656578064,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.3046875,
+ "grad_norm": 1.484375,
"learning_rate": 7.4e-05,
- "loss": 0.2616,
- "macro_f1": 0.26950353384017944,
+ "loss": 0.2738,
+ "macro_f1": 0.2857142984867096,
"num_tokens": 64900.0,
"repeat_count": 0.0,
- "routers_loss": 0.32050269842147827,
+ "routers_loss": 0.44355395436286926,
"skip_count": 0.0,
"step": 38,
"text_loss": 0.5382097363471985
@@ -373,18 +373,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.18784854710889345,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.1796875,
+ "grad_norm": 1.3828125,
"learning_rate": 7.8e-05,
- "loss": 0.2084,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.2137,
+ "macro_f1": 0.3076923191547394,
"num_tokens": 68000.0,
"repeat_count": 0.0,
- "routers_loss": 0.15196125209331512,
+ "routers_loss": 0.202330082654953,
"skip_count": 0.0,
"step": 40,
"text_loss": 0.5946118831634521
@@ -392,18 +392,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 25.0,
"epoch": 0.19724097446433814,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.61328125,
+ "grad_norm": 0.78125,
"learning_rate": 8.2e-05,
- "loss": 0.1947,
+ "loss": 0.21,
"macro_f1": 0.3144654333591461,
"num_tokens": 70529.0,
"repeat_count": 0.0,
- "routers_loss": 0.14121046662330627,
+ "routers_loss": 0.18023855984210968,
"skip_count": 0.0,
"step": 42,
"text_loss": 0.5550904273986816
@@ -416,13 +416,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.50390625,
+ "grad_norm": 0.609375,
"learning_rate": 8.599999999999999e-05,
- "loss": 0.1884,
+ "loss": 0.1918,
"macro_f1": 0.32098764181137085,
"num_tokens": 73427.0,
"repeat_count": 2.0,
- "routers_loss": 0.21312278509140015,
+ "routers_loss": 0.2101590931415558,
"skip_count": 0.0,
"step": 44,
"text_loss": 0.4636923372745514
@@ -435,13 +435,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.45703125,
+ "grad_norm": 0.53125,
"learning_rate": 8.999999999999999e-05,
- "loss": 0.166,
+ "loss": 0.1881,
"macro_f1": 0.3333333432674408,
"num_tokens": 76472.0,
"repeat_count": 0.0,
- "routers_loss": 0.1184137836098671,
+ "routers_loss": 0.11800424009561539,
"skip_count": 0.0,
"step": 46,
"text_loss": 0.4187001883983612
@@ -454,13 +454,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.62890625,
+ "grad_norm": 0.953125,
"learning_rate": 9.400000000000001e-05,
- "loss": 0.1313,
+ "loss": 0.1446,
"macro_f1": 0.3272727429866791,
"num_tokens": 79124.0,
"repeat_count": 1.0,
- "routers_loss": 0.10897563397884369,
+ "routers_loss": 0.11632519960403442,
"skip_count": 0.0,
"step": 48,
"text_loss": 0.2253919243812561
@@ -468,18 +468,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 0.2348106838861168,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.4375,
+ "grad_norm": 0.58984375,
"learning_rate": 9.800000000000001e-05,
- "loss": 0.1531,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.1543,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 81980.0,
"repeat_count": 1.0,
- "routers_loss": 0.09979952871799469,
+ "routers_loss": 0.09669367223978043,
"skip_count": 0.0,
"step": 50,
"text_loss": 0.6053179502487183
@@ -487,18 +487,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.2442031112415615,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.515625,
+ "grad_norm": 0.8515625,
"learning_rate": 0.000102,
- "loss": 0.1265,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.1393,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 85236.0,
"repeat_count": 0.0,
- "routers_loss": 0.05543195456266403,
+ "routers_loss": 0.12471720576286316,
"skip_count": 0.0,
"step": 52,
"text_loss": 0.6027331948280334
@@ -511,13 +511,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.328125,
+ "grad_norm": 0.421875,
"learning_rate": 0.000106,
- "loss": 0.1436,
+ "loss": 0.1473,
"macro_f1": 0.32098764181137085,
"num_tokens": 88238.0,
"repeat_count": 0.0,
- "routers_loss": 0.15049344301223755,
+ "routers_loss": 0.1376056969165802,
"skip_count": 2.0,
"step": 54,
"text_loss": 0.2861751616001129
@@ -530,13 +530,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.263671875,
+ "grad_norm": 0.35546875,
"learning_rate": 0.00011,
- "loss": 0.1021,
+ "loss": 0.1082,
"macro_f1": 0.3333333432674408,
"num_tokens": 91056.0,
"repeat_count": 0.0,
- "routers_loss": 0.07367338240146637,
+ "routers_loss": 0.07449393719434738,
"skip_count": 0.0,
"step": 56,
"text_loss": 0.48106974363327026
@@ -544,18 +544,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 26.0,
"epoch": 0.2723803933078955,
- "f1_execute": 1.0,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000114,
- "loss": 0.114,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.1123,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 94987.0,
"repeat_count": 0.0,
- "routers_loss": 0.03782692551612854,
+ "routers_loss": 0.07064720243215561,
"skip_count": 0.0,
"step": 58,
"text_loss": 0.3554874658584595
@@ -568,13 +568,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.333984375,
+ "grad_norm": 0.5390625,
"learning_rate": 0.000118,
- "loss": 0.1197,
+ "loss": 0.1234,
"macro_f1": 0.32098764181137085,
"num_tokens": 97909.0,
"repeat_count": 0.0,
- "routers_loss": 0.14074955880641937,
+ "routers_loss": 0.16835889220237732,
"skip_count": 2.0,
"step": 60,
"text_loss": 0.5475804805755615
@@ -587,13 +587,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000122,
- "loss": 0.1174,
+ "loss": 0.1224,
"macro_f1": 0.3333333432674408,
"num_tokens": 101043.0,
"repeat_count": 0.0,
- "routers_loss": 0.058013737201690674,
+ "routers_loss": 0.06127442046999931,
"skip_count": 0.0,
"step": 62,
"text_loss": 0.5966938734054565
@@ -606,13 +606,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000126,
- "loss": 0.0911,
+ "loss": 0.0931,
"macro_f1": 0.3333333432674408,
"num_tokens": 104103.0,
"repeat_count": 0.0,
- "routers_loss": 0.04936821386218071,
+ "routers_loss": 0.047825805842876434,
"skip_count": 0.0,
"step": 64,
"text_loss": 0.5480486750602722
@@ -625,13 +625,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.220703125,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.00013000000000000002,
- "loss": 0.1107,
+ "loss": 0.1088,
"macro_f1": 0.3006536364555359,
"num_tokens": 107009.0,
"repeat_count": 1.0,
- "routers_loss": 0.2628525495529175,
+ "routers_loss": 0.275174081325531,
"skip_count": 4.0,
"step": 66,
"text_loss": 0.41714492440223694
@@ -644,13 +644,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.000134,
- "loss": 0.1109,
+ "loss": 0.1123,
"macro_f1": 0.3333333432674408,
"num_tokens": 110486.0,
"repeat_count": 0.0,
- "routers_loss": 0.02859785594046116,
+ "routers_loss": 0.029025178402662277,
"skip_count": 0.0,
"step": 68,
"text_loss": 0.6775627732276917
@@ -663,13 +663,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.298828125,
+ "grad_norm": 0.314453125,
"learning_rate": 0.00013800000000000002,
- "loss": 0.1067,
+ "loss": 0.1049,
"macro_f1": 0.3272727429866791,
"num_tokens": 113878.0,
"repeat_count": 0.0,
- "routers_loss": 0.10459086298942566,
+ "routers_loss": 0.10141710191965103,
"skip_count": 1.0,
"step": 70,
"text_loss": 0.6678873896598816
@@ -682,13 +682,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2109375,
+ "grad_norm": 0.248046875,
"learning_rate": 0.00014199999999999998,
- "loss": 0.1166,
+ "loss": 0.1119,
"macro_f1": 0.3272727429866791,
"num_tokens": 116989.0,
"repeat_count": 0.0,
- "routers_loss": 0.0718551054596901,
+ "routers_loss": 0.08002066612243652,
"skip_count": 1.0,
"step": 72,
"text_loss": 0.405692994594574
@@ -701,13 +701,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1787109375,
"learning_rate": 0.000146,
- "loss": 0.1007,
+ "loss": 0.0944,
"macro_f1": 0.3144654333591461,
"num_tokens": 119883.0,
"repeat_count": 0.0,
- "routers_loss": 0.1850946843624115,
+ "routers_loss": 0.1867009848356247,
"skip_count": 3.0,
"step": 74,
"text_loss": 0.44616150856018066
@@ -720,13 +720,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.34375,
+ "grad_norm": 0.333984375,
"learning_rate": 0.00015,
- "loss": 0.1019,
+ "loss": 0.1003,
"macro_f1": 0.32098764181137085,
"num_tokens": 123325.0,
"repeat_count": 0.0,
- "routers_loss": 0.09809529036283493,
+ "routers_loss": 0.07042168825864792,
"skip_count": 2.0,
"step": 76,
"text_loss": 0.11340200901031494
@@ -739,13 +739,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.259765625,
+ "grad_norm": 0.26171875,
"learning_rate": 0.000154,
- "loss": 0.1088,
+ "loss": 0.1066,
"macro_f1": 0.32098764181137085,
"num_tokens": 126131.0,
"repeat_count": 0.0,
- "routers_loss": 0.11277207732200623,
+ "routers_loss": 0.11535373330116272,
"skip_count": 2.0,
"step": 78,
"text_loss": 0.3269135355949402
@@ -758,13 +758,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "grad_norm": 0.255859375,
"learning_rate": 0.000158,
- "loss": 0.0866,
+ "loss": 0.0891,
"macro_f1": 0.3272727429866791,
"num_tokens": 130349.0,
"repeat_count": 0.0,
- "routers_loss": 0.09079254418611526,
+ "routers_loss": 0.09497501701116562,
"skip_count": 1.0,
"step": 80,
"text_loss": 0.15273472666740417
@@ -777,13 +777,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000162,
- "loss": 0.0928,
+ "loss": 0.0929,
"macro_f1": 0.3333333432674408,
"num_tokens": 133607.0,
"repeat_count": 0.0,
- "routers_loss": 0.02900076098740101,
+ "routers_loss": 0.030639523640275,
"skip_count": 0.0,
"step": 82,
"text_loss": 0.282884806394577
@@ -796,13 +796,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.00016600000000000002,
- "loss": 0.1251,
+ "loss": 0.1254,
"macro_f1": 0.3272727429866791,
"num_tokens": 136694.0,
"repeat_count": 0.0,
- "routers_loss": 0.0763339251279831,
+ "routers_loss": 0.07906441390514374,
"skip_count": 1.0,
"step": 84,
"text_loss": 0.459094375371933
@@ -817,11 +817,11 @@
"f1_skip": 0.0,
"grad_norm": 0.212890625,
"learning_rate": 0.00017,
- "loss": 0.1064,
+ "loss": 0.1071,
"macro_f1": 0.3144654333591461,
"num_tokens": 139966.0,
"repeat_count": 1.0,
- "routers_loss": 0.13191410899162292,
+ "routers_loss": 0.1124570444226265,
"skip_count": 2.0,
"step": 86,
"text_loss": 0.29985448718070984
@@ -834,13 +834,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.25390625,
"learning_rate": 0.000174,
- "loss": 0.1055,
+ "loss": 0.1031,
"macro_f1": 0.32098764181137085,
"num_tokens": 142788.0,
"repeat_count": 2.0,
- "routers_loss": 0.21200031042099,
+ "routers_loss": 0.1966402679681778,
"skip_count": 0.0,
"step": 88,
"text_loss": 0.6435291767120361
@@ -853,13 +853,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.318359375,
+ "grad_norm": 0.349609375,
"learning_rate": 0.000178,
- "loss": 0.0971,
+ "loss": 0.0963,
"macro_f1": 0.3333333432674408,
"num_tokens": 146192.0,
"repeat_count": 0.0,
- "routers_loss": 0.031911369413137436,
+ "routers_loss": 0.0325632207095623,
"skip_count": 0.0,
"step": 90,
"text_loss": 0.35170626640319824
@@ -872,13 +872,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.2265625,
"learning_rate": 0.000182,
- "loss": 0.1056,
+ "loss": 0.1073,
"macro_f1": 0.32098764181137085,
"num_tokens": 149792.0,
"repeat_count": 1.0,
- "routers_loss": 0.14131835103034973,
+ "routers_loss": 0.15115146338939667,
"skip_count": 1.0,
"step": 92,
"text_loss": 0.83159339427948
@@ -891,13 +891,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.205078125,
"learning_rate": 0.000186,
- "loss": 0.1059,
+ "loss": 0.1073,
"macro_f1": 0.3333333432674408,
"num_tokens": 152766.0,
"repeat_count": 0.0,
- "routers_loss": 0.04137955233454704,
+ "routers_loss": 0.043313540518283844,
"skip_count": 0.0,
"step": 94,
"text_loss": 0.49707934260368347
@@ -910,13 +910,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.00019,
- "loss": 0.0934,
+ "loss": 0.0947,
"macro_f1": 0.3333333432674408,
"num_tokens": 156112.0,
"repeat_count": 0.0,
- "routers_loss": 0.03163003921508789,
+ "routers_loss": 0.032021280378103256,
"skip_count": 0.0,
"step": 96,
"text_loss": 0.27608928084373474
@@ -929,13 +929,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.2099609375,
"learning_rate": 0.000194,
- "loss": 0.0847,
+ "loss": 0.0846,
"macro_f1": 0.3076923191547394,
"num_tokens": 159454.0,
"repeat_count": 2.0,
- "routers_loss": 0.2567490339279175,
+ "routers_loss": 0.24473154544830322,
"skip_count": 2.0,
"step": 98,
"text_loss": 0.6026689410209656
@@ -948,13 +948,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.30859375,
+ "grad_norm": 0.271484375,
"learning_rate": 0.00019800000000000002,
- "loss": 0.1077,
+ "loss": 0.1028,
"macro_f1": 0.32098764181137085,
"num_tokens": 163661.0,
"repeat_count": 0.0,
- "routers_loss": 0.11468870937824249,
+ "routers_loss": 0.11468276381492615,
"skip_count": 2.0,
"step": 100,
"text_loss": 0.46733155846595764
@@ -967,13 +967,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.17578125,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.000202,
- "loss": 0.1131,
+ "loss": 0.1089,
"macro_f1": 0.3333333432674408,
"num_tokens": 167134.0,
"repeat_count": 0.0,
- "routers_loss": 0.02124219387769699,
+ "routers_loss": 0.021144939586520195,
"skip_count": 0.0,
"step": 102,
"text_loss": 0.6362994909286499
@@ -986,13 +986,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000206,
- "loss": 0.0624,
+ "loss": 0.0621,
"macro_f1": 0.3272727429866791,
"num_tokens": 170433.0,
"repeat_count": 0.0,
- "routers_loss": 0.06983796507120132,
+ "routers_loss": 0.06594710797071457,
"skip_count": 1.0,
"step": 104,
"text_loss": 0.4515477120876312
@@ -1005,13 +1005,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.00021,
- "loss": 0.0951,
+ "loss": 0.0929,
"macro_f1": 0.3333333432674408,
"num_tokens": 173387.0,
"repeat_count": 0.0,
- "routers_loss": 0.03467355668544769,
+ "routers_loss": 0.032923027873039246,
"skip_count": 0.0,
"step": 106,
"text_loss": 0.6638453006744385
@@ -1024,13 +1024,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.240234375,
"learning_rate": 0.000214,
- "loss": 0.0881,
+ "loss": 0.0883,
"macro_f1": 0.3272727429866791,
"num_tokens": 176170.0,
"repeat_count": 1.0,
- "routers_loss": 0.08142061531543732,
+ "routers_loss": 0.08034781366586685,
"skip_count": 0.0,
"step": 108,
"text_loss": 1.186936855316162
@@ -1043,13 +1043,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.267578125,
"learning_rate": 0.000218,
- "loss": 0.0795,
+ "loss": 0.0794,
"macro_f1": 0.3272727429866791,
"num_tokens": 179877.0,
"repeat_count": 0.0,
- "routers_loss": 0.08327355235815048,
+ "routers_loss": 0.07814185321331024,
"skip_count": 1.0,
"step": 110,
"text_loss": 0.5488709211349487
@@ -1062,13 +1062,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000222,
- "loss": 0.0943,
+ "loss": 0.0946,
"macro_f1": 0.3333333432674408,
"num_tokens": 182726.0,
"repeat_count": 0.0,
- "routers_loss": 0.019890006631612778,
+ "routers_loss": 0.01884695515036583,
"skip_count": 0.0,
"step": 112,
"text_loss": 0.5195863842964172
@@ -1081,13 +1081,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2001953125,
+ "grad_norm": 0.19921875,
"learning_rate": 0.00022600000000000002,
- "loss": 0.0933,
+ "loss": 0.0974,
"macro_f1": 0.32098764181137085,
"num_tokens": 185624.0,
"repeat_count": 0.0,
- "routers_loss": 0.09992363303899765,
+ "routers_loss": 0.09657823294401169,
"skip_count": 2.0,
"step": 114,
"text_loss": 0.43858134746551514
@@ -1100,13 +1100,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.3046875,
"learning_rate": 0.00023,
- "loss": 0.0762,
+ "loss": 0.0753,
"macro_f1": 0.3333333432674408,
"num_tokens": 188155.0,
"repeat_count": 0.0,
- "routers_loss": 0.014119029976427555,
+ "routers_loss": 0.01463601179420948,
"skip_count": 0.0,
"step": 116,
"text_loss": 0.392981618642807
@@ -1119,13 +1119,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.423828125,
+ "grad_norm": 0.439453125,
"learning_rate": 0.00023400000000000002,
- "loss": 0.0842,
+ "loss": 0.0843,
"macro_f1": 0.3333333432674408,
"num_tokens": 190970.0,
"repeat_count": 0.0,
- "routers_loss": 0.03976766765117645,
+ "routers_loss": 0.03859659656882286,
"skip_count": 0.0,
"step": 118,
"text_loss": 0.309179425239563
@@ -1138,13 +1138,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.2255859375,
"learning_rate": 0.00023799999999999998,
- "loss": 0.0517,
+ "loss": 0.053,
"macro_f1": 0.3333333432674408,
"num_tokens": 193988.0,
"repeat_count": 0.0,
- "routers_loss": 0.017428619787096977,
+ "routers_loss": 0.019092386588454247,
"skip_count": 0.0,
"step": 120,
"text_loss": 0.48543134331703186
@@ -1157,13 +1157,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.296875,
+ "grad_norm": 0.35546875,
"learning_rate": 0.000242,
- "loss": 0.1134,
+ "loss": 0.1203,
"macro_f1": 0.3272727429866791,
"num_tokens": 196475.0,
"repeat_count": 0.0,
- "routers_loss": 0.06965513527393341,
+ "routers_loss": 0.0619138665497303,
"skip_count": 1.0,
"step": 122,
"text_loss": 0.4615364074707031
@@ -1176,13 +1176,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1875,
"learning_rate": 0.000246,
- "loss": 0.0984,
+ "loss": 0.1002,
"macro_f1": 0.3272727429866791,
"num_tokens": 200045.0,
"repeat_count": 1.0,
- "routers_loss": 0.10476501286029816,
+ "routers_loss": 0.09752107411623001,
"skip_count": 0.0,
"step": 124,
"text_loss": 0.15802054107189178
@@ -1195,13 +1195,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.00025,
- "loss": 0.0771,
+ "loss": 0.0773,
"macro_f1": 0.3333333432674408,
"num_tokens": 203214.0,
"repeat_count": 0.0,
- "routers_loss": 0.028317544609308243,
+ "routers_loss": 0.02896115928888321,
"skip_count": 0.0,
"step": 126,
"text_loss": 0.4543360471725464
@@ -1214,13 +1214,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.390625,
+ "grad_norm": 0.4296875,
"learning_rate": 0.000254,
- "loss": 0.0933,
+ "loss": 0.0973,
"macro_f1": 0.3333333432674408,
"num_tokens": 206168.0,
"repeat_count": 0.0,
- "routers_loss": 0.012766432017087936,
+ "routers_loss": 0.011423567309975624,
"skip_count": 0.0,
"step": 128,
"text_loss": 0.4730179011821747
@@ -1233,13 +1233,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.365234375,
"learning_rate": 0.00025800000000000004,
- "loss": 0.0989,
+ "loss": 0.099,
"macro_f1": 0.3333333432674408,
"num_tokens": 209907.0,
"repeat_count": 0.0,
- "routers_loss": 0.021400077268481255,
+ "routers_loss": 0.01957600563764572,
"skip_count": 0.0,
"step": 130,
"text_loss": 0.45122358202934265
@@ -1252,13 +1252,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.2060546875,
"learning_rate": 0.000262,
- "loss": 0.0873,
+ "loss": 0.0868,
"macro_f1": 0.3272727429866791,
"num_tokens": 213521.0,
"repeat_count": 0.0,
- "routers_loss": 0.05025051161646843,
+ "routers_loss": 0.04882373288273811,
"skip_count": 1.0,
"step": 132,
"text_loss": 0.4341491758823395
@@ -1271,13 +1271,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.000266,
- "loss": 0.085,
+ "loss": 0.0834,
"macro_f1": 0.3333333432674408,
"num_tokens": 216484.0,
"repeat_count": 0.0,
- "routers_loss": 0.017420046031475067,
+ "routers_loss": 0.016083380207419395,
"skip_count": 0.0,
"step": 134,
"text_loss": 0.46990111470222473
@@ -1290,13 +1290,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2041015625,
+ "grad_norm": 0.220703125,
"learning_rate": 0.00027,
- "loss": 0.086,
+ "loss": 0.0863,
"macro_f1": 0.3333333432674408,
"num_tokens": 219398.0,
"repeat_count": 0.0,
- "routers_loss": 0.018217921257019043,
+ "routers_loss": 0.01733536459505558,
"skip_count": 0.0,
"step": 136,
"text_loss": 0.4455361068248749
@@ -1309,13 +1309,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.00027400000000000005,
- "loss": 0.0985,
+ "loss": 0.0997,
"macro_f1": 0.3333333432674408,
"num_tokens": 222430.0,
"repeat_count": 0.0,
- "routers_loss": 0.012350660748779774,
+ "routers_loss": 0.01332803163677454,
"skip_count": 0.0,
"step": 138,
"text_loss": 0.47699397802352905
@@ -1328,13 +1328,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.302734375,
+ "grad_norm": 0.333984375,
"learning_rate": 0.00027800000000000004,
"loss": 0.0922,
"macro_f1": 0.3144654333591461,
"num_tokens": 225458.0,
"repeat_count": 1.0,
- "routers_loss": 0.14993029832839966,
+ "routers_loss": 0.14924728870391846,
"skip_count": 2.0,
"step": 140,
"text_loss": 0.5858222842216492
@@ -1347,13 +1347,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.251953125,
+ "grad_norm": 0.25,
"learning_rate": 0.00028199999999999997,
- "loss": 0.0791,
+ "loss": 0.0798,
"macro_f1": 0.3144654333591461,
"num_tokens": 229365.0,
"repeat_count": 1.0,
- "routers_loss": 0.17921413481235504,
+ "routers_loss": 0.1860177218914032,
"skip_count": 2.0,
"step": 142,
"text_loss": 0.5003137588500977
@@ -1366,13 +1366,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.00028599999999999996,
- "loss": 0.0535,
+ "loss": 0.054,
"macro_f1": 0.32098764181137085,
"num_tokens": 231787.0,
"repeat_count": 1.0,
- "routers_loss": 0.1420905590057373,
+ "routers_loss": 0.16498211026191711,
"skip_count": 1.0,
"step": 144,
"text_loss": 0.5026470422744751
@@ -1385,13 +1385,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.29296875,
+ "grad_norm": 0.306640625,
"learning_rate": 0.00029,
- "loss": 0.0956,
+ "loss": 0.0936,
"macro_f1": 0.32098764181137085,
"num_tokens": 235014.0,
"repeat_count": 1.0,
- "routers_loss": 0.12468750029802322,
+ "routers_loss": 0.11801310628652573,
"skip_count": 1.0,
"step": 146,
"text_loss": 0.611888587474823
@@ -1404,13 +1404,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.000294,
- "loss": 0.0879,
+ "loss": 0.0878,
"macro_f1": 0.3333333432674408,
"num_tokens": 238210.0,
"repeat_count": 0.0,
- "routers_loss": 0.024295611307024956,
+ "routers_loss": 0.02422776259481907,
"skip_count": 0.0,
"step": 148,
"text_loss": 0.2876914143562317
@@ -1423,13 +1423,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.000298,
- "loss": 0.087,
+ "loss": 0.0858,
"macro_f1": 0.32098764181137085,
"num_tokens": 241582.0,
"repeat_count": 0.0,
- "routers_loss": 0.07016433775424957,
+ "routers_loss": 0.07282499223947525,
"skip_count": 2.0,
"step": 150,
"text_loss": 0.3919292390346527
@@ -1442,13 +1442,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3828125,
+ "grad_norm": 0.37890625,
"learning_rate": 0.000302,
- "loss": 0.0782,
+ "loss": 0.0797,
"macro_f1": 0.32098764181137085,
"num_tokens": 244621.0,
"repeat_count": 1.0,
- "routers_loss": 0.18942493200302124,
+ "routers_loss": 0.20659038424491882,
"skip_count": 1.0,
"step": 152,
"text_loss": 0.4294498860836029
@@ -1461,13 +1461,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1787109375,
"learning_rate": 0.000306,
- "loss": 0.0713,
+ "loss": 0.072,
"macro_f1": 0.3333333432674408,
"num_tokens": 247833.0,
"repeat_count": 0.0,
- "routers_loss": 0.02319060079753399,
+ "routers_loss": 0.02428400330245495,
"skip_count": 0.0,
"step": 154,
"text_loss": 0.5930765867233276
@@ -1480,13 +1480,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15234375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.00031,
- "loss": 0.0778,
+ "loss": 0.0772,
"macro_f1": 0.3333333432674408,
"num_tokens": 251349.0,
"repeat_count": 0.0,
- "routers_loss": 0.01764747127890587,
+ "routers_loss": 0.0167869683355093,
"skip_count": 0.0,
"step": 156,
"text_loss": 0.41063904762268066
@@ -1499,13 +1499,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.000314,
- "loss": 0.0829,
+ "loss": 0.0821,
"macro_f1": 0.3333333432674408,
"num_tokens": 254886.0,
"repeat_count": 0.0,
- "routers_loss": 0.02268100716173649,
+ "routers_loss": 0.02531604655086994,
"skip_count": 0.0,
"step": 158,
"text_loss": 0.6739020347595215
@@ -1518,13 +1518,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.201171875,
"learning_rate": 0.00031800000000000003,
- "loss": 0.0889,
+ "loss": 0.09,
"macro_f1": 0.3333333432674408,
"num_tokens": 258260.0,
"repeat_count": 0.0,
- "routers_loss": 0.016952091827988625,
+ "routers_loss": 0.017772775143384933,
"skip_count": 0.0,
"step": 160,
"text_loss": 0.46873849630355835
@@ -1537,13 +1537,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2216796875,
+ "grad_norm": 0.224609375,
"learning_rate": 0.000322,
- "loss": 0.0923,
+ "loss": 0.0893,
"macro_f1": 0.3272727429866791,
"num_tokens": 261846.0,
"repeat_count": 0.0,
- "routers_loss": 0.03669808804988861,
+ "routers_loss": 0.034902360290288925,
"skip_count": 1.0,
"step": 162,
"text_loss": 0.3727971017360687
@@ -1556,13 +1556,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000326,
- "loss": 0.0769,
+ "loss": 0.076,
"macro_f1": 0.3333333432674408,
"num_tokens": 264348.0,
"repeat_count": 0.0,
- "routers_loss": 0.012101447209715843,
+ "routers_loss": 0.013553355820477009,
"skip_count": 0.0,
"step": 164,
"text_loss": 0.5798237323760986
@@ -1575,13 +1575,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.37109375,
+ "grad_norm": 0.408203125,
"learning_rate": 0.00033,
- "loss": 0.0897,
+ "loss": 0.0926,
"macro_f1": 0.32098764181137085,
"num_tokens": 267479.0,
"repeat_count": 1.0,
- "routers_loss": 0.1562056541442871,
+ "routers_loss": 0.13571743667125702,
"skip_count": 1.0,
"step": 166,
"text_loss": 0.8084776997566223
@@ -1594,13 +1594,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2431640625,
"learning_rate": 0.00033400000000000004,
- "loss": 0.0829,
+ "loss": 0.0817,
"macro_f1": 0.32098764181137085,
"num_tokens": 270268.0,
"repeat_count": 2.0,
- "routers_loss": 0.20807914435863495,
+ "routers_loss": 0.19884146749973297,
"skip_count": 0.0,
"step": 168,
"text_loss": 0.7366134524345398
@@ -1613,13 +1613,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.267578125,
"learning_rate": 0.00033800000000000003,
- "loss": 0.0987,
+ "loss": 0.1022,
"macro_f1": 0.32098764181137085,
"num_tokens": 273518.0,
"repeat_count": 1.0,
- "routers_loss": 0.1530539095401764,
+ "routers_loss": 0.15469175577163696,
"skip_count": 1.0,
"step": 170,
"text_loss": 0.27204006910324097
@@ -1632,13 +1632,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000342,
- "loss": 0.087,
+ "loss": 0.0865,
"macro_f1": 0.32098764181137085,
"num_tokens": 277210.0,
"repeat_count": 0.0,
- "routers_loss": 0.08004544675350189,
+ "routers_loss": 0.08603330701589584,
"skip_count": 2.0,
"step": 172,
"text_loss": 0.7137667536735535
@@ -1651,13 +1651,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1767578125,
+ "grad_norm": 0.189453125,
"learning_rate": 0.000346,
- "loss": 0.0916,
+ "loss": 0.0902,
"macro_f1": 0.3076923191547394,
"num_tokens": 280389.0,
"repeat_count": 0.0,
- "routers_loss": 0.19228078424930573,
+ "routers_loss": 0.17851492762565613,
"skip_count": 4.0,
"step": 174,
"text_loss": 0.5148105621337891
@@ -1670,13 +1670,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.00035,
- "loss": 0.0863,
+ "loss": 0.0853,
"macro_f1": 0.3333333432674408,
"num_tokens": 283501.0,
"repeat_count": 0.0,
- "routers_loss": 0.024507170543074608,
+ "routers_loss": 0.021331604570150375,
"skip_count": 0.0,
"step": 176,
"text_loss": 0.301013320684433
@@ -1689,13 +1689,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000354,
- "loss": 0.0898,
+ "loss": 0.0911,
"macro_f1": 0.32098764181137085,
"num_tokens": 287154.0,
"repeat_count": 0.0,
- "routers_loss": 0.05055495724081993,
+ "routers_loss": 0.057273946702480316,
"skip_count": 2.0,
"step": 178,
"text_loss": 0.4740981459617615
@@ -1708,13 +1708,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.240234375,
"learning_rate": 0.000358,
- "loss": 0.0865,
+ "loss": 0.0904,
"macro_f1": 0.3272727429866791,
"num_tokens": 289929.0,
"repeat_count": 0.0,
- "routers_loss": 0.03999815881252289,
+ "routers_loss": 0.04116598889231682,
"skip_count": 1.0,
"step": 180,
"text_loss": 0.4838573932647705
@@ -1727,13 +1727,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.14453125,
"learning_rate": 0.000362,
- "loss": 0.0983,
+ "loss": 0.0991,
"macro_f1": 0.3333333432674408,
"num_tokens": 294293.0,
"repeat_count": 0.0,
- "routers_loss": 0.025158070027828217,
+ "routers_loss": 0.027111956849694252,
"skip_count": 0.0,
"step": 182,
"text_loss": 0.7495553493499756
@@ -1746,32 +1746,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.158203125,
"learning_rate": 0.000366,
- "loss": 0.1015,
+ "loss": 0.1038,
"macro_f1": 0.3333333432674408,
"num_tokens": 297730.0,
"repeat_count": 0.0,
- "routers_loss": 0.01825365424156189,
+ "routers_loss": 0.019166452810168266,
"skip_count": 0.0,
"step": 184,
"text_loss": 0.534831166267395
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 0.8734957440563546,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2236328125,
"learning_rate": 0.00037,
- "loss": 0.0736,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0784,
+ "macro_f1": 0.5427350401878357,
"num_tokens": 300593.0,
"repeat_count": 1.0,
- "routers_loss": 0.22729666531085968,
+ "routers_loss": 0.2349659502506256,
"skip_count": 2.0,
"step": 186,
"text_loss": 0.3549048602581024
@@ -1784,13 +1784,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.2041015625,
"learning_rate": 0.000374,
- "loss": 0.0838,
+ "loss": 0.0827,
"macro_f1": 0.3076923191547394,
"num_tokens": 303456.0,
"repeat_count": 2.0,
- "routers_loss": 0.24516475200653076,
+ "routers_loss": 0.22502389550209045,
"skip_count": 2.0,
"step": 188,
"text_loss": 0.8837642073631287
@@ -1803,13 +1803,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2470703125,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000378,
- "loss": 0.1056,
+ "loss": 0.1085,
"macro_f1": 0.3272727429866791,
"num_tokens": 306241.0,
"repeat_count": 1.0,
- "routers_loss": 0.1307530701160431,
+ "routers_loss": 0.12291611731052399,
"skip_count": 0.0,
"step": 190,
"text_loss": 0.73353511095047
@@ -1822,13 +1822,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.15625,
"learning_rate": 0.000382,
- "loss": 0.0961,
+ "loss": 0.0969,
"macro_f1": 0.3272727429866791,
"num_tokens": 310606.0,
"repeat_count": 0.0,
- "routers_loss": 0.06541688740253448,
+ "routers_loss": 0.055988848209381104,
"skip_count": 1.0,
"step": 192,
"text_loss": 0.6261917352676392
@@ -1841,13 +1841,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.333984375,
+ "grad_norm": 0.34375,
"learning_rate": 0.000386,
- "loss": 0.1058,
+ "loss": 0.1055,
"macro_f1": 0.3144654333591461,
"num_tokens": 313564.0,
"repeat_count": 0.0,
- "routers_loss": 0.12492545694112778,
+ "routers_loss": 0.12363404780626297,
"skip_count": 3.0,
"step": 194,
"text_loss": 0.2790874242782593
@@ -1860,13 +1860,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28515625,
+ "grad_norm": 0.27734375,
"learning_rate": 0.00039000000000000005,
- "loss": 0.0966,
+ "loss": 0.0964,
"macro_f1": 0.3076923191547394,
"num_tokens": 316958.0,
"repeat_count": 2.0,
- "routers_loss": 0.2838033139705658,
+ "routers_loss": 0.2718356251716614,
"skip_count": 2.0,
"step": 196,
"text_loss": 0.14428086578845978
@@ -1881,11 +1881,11 @@
"f1_skip": 0.0,
"grad_norm": 0.2021484375,
"learning_rate": 0.00039400000000000004,
- "loss": 0.0929,
+ "loss": 0.0917,
"macro_f1": 0.32098764181137085,
"num_tokens": 320103.0,
"repeat_count": 0.0,
- "routers_loss": 0.07692629098892212,
+ "routers_loss": 0.07188102602958679,
"skip_count": 2.0,
"step": 198,
"text_loss": 0.27155816555023193
@@ -1898,13 +1898,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.201171875,
"learning_rate": 0.000398,
"loss": 0.0809,
"macro_f1": 0.32098764181137085,
"num_tokens": 323566.0,
"repeat_count": 1.0,
- "routers_loss": 0.18504399061203003,
+ "routers_loss": 0.18038256466388702,
"skip_count": 1.0,
"step": 200,
"text_loss": 0.8453494310379028
@@ -1917,13 +1917,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.000402,
- "loss": 0.078,
+ "loss": 0.0801,
"macro_f1": 0.3333333432674408,
"num_tokens": 326385.0,
"repeat_count": 0.0,
- "routers_loss": 0.014647359028458595,
+ "routers_loss": 0.014639763161540031,
"skip_count": 0.0,
"step": 202,
"text_loss": 0.5733131766319275
@@ -1936,13 +1936,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2041015625,
+ "grad_norm": 0.21875,
"learning_rate": 0.00040600000000000006,
- "loss": 0.1028,
+ "loss": 0.104,
"macro_f1": 0.3333333432674408,
"num_tokens": 329266.0,
"repeat_count": 0.0,
- "routers_loss": 0.017848484218120575,
+ "routers_loss": 0.015269627794623375,
"skip_count": 0.0,
"step": 204,
"text_loss": 0.7355639934539795
@@ -1955,13 +1955,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.27734375,
"learning_rate": 0.00041,
- "loss": 0.0832,
+ "loss": 0.0833,
"macro_f1": 0.3333333432674408,
"num_tokens": 332984.0,
"repeat_count": 0.0,
- "routers_loss": 0.01900508813560009,
+ "routers_loss": 0.018046971410512924,
"skip_count": 0.0,
"step": 206,
"text_loss": 0.587641179561615
@@ -1974,13 +1974,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.166015625,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000414,
"loss": 0.0588,
"macro_f1": 0.3272727429866791,
"num_tokens": 335739.0,
"repeat_count": 1.0,
- "routers_loss": 0.13018715381622314,
+ "routers_loss": 0.12791286408901215,
"skip_count": 0.0,
"step": 208,
"text_loss": 0.6538406610488892
@@ -1993,13 +1993,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.24609375,
"learning_rate": 0.00041799999999999997,
- "loss": 0.0697,
+ "loss": 0.0732,
"macro_f1": 0.3272727429866791,
"num_tokens": 338966.0,
"repeat_count": 0.0,
- "routers_loss": 0.055288366973400116,
+ "routers_loss": 0.050490595400333405,
"skip_count": 1.0,
"step": 210,
"text_loss": 0.4188295602798462
@@ -2012,13 +2012,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000422,
- "loss": 0.0576,
+ "loss": 0.0588,
"macro_f1": 0.3144654333591461,
"num_tokens": 342063.0,
"repeat_count": 0.0,
- "routers_loss": 0.10952572524547577,
+ "routers_loss": 0.11652113497257233,
"skip_count": 3.0,
"step": 212,
"text_loss": 0.21822240948677063
@@ -2031,13 +2031,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.197265625,
+ "grad_norm": 0.2060546875,
"learning_rate": 0.000426,
- "loss": 0.062,
+ "loss": 0.0621,
"macro_f1": 0.3333333432674408,
"num_tokens": 344887.0,
"repeat_count": 0.0,
- "routers_loss": 0.02415696159005165,
+ "routers_loss": 0.023898238316178322,
"skip_count": 0.0,
"step": 214,
"text_loss": 0.24692800641059875
@@ -2050,13 +2050,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.3671875,
"learning_rate": 0.00043,
- "loss": 0.1011,
+ "loss": 0.1005,
"macro_f1": 0.3272727429866791,
"num_tokens": 348700.0,
"repeat_count": 1.0,
- "routers_loss": 0.06956391036510468,
+ "routers_loss": 0.06414655596017838,
"skip_count": 0.0,
"step": 216,
"text_loss": 0.4744548797607422
@@ -2069,13 +2069,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.00043400000000000003,
- "loss": 0.076,
+ "loss": 0.0753,
"macro_f1": 0.32098764181137085,
"num_tokens": 351507.0,
"repeat_count": 1.0,
- "routers_loss": 0.1140352189540863,
+ "routers_loss": 0.11702914535999298,
"skip_count": 1.0,
"step": 218,
"text_loss": 0.5614864826202393
@@ -2090,11 +2090,11 @@
"f1_skip": 0.0,
"grad_norm": 0.189453125,
"learning_rate": 0.000438,
- "loss": 0.0788,
+ "loss": 0.0792,
"macro_f1": 0.3333333432674408,
"num_tokens": 354484.0,
"repeat_count": 0.0,
- "routers_loss": 0.011621571145951748,
+ "routers_loss": 0.014991643838584423,
"skip_count": 0.0,
"step": 220,
"text_loss": 0.47209832072257996
@@ -2107,13 +2107,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.251953125,
"learning_rate": 0.000442,
"loss": 0.106,
"macro_f1": 0.3272727429866791,
"num_tokens": 357954.0,
"repeat_count": 0.0,
- "routers_loss": 0.05813701078295708,
+ "routers_loss": 0.04747112840414047,
"skip_count": 1.0,
"step": 222,
"text_loss": 0.2968728244304657
@@ -2126,13 +2126,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.357421875,
+ "grad_norm": 0.40234375,
"learning_rate": 0.000446,
- "loss": 0.0827,
+ "loss": 0.0853,
"macro_f1": 0.32098764181137085,
"num_tokens": 360547.0,
"repeat_count": 0.0,
- "routers_loss": 0.0646885335445404,
+ "routers_loss": 0.06754162162542343,
"skip_count": 2.0,
"step": 224,
"text_loss": 0.2364148646593094
@@ -2145,13 +2145,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.244140625,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.00045000000000000004,
- "loss": 0.1011,
+ "loss": 0.1016,
"macro_f1": 0.3272727429866791,
"num_tokens": 364529.0,
"repeat_count": 0.0,
- "routers_loss": 0.07224348932504654,
+ "routers_loss": 0.07830183953046799,
"skip_count": 1.0,
"step": 226,
"text_loss": 0.4787476360797882
@@ -2164,13 +2164,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.1953125,
"learning_rate": 0.00045400000000000003,
- "loss": 0.0781,
+ "loss": 0.0792,
"macro_f1": 0.3333333432674408,
"num_tokens": 367683.0,
"repeat_count": 0.0,
- "routers_loss": 0.015971746295690536,
+ "routers_loss": 0.015735948458313942,
"skip_count": 0.0,
"step": 228,
"text_loss": 0.37148505449295044
@@ -2183,13 +2183,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.25,
"learning_rate": 0.000458,
- "loss": 0.099,
+ "loss": 0.0995,
"macro_f1": 0.3333333432674408,
"num_tokens": 371402.0,
"repeat_count": 0.0,
- "routers_loss": 0.017818331718444824,
+ "routers_loss": 0.013354359194636345,
"skip_count": 0.0,
"step": 230,
"text_loss": 0.7464763522148132
@@ -2202,13 +2202,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.000462,
- "loss": 0.0757,
+ "loss": 0.0731,
"macro_f1": 0.3333333432674408,
"num_tokens": 374587.0,
"repeat_count": 0.0,
- "routers_loss": 0.01582280732691288,
+ "routers_loss": 0.013763721100986004,
"skip_count": 0.0,
"step": 232,
"text_loss": 0.8754443526268005
@@ -2221,13 +2221,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.42578125,
+ "grad_norm": 0.3984375,
"learning_rate": 0.00046600000000000005,
- "loss": 0.0876,
+ "loss": 0.0861,
"macro_f1": 0.3333333432674408,
"num_tokens": 377513.0,
"repeat_count": 0.0,
- "routers_loss": 0.011417915113270283,
+ "routers_loss": 0.010075435042381287,
"skip_count": 0.0,
"step": 234,
"text_loss": 0.31534913182258606
@@ -2240,13 +2240,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.17578125,
"learning_rate": 0.00047,
- "loss": 0.0801,
+ "loss": 0.0791,
"macro_f1": 0.3272727429866791,
"num_tokens": 380736.0,
"repeat_count": 0.0,
- "routers_loss": 0.05787832289934158,
+ "routers_loss": 0.059825167059898376,
"skip_count": 1.0,
"step": 236,
"text_loss": 0.5936337113380432
@@ -2259,13 +2259,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.236328125,
+ "grad_norm": 0.267578125,
"learning_rate": 0.000474,
- "loss": 0.0508,
+ "loss": 0.0514,
"macro_f1": 0.32098764181137085,
"num_tokens": 383236.0,
"repeat_count": 0.0,
- "routers_loss": 0.09476690739393234,
+ "routers_loss": 0.09134846180677414,
"skip_count": 2.0,
"step": 238,
"text_loss": 0.5976157784461975
@@ -2278,13 +2278,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.208984375,
"learning_rate": 0.00047799999999999996,
- "loss": 0.0833,
+ "loss": 0.0858,
"macro_f1": 0.32098764181137085,
"num_tokens": 385778.0,
"repeat_count": 1.0,
- "routers_loss": 0.1099705696105957,
+ "routers_loss": 0.11989791691303253,
"skip_count": 1.0,
"step": 240,
"text_loss": 0.3554210960865021
@@ -2297,13 +2297,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.171875,
"learning_rate": 0.000482,
- "loss": 0.0745,
+ "loss": 0.0734,
"macro_f1": 0.3333333432674408,
"num_tokens": 388777.0,
"repeat_count": 0.0,
- "routers_loss": 0.01269970741122961,
+ "routers_loss": 0.013591105118393898,
"skip_count": 0.0,
"step": 242,
"text_loss": 0.4829460382461548
@@ -2316,13 +2316,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11962890625,
+ "grad_norm": 0.12060546875,
"learning_rate": 0.000486,
- "loss": 0.061,
+ "loss": 0.0625,
"macro_f1": 0.32098764181137085,
"num_tokens": 391797.0,
"repeat_count": 0.0,
- "routers_loss": 0.08505752682685852,
+ "routers_loss": 0.0920003354549408,
"skip_count": 2.0,
"step": 244,
"text_loss": 0.3085818886756897
@@ -2335,13 +2335,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.00049,
- "loss": 0.0504,
+ "loss": 0.0501,
"macro_f1": 0.3333333432674408,
"num_tokens": 396485.0,
"repeat_count": 0.0,
- "routers_loss": 0.012750142253935337,
+ "routers_loss": 0.0129330949857831,
"skip_count": 0.0,
"step": 246,
"text_loss": 0.42803969979286194
@@ -2354,13 +2354,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.291015625,
+ "grad_norm": 0.296875,
"learning_rate": 0.000494,
- "loss": 0.0962,
+ "loss": 0.0945,
"macro_f1": 0.3144654333591461,
"num_tokens": 399923.0,
"repeat_count": 0.0,
- "routers_loss": 0.11287309974431992,
+ "routers_loss": 0.10677755624055862,
"skip_count": 3.0,
"step": 248,
"text_loss": 0.2908555567264557
@@ -2373,32 +2373,32 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.203125,
"learning_rate": 0.000498,
- "loss": 0.0821,
+ "loss": 0.0812,
"macro_f1": 0.3144654333591461,
"num_tokens": 403647.0,
"repeat_count": 0.0,
- "routers_loss": 0.1486474722623825,
+ "routers_loss": 0.1504337340593338,
"skip_count": 3.0,
"step": 250,
"text_loss": 0.333095908164978
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 1.183152333431171,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
+ "f1_skip": 0.0,
"grad_norm": 0.22265625,
"learning_rate": 0.0005020000000000001,
- "loss": 0.0832,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0828,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 409147.0,
"repeat_count": 0.0,
- "routers_loss": 0.06636594980955124,
+ "routers_loss": 0.06503184884786606,
"skip_count": 2.0,
"step": 252,
"text_loss": 0.16117942333221436
@@ -2411,13 +2411,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.267578125,
+ "grad_norm": 0.287109375,
"learning_rate": 0.000506,
- "loss": 0.1,
+ "loss": 0.0995,
"macro_f1": 0.3333333432674408,
"num_tokens": 412072.0,
"repeat_count": 0.0,
- "routers_loss": 0.015062150545418262,
+ "routers_loss": 0.016280122101306915,
"skip_count": 0.0,
"step": 254,
"text_loss": 0.4217492640018463
@@ -2430,13 +2430,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2138671875,
+ "grad_norm": 0.21484375,
"learning_rate": 0.00051,
- "loss": 0.0808,
+ "loss": 0.0803,
"macro_f1": 0.3144654333591461,
"num_tokens": 415052.0,
"repeat_count": 2.0,
- "routers_loss": 0.2051105946302414,
+ "routers_loss": 0.2117508500814438,
"skip_count": 1.0,
"step": 256,
"text_loss": 0.5795308947563171
@@ -2449,13 +2449,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "grad_norm": 0.2421875,
"learning_rate": 0.000514,
- "loss": 0.068,
+ "loss": 0.0668,
"macro_f1": 0.3272727429866791,
"num_tokens": 418099.0,
"repeat_count": 1.0,
- "routers_loss": 0.1467045396566391,
+ "routers_loss": 0.15002092719078064,
"skip_count": 0.0,
"step": 258,
"text_loss": 0.4840938448905945
@@ -2468,13 +2468,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.000518,
- "loss": 0.0543,
+ "loss": 0.0538,
"macro_f1": 0.3333333432674408,
"num_tokens": 422526.0,
"repeat_count": 0.0,
- "routers_loss": 0.013022038154304028,
+ "routers_loss": 0.012834074907004833,
"skip_count": 0.0,
"step": 260,
"text_loss": 0.36141225695610046
@@ -2487,13 +2487,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.000522,
- "loss": 0.0848,
+ "loss": 0.085,
"macro_f1": 0.3076923191547394,
"num_tokens": 425765.0,
"repeat_count": 2.0,
- "routers_loss": 0.2575930058956146,
+ "routers_loss": 0.23808011412620544,
"skip_count": 2.0,
"step": 262,
"text_loss": 0.27572691440582275
@@ -2506,13 +2506,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000526,
- "loss": 0.07,
+ "loss": 0.0708,
"macro_f1": 0.3272727429866791,
"num_tokens": 429048.0,
"repeat_count": 0.0,
- "routers_loss": 0.0558602549135685,
+ "routers_loss": 0.055687375366687775,
"skip_count": 1.0,
"step": 264,
"text_loss": 0.37020301818847656
@@ -2525,13 +2525,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.0005300000000000001,
- "loss": 0.082,
+ "loss": 0.0839,
"macro_f1": 0.3272727429866791,
"num_tokens": 431784.0,
"repeat_count": 0.0,
- "routers_loss": 0.09126655012369156,
+ "routers_loss": 0.0872957780957222,
"skip_count": 1.0,
"step": 266,
"text_loss": 0.5937283039093018
@@ -2544,13 +2544,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.263671875,
"learning_rate": 0.0005340000000000001,
- "loss": 0.0764,
+ "loss": 0.0733,
"macro_f1": 0.32098764181137085,
"num_tokens": 434297.0,
"repeat_count": 2.0,
- "routers_loss": 0.24805288016796112,
+ "routers_loss": 0.23507654666900635,
"skip_count": 0.0,
"step": 268,
"text_loss": 0.3367372453212738
@@ -2563,13 +2563,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.2431640625,
"learning_rate": 0.0005380000000000001,
- "loss": 0.0686,
+ "loss": 0.0708,
"macro_f1": 0.32098764181137085,
"num_tokens": 437586.0,
"repeat_count": 0.0,
- "routers_loss": 0.13135533034801483,
+ "routers_loss": 0.12860390543937683,
"skip_count": 2.0,
"step": 270,
"text_loss": 0.7149854302406311
@@ -2582,13 +2582,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.2451171875,
"learning_rate": 0.0005420000000000001,
- "loss": 0.1083,
+ "loss": 0.1072,
"macro_f1": 0.3272727429866791,
"num_tokens": 440649.0,
"repeat_count": 0.0,
- "routers_loss": 0.04991440102458,
+ "routers_loss": 0.044308312237262726,
"skip_count": 1.0,
"step": 272,
"text_loss": 0.26778292655944824
@@ -2601,13 +2601,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.455078125,
+ "grad_norm": 0.44921875,
"learning_rate": 0.000546,
- "loss": 0.0991,
+ "loss": 0.0938,
"macro_f1": 0.3144654333591461,
"num_tokens": 443907.0,
"repeat_count": 0.0,
- "routers_loss": 0.12236632406711578,
+ "routers_loss": 0.11514109373092651,
"skip_count": 3.0,
"step": 274,
"text_loss": 0.23578761518001556
@@ -2620,13 +2620,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.25,
+ "grad_norm": 0.2578125,
"learning_rate": 0.00055,
- "loss": 0.0936,
+ "loss": 0.0932,
"macro_f1": 0.5492662787437439,
"num_tokens": 447147.0,
"repeat_count": 0.0,
- "routers_loss": 0.053506772965192795,
+ "routers_loss": 0.055705297738313675,
"skip_count": 2.0,
"step": 276,
"text_loss": 0.2513524889945984
@@ -2639,13 +2639,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.265625,
+ "grad_norm": 0.29296875,
"learning_rate": 0.000554,
- "loss": 0.066,
+ "loss": 0.0667,
"macro_f1": 0.32098764181137085,
"num_tokens": 450032.0,
"repeat_count": 0.0,
- "routers_loss": 0.13446088135242462,
+ "routers_loss": 0.13778971135616302,
"skip_count": 2.0,
"step": 278,
"text_loss": 0.4857243597507477
@@ -2658,32 +2658,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000558,
- "loss": 0.0682,
+ "loss": 0.0672,
"macro_f1": 0.3272727429866791,
"num_tokens": 453195.0,
"repeat_count": 1.0,
- "routers_loss": 0.07270720601081848,
+ "routers_loss": 0.0700262188911438,
"skip_count": 0.0,
"step": 280,
"text_loss": 0.7589789628982544
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 1.3240387437628411,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.28125,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25,
"learning_rate": 0.0005620000000000001,
- "loss": 0.0648,
- "macro_f1": 0.5427350401878357,
+ "loss": 0.0603,
+ "macro_f1": 0.3144654333591461,
"num_tokens": 455942.0,
"repeat_count": 1.0,
- "routers_loss": 0.13866399228572845,
+ "routers_loss": 0.11706235259771347,
"skip_count": 2.0,
"step": 282,
"text_loss": 0.4783432185649872
@@ -2696,13 +2696,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.236328125,
+ "grad_norm": 0.265625,
"learning_rate": 0.000566,
- "loss": 0.0782,
+ "loss": 0.0793,
"macro_f1": 0.3272727429866791,
"num_tokens": 458932.0,
"repeat_count": 0.0,
- "routers_loss": 0.0645354762673378,
+ "routers_loss": 0.07073967158794403,
"skip_count": 1.0,
"step": 284,
"text_loss": 0.7117193937301636
@@ -2715,13 +2715,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.00057,
- "loss": 0.0892,
+ "loss": 0.0915,
"macro_f1": 0.3272727429866791,
"num_tokens": 462650.0,
"repeat_count": 0.0,
- "routers_loss": 0.05967628210783005,
+ "routers_loss": 0.05301115661859512,
"skip_count": 1.0,
"step": 286,
"text_loss": 0.4175460636615753
@@ -2734,13 +2734,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23828125,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000574,
- "loss": 0.0676,
+ "loss": 0.0675,
"macro_f1": 0.3272727429866791,
"num_tokens": 466290.0,
"repeat_count": 0.0,
- "routers_loss": 0.06438407301902771,
+ "routers_loss": 0.06356479972600937,
"skip_count": 1.0,
"step": 288,
"text_loss": 0.5832946300506592
@@ -2753,13 +2753,13 @@
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.28515625,
"learning_rate": 0.000578,
- "loss": 0.0781,
+ "loss": 0.0805,
"macro_f1": 0.3006536066532135,
"num_tokens": 469296.0,
"repeat_count": 1.0,
- "routers_loss": 0.21225209534168243,
+ "routers_loss": 0.21032999455928802,
"skip_count": 3.0,
"step": 290,
"text_loss": 0.36023473739624023
@@ -2772,13 +2772,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.244140625,
+ "grad_norm": 0.27734375,
"learning_rate": 0.0005819999999999999,
- "loss": 0.0664,
+ "loss": 0.0685,
"macro_f1": 0.32098764181137085,
"num_tokens": 472272.0,
"repeat_count": 1.0,
- "routers_loss": 0.08085516840219498,
+ "routers_loss": 0.08062280714511871,
"skip_count": 1.0,
"step": 292,
"text_loss": 0.37197956442832947
@@ -2791,13 +2791,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.28125,
"learning_rate": 0.0005859999999999999,
- "loss": 0.0874,
+ "loss": 0.0878,
"macro_f1": 0.32098764181137085,
"num_tokens": 475864.0,
"repeat_count": 0.0,
- "routers_loss": 0.05378658324480057,
+ "routers_loss": 0.05023600533604622,
"skip_count": 2.0,
"step": 294,
"text_loss": 0.4765273630619049
@@ -2810,13 +2810,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.00059,
- "loss": 0.0715,
+ "loss": 0.0728,
"macro_f1": 0.3333333432674408,
"num_tokens": 478916.0,
"repeat_count": 0.0,
- "routers_loss": 0.01145261898636818,
+ "routers_loss": 0.011689410544931889,
"skip_count": 0.0,
"step": 296,
"text_loss": 0.5878773927688599
@@ -2831,11 +2831,11 @@
"f1_skip": 0.0,
"grad_norm": 0.15625,
"learning_rate": 0.000594,
- "loss": 0.0737,
+ "loss": 0.0727,
"macro_f1": 0.3333333432674408,
"num_tokens": 482369.0,
"repeat_count": 0.0,
- "routers_loss": 0.009397956542670727,
+ "routers_loss": 0.010772093199193478,
"skip_count": 0.0,
"step": 298,
"text_loss": 0.4424116313457489
@@ -2848,13 +2848,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.181640625,
"learning_rate": 0.000598,
- "loss": 0.0802,
+ "loss": 0.0787,
"macro_f1": 0.3076923191547394,
"num_tokens": 486049.0,
"repeat_count": 2.0,
- "routers_loss": 0.2389357089996338,
+ "routers_loss": 0.23482851684093475,
"skip_count": 2.0,
"step": 300,
"text_loss": 0.21217775344848633
@@ -2862,18 +2862,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 1.417963017317288,
- "f1_execute": 0.9019607901573181,
+ "f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.000602,
- "loss": 0.0745,
- "macro_f1": 0.3006536066532135,
+ "loss": 0.073,
+ "macro_f1": 0.3076923191547394,
"num_tokens": 488683.0,
"repeat_count": 1.0,
- "routers_loss": 0.18252353370189667,
+ "routers_loss": 0.18843084573745728,
"skip_count": 3.0,
"step": 302,
"text_loss": 0.2109498232603073
@@ -2886,13 +2886,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.27734375,
+ "grad_norm": 0.279296875,
"learning_rate": 0.000606,
- "loss": 0.0935,
+ "loss": 0.0945,
"macro_f1": 0.3144654333591461,
"num_tokens": 492010.0,
"repeat_count": 0.0,
- "routers_loss": 0.18185268342494965,
+ "routers_loss": 0.17861786484718323,
"skip_count": 3.0,
"step": 304,
"text_loss": 0.8446305394172668
@@ -2905,13 +2905,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.00061,
- "loss": 0.0853,
+ "loss": 0.0827,
"macro_f1": 0.3333333432674408,
"num_tokens": 494764.0,
"repeat_count": 0.0,
- "routers_loss": 0.013210167177021503,
+ "routers_loss": 0.014124520123004913,
"skip_count": 0.0,
"step": 306,
"text_loss": 0.742735743522644
@@ -2924,13 +2924,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.26953125,
"learning_rate": 0.000614,
- "loss": 0.1089,
+ "loss": 0.1071,
"macro_f1": 0.3333333432674408,
"num_tokens": 497820.0,
"repeat_count": 0.0,
- "routers_loss": 0.016936838626861572,
+ "routers_loss": 0.017968112602829933,
"skip_count": 0.0,
"step": 308,
"text_loss": 0.28305482864379883
@@ -2943,13 +2943,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.0006180000000000001,
- "loss": 0.077,
+ "loss": 0.0775,
"macro_f1": 0.32098764181137085,
"num_tokens": 500694.0,
"repeat_count": 0.0,
- "routers_loss": 0.08630389720201492,
+ "routers_loss": 0.08593655377626419,
"skip_count": 2.0,
"step": 310,
"text_loss": 0.3496848940849304
@@ -2962,13 +2962,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.19140625,
"learning_rate": 0.000622,
- "loss": 0.0602,
+ "loss": 0.061,
"macro_f1": 0.3333333432674408,
"num_tokens": 503871.0,
"repeat_count": 0.0,
- "routers_loss": 0.013665963895618916,
+ "routers_loss": 0.016449492424726486,
"skip_count": 0.0,
"step": 312,
"text_loss": 0.6691372990608215
@@ -2981,13 +2981,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.205078125,
"learning_rate": 0.000626,
- "loss": 0.0794,
+ "loss": 0.0815,
"macro_f1": 0.3333333432674408,
"num_tokens": 506730.0,
"repeat_count": 0.0,
- "routers_loss": 0.01584783010184765,
+ "routers_loss": 0.014532964676618576,
"skip_count": 0.0,
"step": 314,
"text_loss": 0.6118118166923523
@@ -3000,13 +3000,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.224609375,
+ "grad_norm": 0.2216796875,
"learning_rate": 0.00063,
- "loss": 0.0762,
+ "loss": 0.0742,
"macro_f1": 0.3333333432674408,
"num_tokens": 510323.0,
"repeat_count": 0.0,
- "routers_loss": 0.01368923019617796,
+ "routers_loss": 0.013093139044940472,
"skip_count": 0.0,
"step": 316,
"text_loss": 0.38126271963119507
@@ -3019,13 +3019,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.388671875,
+ "grad_norm": 0.400390625,
"learning_rate": 0.000634,
- "loss": 0.0908,
+ "loss": 0.0915,
"macro_f1": 0.3333333432674408,
"num_tokens": 514075.0,
"repeat_count": 0.0,
- "routers_loss": 0.009135022759437561,
+ "routers_loss": 0.008627045899629593,
"skip_count": 0.0,
"step": 318,
"text_loss": 0.5983037948608398
@@ -3038,13 +3038,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000638,
- "loss": 0.0949,
+ "loss": 0.1008,
"macro_f1": 0.3272727429866791,
"num_tokens": 517418.0,
"repeat_count": 0.0,
- "routers_loss": 0.046641621738672256,
+ "routers_loss": 0.04561378434300423,
"skip_count": 1.0,
"step": 320,
"text_loss": 0.767257034778595
@@ -3052,18 +3052,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.5118872908717347,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23046875,
+ "grad_norm": 0.259765625,
"learning_rate": 0.000642,
- "loss": 0.0925,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0926,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 520443.0,
"repeat_count": 0.0,
- "routers_loss": 0.020637936890125275,
+ "routers_loss": 0.024372953921556473,
"skip_count": 0.0,
"step": 322,
"text_loss": 0.6572105884552002
@@ -3076,13 +3076,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26953125,
+ "grad_norm": 0.30078125,
"learning_rate": 0.000646,
"loss": 0.0822,
"macro_f1": 0.3272727429866791,
"num_tokens": 523317.0,
"repeat_count": 1.0,
- "routers_loss": 0.08289298415184021,
+ "routers_loss": 0.08099937438964844,
"skip_count": 0.0,
"step": 324,
"text_loss": 0.205499529838562
@@ -3090,18 +3090,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.530672145582624,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23828125,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.0006500000000000001,
- "loss": 0.0823,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0809,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 526355.0,
"repeat_count": 0.0,
- "routers_loss": 0.06960040330886841,
+ "routers_loss": 0.0657225176692009,
"skip_count": 1.0,
"step": 326,
"text_loss": 0.2587239742279053
@@ -3114,13 +3114,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1162109375,
+ "grad_norm": 0.111328125,
"learning_rate": 0.0006540000000000001,
- "loss": 0.0799,
+ "loss": 0.0779,
"macro_f1": 0.3333333432674408,
"num_tokens": 529689.0,
"repeat_count": 0.0,
- "routers_loss": 0.02087482251226902,
+ "routers_loss": 0.01849208027124405,
"skip_count": 0.0,
"step": 328,
"text_loss": 0.2172023057937622
@@ -3133,13 +3133,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.1845703125,
"learning_rate": 0.0006580000000000001,
- "loss": 0.0757,
+ "loss": 0.0758,
"macro_f1": 0.3333333432674408,
"num_tokens": 532603.0,
"repeat_count": 0.0,
- "routers_loss": 0.016592051833868027,
+ "routers_loss": 0.016184113919734955,
"skip_count": 0.0,
"step": 330,
"text_loss": 0.5980568528175354
@@ -3152,32 +3152,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.220703125,
"learning_rate": 0.000662,
- "loss": 0.0438,
+ "loss": 0.0439,
"macro_f1": 0.3333333432674408,
"num_tokens": 536056.0,
"repeat_count": 0.0,
- "routers_loss": 0.012950568459928036,
+ "routers_loss": 0.01303898449987173,
"skip_count": 0.0,
"step": 332,
"text_loss": 0.5421966314315796
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 1.5682418550044028,
- "f1_execute": 0.8799999952316284,
+ "f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.310546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.296875,
"learning_rate": 0.000666,
- "loss": 0.0964,
- "macro_f1": 0.29333335161209106,
+ "loss": 0.0963,
+ "macro_f1": 0.465986430644989,
"num_tokens": 539231.0,
"repeat_count": 3.0,
- "routers_loss": 0.3373340964317322,
+ "routers_loss": 0.3075675964355469,
"skip_count": 3.0,
"step": 334,
"text_loss": 0.19719554483890533
@@ -3190,13 +3190,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.171875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.00067,
"loss": 0.0706,
"macro_f1": 0.3333333432674408,
"num_tokens": 542038.0,
"repeat_count": 0.0,
- "routers_loss": 0.008110735565423965,
+ "routers_loss": 0.009116224013268948,
"skip_count": 0.0,
"step": 336,
"text_loss": 0.3407036066055298
@@ -3209,13 +3209,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.248046875,
+ "grad_norm": 0.2421875,
"learning_rate": 0.000674,
- "loss": 0.0771,
+ "loss": 0.0768,
"macro_f1": 0.3333333432674408,
"num_tokens": 545019.0,
"repeat_count": 0.0,
- "routers_loss": 0.01841609925031662,
+ "routers_loss": 0.021463042125105858,
"skip_count": 0.0,
"step": 338,
"text_loss": 0.24486012756824493
@@ -3228,13 +3228,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.0006780000000000001,
- "loss": 0.0894,
+ "loss": 0.0889,
"macro_f1": 0.3333333432674408,
"num_tokens": 548036.0,
"repeat_count": 0.0,
- "routers_loss": 0.01612614095211029,
+ "routers_loss": 0.01857556402683258,
"skip_count": 0.0,
"step": 340,
"text_loss": 0.28140124678611755
@@ -3247,13 +3247,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0006820000000000001,
- "loss": 0.0611,
+ "loss": 0.0617,
"macro_f1": 0.3006536364555359,
"num_tokens": 551419.0,
"repeat_count": 2.0,
- "routers_loss": 0.26202192902565,
+ "routers_loss": 0.27090007066726685,
"skip_count": 3.0,
"step": 342,
"text_loss": 0.20690307021141052
@@ -3266,13 +3266,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.3046875,
"learning_rate": 0.0006860000000000001,
- "loss": 0.1013,
+ "loss": 0.1047,
"macro_f1": 0.32098764181137085,
"num_tokens": 554037.0,
"repeat_count": 0.0,
- "routers_loss": 0.09235779196023941,
+ "routers_loss": 0.09231195598840714,
"skip_count": 2.0,
"step": 344,
"text_loss": 0.4479128420352936
@@ -3285,13 +3285,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.255859375,
"learning_rate": 0.00069,
- "loss": 0.0856,
+ "loss": 0.0883,
"macro_f1": 0.3333333432674408,
"num_tokens": 556672.0,
"repeat_count": 0.0,
- "routers_loss": 0.010735333897173405,
+ "routers_loss": 0.00935924518853426,
"skip_count": 0.0,
"step": 346,
"text_loss": 0.6377320289611816
@@ -3304,13 +3304,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.000694,
- "loss": 0.0778,
+ "loss": 0.0781,
"macro_f1": 0.32098764181137085,
"num_tokens": 559756.0,
"repeat_count": 0.0,
- "routers_loss": 0.14742356538772583,
+ "routers_loss": 0.17641772329807281,
"skip_count": 2.0,
"step": 348,
"text_loss": 0.6097636222839355
@@ -3323,13 +3323,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.30859375,
+ "grad_norm": 0.30078125,
"learning_rate": 0.0006979999999999999,
- "loss": 0.0614,
+ "loss": 0.0616,
"macro_f1": 0.5492662787437439,
"num_tokens": 563415.0,
"repeat_count": 0.0,
- "routers_loss": 0.06606879830360413,
+ "routers_loss": 0.06240406632423401,
"skip_count": 2.0,
"step": 350,
"text_loss": 0.5291631817817688
@@ -3342,13 +3342,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.322265625,
+ "grad_norm": 0.296875,
"learning_rate": 0.0007019999999999999,
- "loss": 0.1033,
+ "loss": 0.1026,
"macro_f1": 0.3333333432674408,
"num_tokens": 566357.0,
"repeat_count": 0.0,
- "routers_loss": 0.012873432599008083,
+ "routers_loss": 0.012269247323274612,
"skip_count": 0.0,
"step": 352,
"text_loss": 0.5170195698738098
@@ -3361,13 +3361,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0007059999999999999,
- "loss": 0.0819,
+ "loss": 0.0815,
"macro_f1": 0.32098764181137085,
"num_tokens": 569449.0,
"repeat_count": 0.0,
- "routers_loss": 0.07853665202856064,
+ "routers_loss": 0.07515309751033783,
"skip_count": 2.0,
"step": 354,
"text_loss": 0.34507250785827637
@@ -3380,13 +3380,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.251953125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.00071,
- "loss": 0.0804,
+ "loss": 0.0791,
"macro_f1": 0.3144654333591461,
"num_tokens": 572761.0,
"repeat_count": 1.0,
- "routers_loss": 0.2216549813747406,
+ "routers_loss": 0.20768006145954132,
"skip_count": 2.0,
"step": 356,
"text_loss": 0.3158532381057739
@@ -3399,13 +3399,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.185546875,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.000714,
- "loss": 0.0675,
+ "loss": 0.0682,
"macro_f1": 0.3333333432674408,
"num_tokens": 575909.0,
"repeat_count": 0.0,
- "routers_loss": 0.02423691377043724,
+ "routers_loss": 0.025329967960715294,
"skip_count": 0.0,
"step": 358,
"text_loss": 0.21455390751361847
@@ -3413,18 +3413,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.6903434106251836,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.21484375,
"learning_rate": 0.000718,
- "loss": 0.0781,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0775,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 579186.0,
"repeat_count": 1.0,
- "routers_loss": 0.07496294379234314,
+ "routers_loss": 0.07676175981760025,
"skip_count": 0.0,
"step": 360,
"text_loss": 0.61895352602005
@@ -3437,13 +3437,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2138671875,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000722,
- "loss": 0.0778,
+ "loss": 0.0781,
"macro_f1": 0.32098767161369324,
"num_tokens": 582437.0,
"repeat_count": 0.0,
- "routers_loss": 0.08181872963905334,
+ "routers_loss": 0.08070661872625351,
"skip_count": 1.0,
"step": 362,
"text_loss": 0.20557661354541779
@@ -3456,13 +3456,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2216796875,
"learning_rate": 0.000726,
- "loss": 0.1112,
+ "loss": 0.11,
"macro_f1": 0.3333333432674408,
"num_tokens": 586096.0,
"repeat_count": 0.0,
- "routers_loss": 0.016959719359874725,
+ "routers_loss": 0.015891313552856445,
"skip_count": 0.0,
"step": 364,
"text_loss": 0.597991943359375
@@ -3475,13 +3475,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.15625,
"learning_rate": 0.00073,
- "loss": 0.0577,
+ "loss": 0.0573,
"macro_f1": 0.3076923191547394,
"num_tokens": 589520.0,
"repeat_count": 1.0,
- "routers_loss": 0.13295969367027283,
+ "routers_loss": 0.12844261527061462,
"skip_count": 3.0,
"step": 366,
"text_loss": 0.2944789230823517
@@ -3494,13 +3494,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1455078125,
+ "grad_norm": 0.150390625,
"learning_rate": 0.000734,
- "loss": 0.0986,
+ "loss": 0.1005,
"macro_f1": 0.3333333432674408,
"num_tokens": 592691.0,
"repeat_count": 0.0,
- "routers_loss": 0.02476893551647663,
+ "routers_loss": 0.02382199838757515,
"skip_count": 0.0,
"step": 368,
"text_loss": 0.23989969491958618
@@ -3513,13 +3513,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1796875,
"learning_rate": 0.000738,
- "loss": 0.0682,
+ "loss": 0.0661,
"macro_f1": 0.3333333432674408,
"num_tokens": 596004.0,
"repeat_count": 0.0,
- "routers_loss": 0.019863395020365715,
+ "routers_loss": 0.018812084570527077,
"skip_count": 0.0,
"step": 370,
"text_loss": 0.22111408412456512
@@ -3532,13 +3532,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.000742,
- "loss": 0.0663,
+ "loss": 0.0666,
"macro_f1": 0.3272727429866791,
"num_tokens": 599087.0,
"repeat_count": 0.0,
- "routers_loss": 0.07230417430400848,
+ "routers_loss": 0.08290331065654755,
"skip_count": 1.0,
"step": 372,
"text_loss": 0.2567356526851654
@@ -3551,13 +3551,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.000746,
- "loss": 0.0986,
+ "loss": 0.0941,
"macro_f1": 0.32098764181137085,
"num_tokens": 602330.0,
"repeat_count": 1.0,
- "routers_loss": 0.11727793514728546,
+ "routers_loss": 0.11482042074203491,
"skip_count": 1.0,
"step": 374,
"text_loss": 0.7217292785644531
@@ -3570,13 +3570,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.224609375,
+ "grad_norm": 0.2265625,
"learning_rate": 0.00075,
- "loss": 0.0724,
+ "loss": 0.0728,
"macro_f1": 0.3272727429866791,
"num_tokens": 605503.0,
"repeat_count": 1.0,
- "routers_loss": 0.13495951890945435,
+ "routers_loss": 0.11849870532751083,
"skip_count": 0.0,
"step": 376,
"text_loss": 0.5122153759002686
@@ -3589,13 +3589,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23046875,
+ "grad_norm": 0.2333984375,
"learning_rate": 0.000754,
- "loss": 0.0823,
+ "loss": 0.0835,
"macro_f1": 0.32098767161369324,
"num_tokens": 608505.0,
"repeat_count": 0.0,
- "routers_loss": 0.07612533867359161,
+ "routers_loss": 0.07090992480516434,
"skip_count": 1.0,
"step": 378,
"text_loss": 0.2204965502023697
@@ -3608,13 +3608,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.1826171875,
"learning_rate": 0.000758,
- "loss": 0.0803,
+ "loss": 0.0794,
"macro_f1": 0.3272727429866791,
"num_tokens": 611193.0,
"repeat_count": 0.0,
- "routers_loss": 0.0484120175242424,
+ "routers_loss": 0.03812089189887047,
"skip_count": 1.0,
"step": 380,
"text_loss": 0.44909021258354187
@@ -3627,13 +3627,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.000762,
- "loss": 0.0866,
+ "loss": 0.0882,
"macro_f1": 0.3272727429866791,
"num_tokens": 614231.0,
"repeat_count": 1.0,
- "routers_loss": 0.10939671844244003,
+ "routers_loss": 0.10270529240369797,
"skip_count": 0.0,
"step": 382,
"text_loss": 0.13624964654445648
@@ -3646,13 +3646,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.326171875,
+ "grad_norm": 0.330078125,
"learning_rate": 0.0007660000000000001,
- "loss": 0.1083,
+ "loss": 0.1107,
"macro_f1": 0.32098764181137085,
"num_tokens": 617090.0,
"repeat_count": 1.0,
- "routers_loss": 0.11382336914539337,
+ "routers_loss": 0.11624004691839218,
"skip_count": 1.0,
"step": 384,
"text_loss": 0.7314052581787109
@@ -3667,11 +3667,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1396484375,
"learning_rate": 0.0007700000000000001,
- "loss": 0.0616,
+ "loss": 0.0628,
"macro_f1": 0.32098764181137085,
"num_tokens": 620596.0,
"repeat_count": 0.0,
- "routers_loss": 0.07494530081748962,
+ "routers_loss": 0.07114322483539581,
"skip_count": 2.0,
"step": 386,
"text_loss": 0.503322958946228
@@ -3684,13 +3684,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.298828125,
+ "grad_norm": 0.306640625,
"learning_rate": 0.0007740000000000001,
- "loss": 0.0816,
+ "loss": 0.0829,
"macro_f1": 0.32098764181137085,
"num_tokens": 624108.0,
"repeat_count": 0.0,
- "routers_loss": 0.05718417093157768,
+ "routers_loss": 0.06061873584985733,
"skip_count": 2.0,
"step": 388,
"text_loss": 0.11481904983520508
@@ -3703,13 +3703,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.2099609375,
"learning_rate": 0.000778,
- "loss": 0.0783,
+ "loss": 0.0791,
"macro_f1": 0.3006536364555359,
"num_tokens": 626895.0,
"repeat_count": 1.0,
- "routers_loss": 0.2848989963531494,
+ "routers_loss": 0.2921771705150604,
"skip_count": 4.0,
"step": 390,
"text_loss": 0.3069624602794647
@@ -3722,13 +3722,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.30078125,
+ "grad_norm": 0.30859375,
"learning_rate": 0.000782,
- "loss": 0.0608,
+ "loss": 0.0605,
"macro_f1": 0.3076923191547394,
"num_tokens": 630204.0,
"repeat_count": 0.0,
- "routers_loss": 0.2050076276063919,
+ "routers_loss": 0.202707901597023,
"skip_count": 4.0,
"step": 392,
"text_loss": 0.6022785305976868
@@ -3741,13 +3741,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.29296875,
"learning_rate": 0.000786,
- "loss": 0.0863,
+ "loss": 0.0877,
"macro_f1": 0.3333333432674408,
"num_tokens": 634373.0,
"repeat_count": 0.0,
- "routers_loss": 0.020946886390447617,
+ "routers_loss": 0.0221510399132967,
"skip_count": 0.0,
"step": 394,
"text_loss": 0.26787394285202026
@@ -3760,13 +3760,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.376953125,
+ "grad_norm": 0.37890625,
"learning_rate": 0.00079,
- "loss": 0.0798,
+ "loss": 0.0805,
"macro_f1": 0.32098764181137085,
"num_tokens": 637442.0,
"repeat_count": 2.0,
- "routers_loss": 0.1270289123058319,
+ "routers_loss": 0.12636390328407288,
"skip_count": 0.0,
"step": 396,
"text_loss": 0.2799781560897827
@@ -3779,13 +3779,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.0007940000000000001,
- "loss": 0.0701,
+ "loss": 0.0724,
"macro_f1": 0.32098764181137085,
"num_tokens": 641231.0,
"repeat_count": 0.0,
- "routers_loss": 0.08012636005878448,
+ "routers_loss": 0.07933453470468521,
"skip_count": 2.0,
"step": 398,
"text_loss": 0.2507784366607666
@@ -3798,13 +3798,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.0007980000000000001,
- "loss": 0.0901,
+ "loss": 0.0909,
"macro_f1": 0.3272727429866791,
"num_tokens": 644560.0,
"repeat_count": 1.0,
- "routers_loss": 0.09315784275531769,
+ "routers_loss": 0.10324911028146744,
"skip_count": 0.0,
"step": 400,
"text_loss": 0.7756280303001404
@@ -3817,13 +3817,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0008020000000000001,
- "loss": 0.078,
+ "loss": 0.0783,
"macro_f1": 0.3144654333591461,
"num_tokens": 647393.0,
"repeat_count": 1.0,
- "routers_loss": 0.18492189049720764,
+ "routers_loss": 0.18546262383460999,
"skip_count": 2.0,
"step": 402,
"text_loss": 0.5013328194618225
@@ -3836,13 +3836,13 @@
"f1_execute": 0.8571428656578064,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0008060000000000001,
- "loss": 0.0801,
+ "loss": 0.0787,
"macro_f1": 0.2857142984867096,
"num_tokens": 650355.0,
"repeat_count": 3.0,
- "routers_loss": 0.32641324400901794,
+ "routers_loss": 0.3280293643474579,
"skip_count": 4.0,
"step": 404,
"text_loss": 0.2842077314853668
@@ -3855,13 +3855,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2080078125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.0008100000000000001,
- "loss": 0.0905,
+ "loss": 0.0901,
"macro_f1": 0.3333333432674408,
"num_tokens": 654280.0,
"repeat_count": 0.0,
- "routers_loss": 0.02722037397325039,
+ "routers_loss": 0.02623247355222702,
"skip_count": 0.0,
"step": 406,
"text_loss": 0.46742817759513855
@@ -3874,13 +3874,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.216796875,
"learning_rate": 0.0008139999999999999,
- "loss": 0.0958,
+ "loss": 0.0945,
"macro_f1": 0.3333333432674408,
"num_tokens": 657568.0,
"repeat_count": 0.0,
- "routers_loss": 0.010129833593964577,
+ "routers_loss": 0.009744114242494106,
"skip_count": 0.0,
"step": 408,
"text_loss": 0.7168047428131104
@@ -3893,13 +3893,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2373046875,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.0008179999999999999,
- "loss": 0.1084,
+ "loss": 0.1065,
"macro_f1": 0.32098764181137085,
"num_tokens": 660593.0,
"repeat_count": 0.0,
- "routers_loss": 0.07298308610916138,
+ "routers_loss": 0.07591600716114044,
"skip_count": 2.0,
"step": 410,
"text_loss": 0.449823260307312
@@ -3912,13 +3912,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15625,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0008219999999999999,
- "loss": 0.0802,
+ "loss": 0.0795,
"macro_f1": 0.3333333432674408,
"num_tokens": 663916.0,
"repeat_count": 0.0,
- "routers_loss": 0.024257874116301537,
+ "routers_loss": 0.02076602540910244,
"skip_count": 0.0,
"step": 412,
"text_loss": 0.4764713943004608
@@ -3931,13 +3931,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.000826,
- "loss": 0.0842,
+ "loss": 0.0836,
"macro_f1": 0.3272727429866791,
"num_tokens": 667502.0,
"repeat_count": 0.0,
- "routers_loss": 0.048864223062992096,
+ "routers_loss": 0.049170155078172684,
"skip_count": 1.0,
"step": 414,
"text_loss": 0.30333325266838074
@@ -3950,13 +3950,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1513671875,
"learning_rate": 0.00083,
- "loss": 0.1026,
+ "loss": 0.1021,
"macro_f1": 0.3272727429866791,
"num_tokens": 670510.0,
"repeat_count": 1.0,
- "routers_loss": 0.1592330038547516,
+ "routers_loss": 0.15554003417491913,
"skip_count": 0.0,
"step": 416,
"text_loss": 0.3691870868206024
@@ -3969,13 +3969,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000834,
- "loss": 0.0963,
+ "loss": 0.1013,
"macro_f1": 0.3333333432674408,
"num_tokens": 674761.0,
"repeat_count": 0.0,
- "routers_loss": 0.02291976846754551,
+ "routers_loss": 0.024516675621271133,
"skip_count": 0.0,
"step": 418,
"text_loss": 0.32850381731987
@@ -3988,13 +3988,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.000838,
- "loss": 0.0634,
+ "loss": 0.0649,
"macro_f1": 0.3333333432674408,
"num_tokens": 678055.0,
"repeat_count": 0.0,
- "routers_loss": 0.010272650048136711,
+ "routers_loss": 0.011026890948414803,
"skip_count": 0.0,
"step": 420,
"text_loss": 0.6637290716171265
@@ -4007,13 +4007,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000842,
- "loss": 0.0786,
+ "loss": 0.0771,
"macro_f1": 0.3272727429866791,
"num_tokens": 680979.0,
"repeat_count": 0.0,
- "routers_loss": 0.0692613497376442,
+ "routers_loss": 0.07451887428760529,
"skip_count": 1.0,
"step": 422,
"text_loss": 0.27131685614585876
@@ -4026,13 +4026,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12890625,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.000846,
- "loss": 0.0706,
+ "loss": 0.0714,
"macro_f1": 0.32098764181137085,
"num_tokens": 684144.0,
"repeat_count": 1.0,
- "routers_loss": 0.12713804841041565,
+ "routers_loss": 0.11341800540685654,
"skip_count": 1.0,
"step": 424,
"text_loss": 0.652126669883728
@@ -4045,13 +4045,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.00085,
- "loss": 0.0758,
+ "loss": 0.0754,
"macro_f1": 0.3272727429866791,
"num_tokens": 687004.0,
"repeat_count": 1.0,
- "routers_loss": 0.08670130372047424,
+ "routers_loss": 0.08985847979784012,
"skip_count": 0.0,
"step": 426,
"text_loss": 0.2589428424835205
@@ -4064,13 +4064,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.23828125,
"learning_rate": 0.000854,
- "loss": 0.0857,
+ "loss": 0.0866,
"macro_f1": 0.3333333432674408,
"num_tokens": 689702.0,
"repeat_count": 0.0,
- "routers_loss": 0.01053862925618887,
+ "routers_loss": 0.011355436407029629,
"skip_count": 0.0,
"step": 428,
"text_loss": 0.8909716010093689
@@ -4083,13 +4083,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.000858,
- "loss": 0.0615,
+ "loss": 0.0623,
"macro_f1": 0.3333333432674408,
"num_tokens": 692698.0,
"repeat_count": 0.0,
- "routers_loss": 0.012946994043886662,
+ "routers_loss": 0.013788948766887188,
"skip_count": 0.0,
"step": 430,
"text_loss": 0.19141142070293427
@@ -4102,13 +4102,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.000862,
- "loss": 0.0498,
+ "loss": 0.0499,
"macro_f1": 0.32098764181137085,
"num_tokens": 696007.0,
"repeat_count": 0.0,
- "routers_loss": 0.08222822099924088,
+ "routers_loss": 0.07998392730951309,
"skip_count": 2.0,
"step": 432,
"text_loss": 0.1611809879541397
@@ -4121,13 +4121,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.173828125,
"learning_rate": 0.000866,
- "loss": 0.0532,
+ "loss": 0.0541,
"macro_f1": 0.32098764181137085,
"num_tokens": 700271.0,
"repeat_count": 0.0,
- "routers_loss": 0.07086442410945892,
+ "routers_loss": 0.06988382339477539,
"skip_count": 2.0,
"step": 434,
"text_loss": 0.37254223227500916
@@ -4140,13 +4140,13 @@
"f1_execute": 0.8333333730697632,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.00087,
- "loss": 0.0825,
+ "loss": 0.0834,
"macro_f1": 0.2777777910232544,
"num_tokens": 703519.0,
"repeat_count": 3.0,
- "routers_loss": 0.29007306694984436,
+ "routers_loss": 0.28240787982940674,
"skip_count": 5.0,
"step": 436,
"text_loss": 0.29636648297309875
@@ -4159,13 +4159,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.396484375,
+ "grad_norm": 0.423828125,
"learning_rate": 0.000874,
- "loss": 0.0658,
+ "loss": 0.0657,
"macro_f1": 0.3333333432674408,
"num_tokens": 706826.0,
"repeat_count": 0.0,
- "routers_loss": 0.014652491547167301,
+ "routers_loss": 0.013924967497587204,
"skip_count": 0.0,
"step": 438,
"text_loss": 0.20867908000946045
@@ -4178,13 +4178,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000878,
- "loss": 0.0685,
+ "loss": 0.0657,
"macro_f1": 0.3333333432674408,
"num_tokens": 710530.0,
"repeat_count": 0.0,
- "routers_loss": 0.013720969669520855,
+ "routers_loss": 0.01170142088085413,
"skip_count": 0.0,
"step": 440,
"text_loss": 0.7273373007774353
@@ -4197,13 +4197,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.173828125,
+ "grad_norm": 0.171875,
"learning_rate": 0.000882,
- "loss": 0.0771,
+ "loss": 0.076,
"macro_f1": 0.3333333432674408,
"num_tokens": 713503.0,
"repeat_count": 0.0,
- "routers_loss": 0.011687638238072395,
+ "routers_loss": 0.011930872686207294,
"skip_count": 0.0,
"step": 442,
"text_loss": 0.39314430952072144
@@ -4216,13 +4216,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0008860000000000001,
- "loss": 0.0604,
+ "loss": 0.0592,
"macro_f1": 0.3333333432674408,
"num_tokens": 716582.0,
"repeat_count": 0.0,
- "routers_loss": 0.007869532331824303,
+ "routers_loss": 0.008630385622382164,
"skip_count": 0.0,
"step": 444,
"text_loss": 0.5925271511077881
@@ -4230,18 +4230,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.0939242735544465,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.23046875,
"learning_rate": 0.0008900000000000001,
- "loss": 0.0797,
- "macro_f1": 0.3076923191547394,
+ "loss": 0.0811,
+ "macro_f1": 0.3006536066532135,
"num_tokens": 719941.0,
"repeat_count": 3.0,
- "routers_loss": 0.3034668564796448,
+ "routers_loss": 0.3015584945678711,
"skip_count": 1.0,
"step": 446,
"text_loss": 0.5059905052185059
@@ -4254,13 +4254,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2314453125,
+ "grad_norm": 0.203125,
"learning_rate": 0.000894,
- "loss": 0.0823,
+ "loss": 0.0822,
"macro_f1": 0.31446540355682373,
"num_tokens": 723113.0,
"repeat_count": 1.0,
- "routers_loss": 0.11066079139709473,
+ "routers_loss": 0.10897493362426758,
"skip_count": 1.0,
"step": 448,
"text_loss": 0.19616436958312988
@@ -4273,13 +4273,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.33984375,
"learning_rate": 0.000898,
- "loss": 0.0773,
+ "loss": 0.0782,
"macro_f1": 0.32098764181137085,
"num_tokens": 726193.0,
"repeat_count": 0.0,
- "routers_loss": 0.0755370482802391,
+ "routers_loss": 0.07236456125974655,
"skip_count": 2.0,
"step": 450,
"text_loss": 0.1773054152727127
@@ -4292,13 +4292,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.3203125,
"learning_rate": 0.000902,
- "loss": 0.0596,
+ "loss": 0.058,
"macro_f1": 0.3272727429866791,
"num_tokens": 729275.0,
"repeat_count": 1.0,
- "routers_loss": 0.08470689505338669,
+ "routers_loss": 0.08184371143579483,
"skip_count": 0.0,
"step": 452,
"text_loss": 0.4927310049533844
@@ -4311,13 +4311,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19921875,
+ "grad_norm": 0.1953125,
"learning_rate": 0.000906,
- "loss": 0.0608,
+ "loss": 0.0607,
"macro_f1": 0.3333333432674408,
"num_tokens": 731948.0,
"repeat_count": 0.0,
- "routers_loss": 0.0130238626152277,
+ "routers_loss": 0.014033539220690727,
"skip_count": 0.0,
"step": 454,
"text_loss": 0.4745742678642273
@@ -4330,13 +4330,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.00091,
- "loss": 0.0652,
+ "loss": 0.0651,
"macro_f1": 0.3333333432674408,
"num_tokens": 735351.0,
"repeat_count": 0.0,
- "routers_loss": 0.007108641788363457,
+ "routers_loss": 0.0071774693205952644,
"skip_count": 0.0,
"step": 456,
"text_loss": 0.18523462116718292
@@ -4351,11 +4351,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.400390625,
"learning_rate": 0.0009140000000000001,
- "loss": 0.0746,
+ "loss": 0.0738,
"macro_f1": 0.5492662787437439,
"num_tokens": 738587.0,
"repeat_count": 0.0,
- "routers_loss": 0.06834109872579575,
+ "routers_loss": 0.07781517505645752,
"skip_count": 2.0,
"step": 458,
"text_loss": 0.3459635376930237
@@ -4368,13 +4368,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.28125,
"learning_rate": 0.0009180000000000001,
- "loss": 0.0733,
+ "loss": 0.0723,
"macro_f1": 0.3076923191547394,
"num_tokens": 741779.0,
"repeat_count": 0.0,
- "routers_loss": 0.10230778902769089,
+ "routers_loss": 0.09529037028551102,
"skip_count": 2.0,
"step": 460,
"text_loss": 0.20197433233261108
@@ -4387,13 +4387,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.1865234375,
"learning_rate": 0.0009220000000000001,
- "loss": 0.0528,
+ "loss": 0.0519,
"macro_f1": 0.3333333432674408,
"num_tokens": 745355.0,
"repeat_count": 0.0,
- "routers_loss": 0.009987542405724525,
+ "routers_loss": 0.009765669703483582,
"skip_count": 0.0,
"step": 462,
"text_loss": 0.7031404376029968
@@ -4406,13 +4406,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.125,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009260000000000001,
- "loss": 0.0536,
+ "loss": 0.0527,
"macro_f1": 0.3272727429866791,
"num_tokens": 748628.0,
"repeat_count": 0.0,
- "routers_loss": 0.03448869287967682,
+ "routers_loss": 0.03344850242137909,
"skip_count": 1.0,
"step": 464,
"text_loss": 0.21274663507938385
@@ -4425,13 +4425,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.173828125,
"learning_rate": 0.00093,
- "loss": 0.053,
+ "loss": 0.0534,
"macro_f1": 0.3076923191547394,
"num_tokens": 751472.0,
"repeat_count": 2.0,
- "routers_loss": 0.13631699979305267,
+ "routers_loss": 0.1354292333126068,
"skip_count": 2.0,
"step": 466,
"text_loss": 0.5350717306137085
@@ -4444,13 +4444,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.142578125,
"learning_rate": 0.000934,
- "loss": 0.06,
+ "loss": 0.0598,
"macro_f1": 0.3272727429866791,
"num_tokens": 754479.0,
"repeat_count": 0.0,
- "routers_loss": 0.053951870650053024,
+ "routers_loss": 0.056420840322971344,
"skip_count": 1.0,
"step": 468,
"text_loss": 0.28153330087661743
@@ -4463,13 +4463,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.228515625,
+ "grad_norm": 0.234375,
"learning_rate": 0.0009379999999999999,
- "loss": 0.059,
+ "loss": 0.0597,
"macro_f1": 0.31446540355682373,
"num_tokens": 757872.0,
"repeat_count": 1.0,
- "routers_loss": 0.14479905366897583,
+ "routers_loss": 0.1622387170791626,
"skip_count": 1.0,
"step": 470,
"text_loss": 0.22956843674182892
@@ -4482,13 +4482,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.44140625,
+ "grad_norm": 0.5,
"learning_rate": 0.000942,
- "loss": 0.0913,
+ "loss": 0.0953,
"macro_f1": 0.32098764181137085,
"num_tokens": 760468.0,
"repeat_count": 0.0,
- "routers_loss": 0.056221429258584976,
+ "routers_loss": 0.05146972835063934,
"skip_count": 2.0,
"step": 472,
"text_loss": 0.4513966739177704
@@ -4501,13 +4501,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000946,
- "loss": 0.0591,
+ "loss": 0.0592,
"macro_f1": 0.3272727429866791,
"num_tokens": 763519.0,
"repeat_count": 1.0,
- "routers_loss": 0.09729792177677155,
+ "routers_loss": 0.09022669494152069,
"skip_count": 0.0,
"step": 474,
"text_loss": 0.25758957862854004
@@ -4520,13 +4520,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12158203125,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.00095,
- "loss": 0.0496,
+ "loss": 0.0498,
"macro_f1": 0.3272727429866791,
"num_tokens": 767391.0,
"repeat_count": 0.0,
- "routers_loss": 0.029447713866829872,
+ "routers_loss": 0.03044828027486801,
"skip_count": 1.0,
"step": 476,
"text_loss": 0.21366681158542633
@@ -4539,13 +4539,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.271484375,
+ "grad_norm": 0.291015625,
"learning_rate": 0.000954,
- "loss": 0.0801,
+ "loss": 0.0802,
"macro_f1": 0.3272727429866791,
"num_tokens": 770338.0,
"repeat_count": 0.0,
- "routers_loss": 0.09337342530488968,
+ "routers_loss": 0.10397060960531235,
"skip_count": 1.0,
"step": 478,
"text_loss": 1.0396177768707275
@@ -4560,11 +4560,11 @@
"f1_skip": 0.0,
"grad_norm": 0.267578125,
"learning_rate": 0.000958,
- "loss": 0.1102,
+ "loss": 0.1099,
"macro_f1": 0.285714328289032,
"num_tokens": 773699.0,
"repeat_count": 2.0,
- "routers_loss": 0.23193210363388062,
+ "routers_loss": 0.22604143619537354,
"skip_count": 4.0,
"step": 480,
"text_loss": 0.2570283114910126
@@ -4572,18 +4572,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.2629879659524508,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.146484375,
"learning_rate": 0.000962,
- "loss": 0.0669,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0667,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 777473.0,
"repeat_count": 0.0,
- "routers_loss": 0.046257760375738144,
+ "routers_loss": 0.048258859664201736,
"skip_count": 1.0,
"step": 482,
"text_loss": 0.2540103495121002
@@ -4596,13 +4596,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1708984375,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000966,
- "loss": 0.0552,
+ "loss": 0.0592,
"macro_f1": 0.3333333432674408,
"num_tokens": 780833.0,
"repeat_count": 0.0,
- "routers_loss": 0.01683143898844719,
+ "routers_loss": 0.023018671199679375,
"skip_count": 0.0,
"step": 484,
"text_loss": 0.38524550199508667
@@ -4615,13 +4615,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.326171875,
+ "grad_norm": 0.314453125,
"learning_rate": 0.0009699999999999999,
- "loss": 0.071,
+ "loss": 0.0709,
"macro_f1": 0.3272727429866791,
"num_tokens": 783656.0,
"repeat_count": 0.0,
- "routers_loss": 0.04129387438297272,
+ "routers_loss": 0.044845327734947205,
"skip_count": 1.0,
"step": 486,
"text_loss": 0.5859048366546631
@@ -4634,13 +4634,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000974,
- "loss": 0.0605,
+ "loss": 0.0615,
"macro_f1": 0.3333333432674408,
"num_tokens": 787173.0,
"repeat_count": 0.0,
- "routers_loss": 0.01262948103249073,
+ "routers_loss": 0.010898692533373833,
"skip_count": 0.0,
"step": 488,
"text_loss": 0.3456067442893982
@@ -4653,13 +4653,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000978,
- "loss": 0.081,
+ "loss": 0.0796,
"macro_f1": 0.32098764181137085,
"num_tokens": 790395.0,
"repeat_count": 0.0,
- "routers_loss": 0.07404553890228271,
+ "routers_loss": 0.06497956812381744,
"skip_count": 2.0,
"step": 490,
"text_loss": 0.3751123249530792
@@ -4672,13 +4672,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000982,
- "loss": 0.0751,
+ "loss": 0.0772,
"macro_f1": 0.3272727429866791,
"num_tokens": 793137.0,
"repeat_count": 0.0,
- "routers_loss": 0.06795930862426758,
+ "routers_loss": 0.07763728499412537,
"skip_count": 1.0,
"step": 492,
"text_loss": 0.43296709656715393
@@ -4691,13 +4691,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1416015625,
"learning_rate": 0.0009860000000000001,
- "loss": 0.0804,
+ "loss": 0.0819,
"macro_f1": 0.3333333432674408,
"num_tokens": 796497.0,
"repeat_count": 0.0,
- "routers_loss": 0.02233024686574936,
+ "routers_loss": 0.02127906307578087,
"skip_count": 0.0,
"step": 494,
"text_loss": 0.4841311275959015
@@ -4710,13 +4710,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.00099,
- "loss": 0.0731,
+ "loss": 0.073,
"macro_f1": 0.3272727429866791,
"num_tokens": 799361.0,
"repeat_count": 1.0,
- "routers_loss": 0.07979031652212143,
+ "routers_loss": 0.09518691152334213,
"skip_count": 0.0,
"step": 496,
"text_loss": 0.5094487071037292
@@ -4729,13 +4729,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.000994,
- "loss": 0.0795,
+ "loss": 0.0789,
"macro_f1": 0.5492662787437439,
"num_tokens": 802629.0,
"repeat_count": 0.0,
- "routers_loss": 0.045646365731954575,
+ "routers_loss": 0.0563947930932045,
"skip_count": 2.0,
"step": 498,
"text_loss": 0.42783617973327637
@@ -4748,13 +4748,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.1865234375,
"learning_rate": 0.000998,
"loss": 0.0476,
"macro_f1": 0.3272727429866791,
"num_tokens": 805881.0,
"repeat_count": 1.0,
- "routers_loss": 0.09717849642038345,
+ "routers_loss": 0.10570426285266876,
"skip_count": 0.0,
"step": 500,
"text_loss": 0.28395503759384155
@@ -4767,13 +4767,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.30078125,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009999999760498814,
- "loss": 0.0894,
+ "loss": 0.0849,
"macro_f1": 0.5492662787437439,
"num_tokens": 809283.0,
"repeat_count": 0.0,
- "routers_loss": 0.03948225453495979,
+ "routers_loss": 0.031202208250761032,
"skip_count": 2.0,
"step": 502,
"text_loss": 0.32970911264419556
@@ -4786,13 +4786,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15625,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009999997844489475,
- "loss": 0.0557,
+ "loss": 0.0574,
"macro_f1": 0.3272727429866791,
"num_tokens": 812440.0,
"repeat_count": 0.0,
- "routers_loss": 0.0742638111114502,
+ "routers_loss": 0.07647835463285446,
"skip_count": 1.0,
"step": 504,
"text_loss": 0.4901447296142578
@@ -4805,13 +4805,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.25,
"learning_rate": 0.000999999401247153,
- "loss": 0.0682,
+ "loss": 0.0668,
"macro_f1": 0.32098764181137085,
"num_tokens": 815716.0,
"repeat_count": 0.0,
- "routers_loss": 0.08293049037456512,
+ "routers_loss": 0.08515176922082901,
"skip_count": 2.0,
"step": 506,
"text_loss": 0.6157599687576294
@@ -4824,13 +4824,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.25390625,
"learning_rate": 0.0009999988264446445,
- "loss": 0.0697,
+ "loss": 0.0686,
"macro_f1": 0.3333333432674408,
"num_tokens": 819086.0,
"repeat_count": 0.0,
- "routers_loss": 0.010080376639962196,
+ "routers_loss": 0.00946938619017601,
"skip_count": 0.0,
"step": 508,
"text_loss": 0.5053519010543823
@@ -4843,13 +4843,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009999980600416424,
- "loss": 0.0611,
+ "loss": 0.0574,
"macro_f1": 0.3333333432674408,
"num_tokens": 822268.0,
"repeat_count": 0.0,
- "routers_loss": 0.009179878048598766,
+ "routers_loss": 0.01058756373822689,
"skip_count": 0.0,
"step": 510,
"text_loss": 0.5570021867752075
@@ -4862,13 +4862,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11083984375,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.000999997102038441,
- "loss": 0.0689,
+ "loss": 0.0678,
"macro_f1": 0.3333333432674408,
"num_tokens": 825728.0,
"repeat_count": 0.0,
- "routers_loss": 0.006718529388308525,
+ "routers_loss": 0.008705209009349346,
"skip_count": 0.0,
"step": 512,
"text_loss": 0.6519040465354919
@@ -4881,13 +4881,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.220703125,
"learning_rate": 0.0009999959524354064,
- "loss": 0.0826,
+ "loss": 0.083,
"macro_f1": 0.3272727429866791,
"num_tokens": 829459.0,
"repeat_count": 0.0,
- "routers_loss": 0.049344487488269806,
+ "routers_loss": 0.04024193435907364,
"skip_count": 1.0,
"step": 514,
"text_loss": 0.5290043950080872
@@ -4900,13 +4900,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.259765625,
+ "grad_norm": 0.25390625,
"learning_rate": 0.00099999461123298,
- "loss": 0.0739,
+ "loss": 0.0727,
"macro_f1": 0.3333333432674408,
"num_tokens": 832291.0,
"repeat_count": 0.0,
- "routers_loss": 0.013402626849710941,
+ "routers_loss": 0.015742862597107887,
"skip_count": 0.0,
"step": 516,
"text_loss": 0.7910057902336121
@@ -4919,13 +4919,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.000999993078431675,
- "loss": 0.0761,
+ "loss": 0.0759,
"macro_f1": 0.3076923191547394,
"num_tokens": 835399.0,
"repeat_count": 1.0,
- "routers_loss": 0.16964484751224518,
+ "routers_loss": 0.16753782331943512,
"skip_count": 3.0,
"step": 518,
"text_loss": 0.45196083188056946
@@ -4938,13 +4938,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.236328125,
"learning_rate": 0.0009999913540320792,
- "loss": 0.095,
+ "loss": 0.0968,
"macro_f1": 0.31446540355682373,
"num_tokens": 838993.0,
"repeat_count": 0.0,
- "routers_loss": 0.08609295636415482,
+ "routers_loss": 0.09357143193483353,
"skip_count": 2.0,
"step": 520,
"text_loss": 0.5499435663223267
@@ -4957,13 +4957,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.2392578125,
+ "grad_norm": 0.2451171875,
"learning_rate": 0.0009999894380348536,
- "loss": 0.0816,
+ "loss": 0.0821,
"macro_f1": 0.5492662787437439,
"num_tokens": 842652.0,
"repeat_count": 0.0,
- "routers_loss": 0.05354784056544304,
+ "routers_loss": 0.056803856045007706,
"skip_count": 2.0,
"step": 522,
"text_loss": 0.197520449757576
@@ -4976,13 +4976,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.2333984375,
"learning_rate": 0.000999987330440732,
- "loss": 0.0715,
+ "loss": 0.0725,
"macro_f1": 0.4871794879436493,
"num_tokens": 847061.0,
"repeat_count": 0.0,
- "routers_loss": 0.09146631509065628,
+ "routers_loss": 0.08962195366621017,
"skip_count": 3.0,
"step": 524,
"text_loss": 0.27509039640426636
@@ -4995,13 +4995,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.189453125,
"learning_rate": 0.000999985031250522,
- "loss": 0.0574,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 850780.0,
"repeat_count": 0.0,
- "routers_loss": 0.02344255894422531,
+ "routers_loss": 0.022930558770895004,
"skip_count": 0.0,
"step": 526,
"text_loss": 0.13291706144809723
@@ -5014,13 +5014,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.197265625,
"learning_rate": 0.0009999825404651053,
- "loss": 0.0621,
+ "loss": 0.0614,
"macro_f1": 0.3333333432674408,
"num_tokens": 853886.0,
"repeat_count": 0.0,
- "routers_loss": 0.018271517008543015,
+ "routers_loss": 0.017097990959882736,
"skip_count": 0.0,
"step": 528,
"text_loss": 0.21706295013427734
@@ -5033,13 +5033,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2060546875,
+ "grad_norm": 0.212890625,
"learning_rate": 0.0009999798580854356,
- "loss": 0.0717,
+ "loss": 0.0724,
"macro_f1": 0.3333333432674408,
"num_tokens": 857364.0,
"repeat_count": 0.0,
- "routers_loss": 0.026990914717316628,
+ "routers_loss": 0.02831801027059555,
"skip_count": 0.0,
"step": 530,
"text_loss": 0.9035662412643433
@@ -5052,13 +5052,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16015625,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.000999976984112541,
- "loss": 0.0681,
+ "loss": 0.0674,
"macro_f1": 0.3333333432674408,
"num_tokens": 860661.0,
"repeat_count": 0.0,
- "routers_loss": 0.019737249240279198,
+ "routers_loss": 0.019671892747282982,
"skip_count": 0.0,
"step": 532,
"text_loss": 0.8354863524436951
@@ -5071,13 +5071,13 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.2890625,
"learning_rate": 0.0009999739185475231,
- "loss": 0.0978,
+ "loss": 0.0963,
"macro_f1": 0.47333335876464844,
"num_tokens": 864124.0,
"repeat_count": 2.0,
- "routers_loss": 0.212640181183815,
+ "routers_loss": 0.21383361518383026,
"skip_count": 3.0,
"step": 534,
"text_loss": 0.23422949016094208
@@ -5090,13 +5090,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0009999706613915565,
- "loss": 0.0602,
+ "loss": 0.0598,
"macro_f1": 0.32098767161369324,
"num_tokens": 866976.0,
"repeat_count": 0.0,
- "routers_loss": 0.07302755117416382,
+ "routers_loss": 0.07158871740102768,
"skip_count": 1.0,
"step": 536,
"text_loss": 0.11800774186849594
@@ -5109,13 +5109,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.296875,
+ "grad_norm": 0.26953125,
"learning_rate": 0.0009999672126458894,
- "loss": 0.0825,
+ "loss": 0.0822,
"macro_f1": 0.3272727429866791,
"num_tokens": 870549.0,
"repeat_count": 0.0,
- "routers_loss": 0.08667246252298355,
+ "routers_loss": 0.08185924589633942,
"skip_count": 1.0,
"step": 538,
"text_loss": 0.19232480227947235
@@ -5128,13 +5128,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1318359375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.000999963572311843,
- "loss": 0.0597,
+ "loss": 0.0604,
"macro_f1": 0.3333333432674408,
"num_tokens": 873733.0,
"repeat_count": 0.0,
- "routers_loss": 0.015047167427837849,
+ "routers_loss": 0.01633382774889469,
"skip_count": 0.0,
"step": 540,
"text_loss": 0.3725031912326813
@@ -5147,13 +5147,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009999597403908128,
- "loss": 0.076,
+ "loss": 0.0761,
"macro_f1": 0.3272727429866791,
"num_tokens": 877099.0,
"repeat_count": 0.0,
- "routers_loss": 0.07481446117162704,
+ "routers_loss": 0.0782657191157341,
"skip_count": 1.0,
"step": 542,
"text_loss": 0.17589199542999268
@@ -5166,13 +5166,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.0009999557168842669,
- "loss": 0.0724,
+ "loss": 0.0716,
"macro_f1": 0.5492662787437439,
"num_tokens": 879883.0,
"repeat_count": 0.0,
- "routers_loss": 0.049495212733745575,
+ "routers_loss": 0.05275818333029747,
"skip_count": 2.0,
"step": 544,
"text_loss": 0.26448264718055725
@@ -5185,13 +5185,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0009999515017937468,
- "loss": 0.0718,
+ "loss": 0.071,
"macro_f1": 0.32098764181137085,
"num_tokens": 882223.0,
"repeat_count": 0.0,
- "routers_loss": 0.08043002337217331,
+ "routers_loss": 0.09335892647504807,
"skip_count": 2.0,
"step": 546,
"text_loss": 0.208544060587883
@@ -5204,13 +5204,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.34765625,
+ "grad_norm": 0.376953125,
"learning_rate": 0.0009999470951208684,
- "loss": 0.086,
+ "loss": 0.0855,
"macro_f1": 0.32098764181137085,
"num_tokens": 885241.0,
"repeat_count": 2.0,
- "routers_loss": 0.22461950778961182,
+ "routers_loss": 0.22983254492282867,
"skip_count": 0.0,
"step": 548,
"text_loss": 0.6612338423728943
@@ -5223,13 +5223,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.216796875,
"learning_rate": 0.00099994249686732,
- "loss": 0.0798,
+ "loss": 0.0786,
"macro_f1": 0.3272727429866791,
"num_tokens": 887897.0,
"repeat_count": 1.0,
- "routers_loss": 0.11754962801933289,
+ "routers_loss": 0.12858282029628754,
"skip_count": 0.0,
"step": 550,
"text_loss": 0.4673548936843872
@@ -5242,13 +5242,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0009999377070348638,
- "loss": 0.0978,
+ "loss": 0.0944,
"macro_f1": 0.3333333432674408,
"num_tokens": 891224.0,
"repeat_count": 0.0,
- "routers_loss": 0.017412789165973663,
+ "routers_loss": 0.017421770840883255,
"skip_count": 0.0,
"step": 552,
"text_loss": 0.6419258117675781
@@ -5261,13 +5261,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.15625,
"learning_rate": 0.000999932725625335,
- "loss": 0.0792,
+ "loss": 0.0791,
"macro_f1": 0.32098764181137085,
"num_tokens": 894578.0,
"repeat_count": 0.0,
- "routers_loss": 0.08969525247812271,
+ "routers_loss": 0.07890026271343231,
"skip_count": 2.0,
"step": 554,
"text_loss": 0.5970752239227295
@@ -5280,13 +5280,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "grad_norm": 0.216796875,
"learning_rate": 0.0009999275526406427,
- "loss": 0.0803,
+ "loss": 0.0796,
"macro_f1": 0.31446540355682373,
"num_tokens": 897145.0,
"repeat_count": 1.0,
- "routers_loss": 0.09876437485218048,
+ "routers_loss": 0.09836960583925247,
"skip_count": 1.0,
"step": 556,
"text_loss": 0.752425491809845
@@ -5299,13 +5299,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1875,
"learning_rate": 0.0009999221880827693,
- "loss": 0.0887,
+ "loss": 0.0882,
"macro_f1": 0.3333333432674408,
"num_tokens": 900565.0,
"repeat_count": 0.0,
- "routers_loss": 0.019108204171061516,
+ "routers_loss": 0.017694659531116486,
"skip_count": 0.0,
"step": 558,
"text_loss": 0.195619136095047
@@ -5318,32 +5318,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2021484375,
"learning_rate": 0.0009999166319537703,
- "loss": 0.0573,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 903506.0,
"repeat_count": 0.0,
- "routers_loss": 0.019048813730478287,
+ "routers_loss": 0.019375264644622803,
"skip_count": 0.0,
"step": 560,
"text_loss": 0.4603337347507477
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 2.638685060170238,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1435546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.146484375,
"learning_rate": 0.0009999108842557748,
- "loss": 0.0947,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0953,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 906380.0,
"repeat_count": 0.0,
- "routers_loss": 0.11889495700597763,
+ "routers_loss": 0.12013207376003265,
"skip_count": 3.0,
"step": 562,
"text_loss": 0.6279402375221252
@@ -5356,13 +5356,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.228515625,
+ "grad_norm": 0.255859375,
"learning_rate": 0.0009999049449909854,
- "loss": 0.0771,
+ "loss": 0.0799,
"macro_f1": 0.3272727429866791,
"num_tokens": 909116.0,
"repeat_count": 0.0,
- "routers_loss": 0.06202332302927971,
+ "routers_loss": 0.06441342830657959,
"skip_count": 1.0,
"step": 564,
"text_loss": 0.23741699755191803
@@ -5375,13 +5375,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009998988141616781,
- "loss": 0.0623,
+ "loss": 0.064,
"macro_f1": 0.32098767161369324,
"num_tokens": 912189.0,
"repeat_count": 0.0,
- "routers_loss": 0.08294244855642319,
+ "routers_loss": 0.08309414982795715,
"skip_count": 1.0,
"step": 566,
"text_loss": 0.27780941128730774
@@ -5394,13 +5394,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009998924917702023,
- "loss": 0.0885,
+ "loss": 0.0876,
"macro_f1": 0.3272727429866791,
"num_tokens": 916279.0,
"repeat_count": 1.0,
- "routers_loss": 0.07545182853937149,
+ "routers_loss": 0.07197169959545135,
"skip_count": 0.0,
"step": 568,
"text_loss": 0.6371755599975586
@@ -5413,13 +5413,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2255859375,
"learning_rate": 0.0009998859778189806,
- "loss": 0.0712,
+ "loss": 0.0706,
"macro_f1": 0.3333333432674408,
"num_tokens": 919490.0,
"repeat_count": 0.0,
- "routers_loss": 0.008711219765245914,
+ "routers_loss": 0.008022273890674114,
"skip_count": 0.0,
"step": 570,
"text_loss": 0.6028938889503479
@@ -5432,13 +5432,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.000999879272310509,
- "loss": 0.0837,
+ "loss": 0.084,
"macro_f1": 0.3333333432674408,
"num_tokens": 923694.0,
"repeat_count": 0.0,
- "routers_loss": 0.01639273390173912,
+ "routers_loss": 0.01634674146771431,
"skip_count": 0.0,
"step": 572,
"text_loss": 0.7177054286003113
@@ -5451,13 +5451,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.17578125,
"learning_rate": 0.0009998723752473574,
- "loss": 0.0707,
+ "loss": 0.0716,
"macro_f1": 0.3272727429866791,
"num_tokens": 926933.0,
"repeat_count": 0.0,
- "routers_loss": 0.04997137933969498,
+ "routers_loss": 0.060559045523405075,
"skip_count": 1.0,
"step": 574,
"text_loss": 0.5203254818916321
@@ -5470,13 +5470,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1845703125,
+ "grad_norm": 0.185546875,
"learning_rate": 0.0009998652866321687,
- "loss": 0.0799,
+ "loss": 0.0801,
"macro_f1": 0.3333333432674408,
"num_tokens": 929832.0,
"repeat_count": 0.0,
- "routers_loss": 0.011360209435224533,
+ "routers_loss": 0.011485611088573933,
"skip_count": 0.0,
"step": 576,
"text_loss": 0.6147452592849731
@@ -5489,13 +5489,13 @@
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.000999858006467659,
- "loss": 0.0658,
+ "loss": 0.0649,
"macro_f1": 0.29333335161209106,
"num_tokens": 933266.0,
"repeat_count": 2.0,
- "routers_loss": 0.31349560618400574,
+ "routers_loss": 0.2929030954837799,
"skip_count": 4.0,
"step": 578,
"text_loss": 0.1720666140317917
@@ -5508,13 +5508,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.24609375,
"learning_rate": 0.0009998505347566186,
- "loss": 0.0801,
+ "loss": 0.0782,
"macro_f1": 0.32098764181137085,
"num_tokens": 937545.0,
"repeat_count": 0.0,
- "routers_loss": 0.058660347014665604,
+ "routers_loss": 0.053780000656843185,
"skip_count": 2.0,
"step": 580,
"text_loss": 0.3258405327796936
@@ -5527,13 +5527,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1416015625,
"learning_rate": 0.00099984287150191,
- "loss": 0.0578,
+ "loss": 0.0582,
"macro_f1": 0.3333333432674408,
"num_tokens": 941001.0,
"repeat_count": 0.0,
- "routers_loss": 0.025836754590272903,
+ "routers_loss": 0.02637636847794056,
"skip_count": 0.0,
"step": 582,
"text_loss": 0.23762771487236023
@@ -5546,13 +5546,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009998350167064705,
- "loss": 0.0683,
+ "loss": 0.0672,
"macro_f1": 0.3333333432674408,
"num_tokens": 943989.0,
"repeat_count": 0.0,
- "routers_loss": 0.016504868865013123,
+ "routers_loss": 0.01637580618262291,
"skip_count": 0.0,
"step": 584,
"text_loss": 0.7460582852363586
@@ -5565,13 +5565,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1787109375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009998269703733096,
- "loss": 0.0685,
+ "loss": 0.0686,
"macro_f1": 0.3272727429866791,
"num_tokens": 947245.0,
"repeat_count": 1.0,
- "routers_loss": 0.1379794180393219,
+ "routers_loss": 0.13934117555618286,
"skip_count": 0.0,
"step": 586,
"text_loss": 0.5284690260887146
@@ -5584,13 +5584,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.13671875,
"learning_rate": 0.0009998187325055106,
- "loss": 0.0657,
+ "loss": 0.0667,
"macro_f1": 0.3333333432674408,
"num_tokens": 950116.0,
"repeat_count": 0.0,
- "routers_loss": 0.01802757754921913,
+ "routers_loss": 0.02138397842645645,
"skip_count": 0.0,
"step": 588,
"text_loss": 0.3920256197452545
@@ -5603,13 +5603,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009998103031062305,
- "loss": 0.0762,
+ "loss": 0.0778,
"macro_f1": 0.3333333432674408,
"num_tokens": 953277.0,
"repeat_count": 0.0,
- "routers_loss": 0.006902900990098715,
+ "routers_loss": 0.007098200265318155,
"skip_count": 0.0,
"step": 590,
"text_loss": 0.7472905516624451
@@ -5622,13 +5622,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.318359375,
"learning_rate": 0.0009998016821786994,
- "loss": 0.0912,
+ "loss": 0.0872,
"macro_f1": 0.32098764181137085,
"num_tokens": 958229.0,
"repeat_count": 1.0,
- "routers_loss": 0.08348741382360458,
+ "routers_loss": 0.07946522533893585,
"skip_count": 1.0,
"step": 592,
"text_loss": 0.5506448745727539
@@ -5641,13 +5641,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.000999792869726221,
- "loss": 0.0527,
+ "loss": 0.0523,
"macro_f1": 0.3272727429866791,
"num_tokens": 961016.0,
"repeat_count": 0.0,
- "routers_loss": 0.08290062099695206,
+ "routers_loss": 0.0850791186094284,
"skip_count": 1.0,
"step": 594,
"text_loss": 0.3824431002140045
@@ -5660,13 +5660,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009997838657521717,
- "loss": 0.0643,
+ "loss": 0.0632,
"macro_f1": 0.3333333432674408,
"num_tokens": 963847.0,
"repeat_count": 0.0,
- "routers_loss": 0.018620988354086876,
+ "routers_loss": 0.016370445489883423,
"skip_count": 0.0,
"step": 596,
"text_loss": 0.2139475792646408
@@ -5679,13 +5679,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009997746702600026,
- "loss": 0.073,
+ "loss": 0.0702,
"macro_f1": 0.307692289352417,
"num_tokens": 966619.0,
"repeat_count": 0.0,
- "routers_loss": 0.1211671382188797,
+ "routers_loss": 0.1310746818780899,
"skip_count": 3.0,
"step": 598,
"text_loss": 0.3651018440723419
@@ -5698,13 +5698,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.23828125,
"learning_rate": 0.0009997652832532372,
- "loss": 0.079,
+ "loss": 0.0792,
"macro_f1": 0.3272727429866791,
"num_tokens": 970418.0,
"repeat_count": 1.0,
- "routers_loss": 0.15485027432441711,
+ "routers_loss": 0.14303378760814667,
"skip_count": 0.0,
"step": 600,
"text_loss": 0.7094736099243164
@@ -5717,13 +5717,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009997557047354722,
- "loss": 0.0562,
+ "loss": 0.0531,
"macro_f1": 0.3272727429866791,
"num_tokens": 973491.0,
"repeat_count": 0.0,
- "routers_loss": 0.036684274673461914,
+ "routers_loss": 0.03334212675690651,
"skip_count": 1.0,
"step": 602,
"text_loss": 0.4812237024307251
@@ -5731,18 +5731,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.835926034634576,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.302734375,
+ "grad_norm": 0.2890625,
"learning_rate": 0.0009997459347103783,
- "loss": 0.0985,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0956,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 976672.0,
"repeat_count": 0.0,
- "routers_loss": 0.026901578530669212,
+ "routers_loss": 0.02831871062517166,
"skip_count": 0.0,
"step": 604,
"text_loss": 0.21737146377563477
@@ -5755,13 +5755,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12158203125,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009997359731816998,
- "loss": 0.0632,
+ "loss": 0.0646,
"macro_f1": 0.3333333432674408,
"num_tokens": 979898.0,
"repeat_count": 0.0,
- "routers_loss": 0.01700405217707157,
+ "routers_loss": 0.017968013882637024,
"skip_count": 0.0,
"step": 606,
"text_loss": 0.5458008050918579
@@ -5774,13 +5774,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2099609375,
+ "grad_norm": 0.224609375,
"learning_rate": 0.0009997258201532536,
- "loss": 0.0758,
+ "loss": 0.0751,
"macro_f1": 0.3333333432674408,
"num_tokens": 982811.0,
"repeat_count": 0.0,
- "routers_loss": 0.015013590455055237,
+ "routers_loss": 0.016256732866168022,
"skip_count": 0.0,
"step": 608,
"text_loss": 0.8643257021903992
@@ -5793,13 +5793,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009997154756289303,
- "loss": 0.0576,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 985245.0,
"repeat_count": 0.0,
- "routers_loss": 0.02037946693599224,
+ "routers_loss": 0.021214161068201065,
"skip_count": 0.0,
"step": 610,
"text_loss": 0.2204967886209488
@@ -5812,13 +5812,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.150390625,
"learning_rate": 0.000999704939612694,
- "loss": 0.0648,
+ "loss": 0.0636,
"macro_f1": 0.3006536364555359,
"num_tokens": 988539.0,
"repeat_count": 3.0,
- "routers_loss": 0.22834022343158722,
+ "routers_loss": 0.23249399662017822,
"skip_count": 2.0,
"step": 612,
"text_loss": 0.32489025592803955
@@ -5831,13 +5831,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009996942121085824,
- "loss": 0.0449,
+ "loss": 0.0445,
"macro_f1": 0.3333333432674408,
"num_tokens": 991660.0,
"repeat_count": 0.0,
- "routers_loss": 0.009838113561272621,
+ "routers_loss": 0.010706410743296146,
"skip_count": 0.0,
"step": 614,
"text_loss": 0.4551754891872406
@@ -5850,13 +5850,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.3671875,
"learning_rate": 0.000999683293120706,
- "loss": 0.1009,
+ "loss": 0.1016,
"macro_f1": 0.3333333432674408,
"num_tokens": 994828.0,
"repeat_count": 0.0,
- "routers_loss": 0.005943270865827799,
+ "routers_loss": 0.006676184479147196,
"skip_count": 0.0,
"step": 616,
"text_loss": 0.6212068200111389
@@ -5869,13 +5869,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.38671875,
+ "grad_norm": 0.408203125,
"learning_rate": 0.0009996721826532491,
- "loss": 0.0941,
+ "loss": 0.0976,
"macro_f1": 0.3076923191547394,
"num_tokens": 997951.0,
"repeat_count": 2.0,
- "routers_loss": 0.21597740054130554,
+ "routers_loss": 0.2148125320672989,
"skip_count": 2.0,
"step": 618,
"text_loss": 0.26514527201652527
@@ -5888,13 +5888,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.1904296875,
"learning_rate": 0.000999660880710469,
- "loss": 0.0896,
+ "loss": 0.0909,
"macro_f1": 0.3333333432674408,
"num_tokens": 1001139.0,
"repeat_count": 0.0,
- "routers_loss": 0.023726588115096092,
+ "routers_loss": 0.022332455962896347,
"skip_count": 0.0,
"step": 620,
"text_loss": 0.26131340861320496
@@ -5907,13 +5907,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009996493872966971,
"loss": 0.0732,
"macro_f1": 0.3272727429866791,
"num_tokens": 1003678.0,
"repeat_count": 1.0,
- "routers_loss": 0.08467255532741547,
+ "routers_loss": 0.08348730951547623,
"skip_count": 0.0,
"step": 622,
"text_loss": 0.19151706993579865
@@ -5926,13 +5926,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009996377024163374,
- "loss": 0.0816,
+ "loss": 0.0822,
"macro_f1": 0.3333333432674408,
"num_tokens": 1007082.0,
"repeat_count": 0.0,
- "routers_loss": 0.029468854889273643,
+ "routers_loss": 0.028577150776982307,
"skip_count": 0.0,
"step": 624,
"text_loss": 0.305387407541275
@@ -5945,13 +5945,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.11279296875,
"learning_rate": 0.0009996258260738676,
- "loss": 0.0891,
+ "loss": 0.0892,
"macro_f1": 0.3272727429866791,
"num_tokens": 1010064.0,
"repeat_count": 1.0,
- "routers_loss": 0.09438466280698776,
+ "routers_loss": 0.08312026411294937,
"skip_count": 0.0,
"step": 626,
"text_loss": 0.49436143040657043
@@ -5964,13 +5964,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009996137582738388,
- "loss": 0.0581,
+ "loss": 0.0591,
"macro_f1": 0.3333333432674408,
"num_tokens": 1013462.0,
"repeat_count": 0.0,
- "routers_loss": 0.013679586350917816,
+ "routers_loss": 0.013337327167391777,
"skip_count": 0.0,
"step": 628,
"text_loss": 0.6515294313430786
@@ -5983,13 +5983,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.140625,
"learning_rate": 0.000999601499020875,
- "loss": 0.0528,
+ "loss": 0.0537,
"macro_f1": 0.3333333432674408,
"num_tokens": 1016246.0,
"repeat_count": 0.0,
- "routers_loss": 0.029532987624406815,
+ "routers_loss": 0.029126765206456184,
"skip_count": 0.0,
"step": 630,
"text_loss": 0.18834827840328217
@@ -6002,13 +6002,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009995890483196746,
- "loss": 0.0601,
+ "loss": 0.0602,
"macro_f1": 0.3272727429866791,
"num_tokens": 1019286.0,
"repeat_count": 0.0,
- "routers_loss": 0.05516733601689339,
+ "routers_loss": 0.054844800382852554,
"skip_count": 1.0,
"step": 632,
"text_loss": 0.6988179087638855
@@ -6021,13 +6021,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.357421875,
+ "grad_norm": 0.322265625,
"learning_rate": 0.0009995764061750086,
- "loss": 0.0785,
+ "loss": 0.0767,
"macro_f1": 0.3333333432674408,
"num_tokens": 1022207.0,
"repeat_count": 0.0,
- "routers_loss": 0.010254866443574429,
+ "routers_loss": 0.010095693171024323,
"skip_count": 0.0,
"step": 634,
"text_loss": 0.558451771736145
@@ -6040,13 +6040,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.2890625,
"learning_rate": 0.000999563572591721,
- "loss": 0.0518,
+ "loss": 0.0521,
"macro_f1": 0.32098764181137085,
"num_tokens": 1025319.0,
"repeat_count": 1.0,
- "routers_loss": 0.07528360933065414,
+ "routers_loss": 0.0698433518409729,
"skip_count": 1.0,
"step": 636,
"text_loss": 0.5961872935295105
@@ -6059,13 +6059,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1064453125,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009995505475747302,
- "loss": 0.0844,
+ "loss": 0.0849,
"macro_f1": 0.3272727429866791,
"num_tokens": 1028362.0,
"repeat_count": 0.0,
- "routers_loss": 0.04301584139466286,
+ "routers_loss": 0.040211405605077744,
"skip_count": 1.0,
"step": 638,
"text_loss": 0.546863317489624
@@ -6078,13 +6078,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11572265625,
+ "grad_norm": 0.119140625,
"learning_rate": 0.0009995373311290272,
- "loss": 0.0699,
+ "loss": 0.0709,
"macro_f1": 0.3144654333591461,
"num_tokens": 1032199.0,
"repeat_count": 2.0,
- "routers_loss": 0.14521080255508423,
+ "routers_loss": 0.1457643061876297,
"skip_count": 1.0,
"step": 640,
"text_loss": 0.2137298285961151
@@ -6097,13 +6097,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1328125,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.0009995239232596764,
- "loss": 0.0543,
+ "loss": 0.0545,
"macro_f1": 0.3333333432674408,
"num_tokens": 1035801.0,
"repeat_count": 0.0,
- "routers_loss": 0.01074797473847866,
+ "routers_loss": 0.011394930072128773,
"skip_count": 0.0,
"step": 642,
"text_loss": 0.43054503202438354
@@ -6116,13 +6116,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009995103239718163,
- "loss": 0.0659,
+ "loss": 0.0665,
"macro_f1": 0.3333333432674408,
"num_tokens": 1039223.0,
"repeat_count": 0.0,
- "routers_loss": 0.009271817281842232,
+ "routers_loss": 0.00997432041913271,
"skip_count": 0.0,
"step": 644,
"text_loss": 0.7749615907669067
@@ -6135,13 +6135,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009994965332706573,
- "loss": 0.0737,
+ "loss": 0.0755,
"macro_f1": 0.3144654333591461,
"num_tokens": 1042154.0,
"repeat_count": 3.0,
- "routers_loss": 0.10257050395011902,
+ "routers_loss": 0.10589150339365005,
"skip_count": 0.0,
"step": 646,
"text_loss": 0.7812211513519287
@@ -6154,13 +6154,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.0009994825511614846,
- "loss": 0.0363,
+ "loss": 0.0383,
"macro_f1": 0.3272727429866791,
"num_tokens": 1045250.0,
"repeat_count": 0.0,
- "routers_loss": 0.07091924548149109,
+ "routers_loss": 0.0748734176158905,
"skip_count": 1.0,
"step": 648,
"text_loss": 0.844803512096405
@@ -6173,13 +6173,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11572265625,
+ "grad_norm": 0.1220703125,
"learning_rate": 0.0009994683776496562,
- "loss": 0.0421,
+ "loss": 0.0433,
"macro_f1": 0.3272727429866791,
"num_tokens": 1048446.0,
"repeat_count": 0.0,
- "routers_loss": 0.034446243196725845,
+ "routers_loss": 0.03742415830492973,
"skip_count": 1.0,
"step": 650,
"text_loss": 0.2098839282989502
@@ -6192,13 +6192,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009994540127406034,
- "loss": 0.0593,
+ "loss": 0.0591,
"macro_f1": 0.32098764181137085,
"num_tokens": 1051840.0,
"repeat_count": 0.0,
- "routers_loss": 0.06077485531568527,
+ "routers_loss": 0.06025516986846924,
"skip_count": 2.0,
"step": 652,
"text_loss": 0.27727583050727844
@@ -6211,13 +6211,13 @@
"f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.181640625,
"learning_rate": 0.0009994394564398306,
- "loss": 0.0537,
+ "loss": 0.0519,
"macro_f1": 0.521541953086853,
"num_tokens": 1055142.0,
"repeat_count": 4.0,
- "routers_loss": 0.2382282167673111,
+ "routers_loss": 0.22807340323925018,
"skip_count": 2.0,
"step": 654,
"text_loss": 0.9672397971153259
@@ -6230,13 +6230,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.142578125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009994247087529158,
- "loss": 0.0613,
+ "loss": 0.0618,
"macro_f1": 0.3333333432674408,
"num_tokens": 1057698.0,
"repeat_count": 0.0,
- "routers_loss": 0.011971636675298214,
+ "routers_loss": 0.01348950993269682,
"skip_count": 0.0,
"step": 656,
"text_loss": 0.6375506520271301
@@ -6249,13 +6249,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.212890625,
+ "grad_norm": 0.1953125,
"learning_rate": 0.0009994097696855106,
- "loss": 0.0414,
+ "loss": 0.0412,
"macro_f1": 0.3333333432674408,
"num_tokens": 1060624.0,
"repeat_count": 0.0,
- "routers_loss": 0.010221127420663834,
+ "routers_loss": 0.009649243205785751,
"skip_count": 0.0,
"step": 658,
"text_loss": 0.5315385460853577
@@ -6268,13 +6268,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2265625,
+ "grad_norm": 0.2041015625,
"learning_rate": 0.0009993946392433395,
- "loss": 0.061,
+ "loss": 0.0609,
"macro_f1": 0.307692289352417,
"num_tokens": 1065076.0,
"repeat_count": 0.0,
- "routers_loss": 0.11860335618257523,
+ "routers_loss": 0.1250980943441391,
"skip_count": 3.0,
"step": 660,
"text_loss": 0.25780341029167175
@@ -6287,13 +6287,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009993793174322006,
- "loss": 0.0485,
+ "loss": 0.0471,
"macro_f1": 0.3333333432674408,
"num_tokens": 1068365.0,
"repeat_count": 0.0,
- "routers_loss": 0.011139829643070698,
+ "routers_loss": 0.011544390581548214,
"skip_count": 0.0,
"step": 662,
"text_loss": 0.34876301884651184
@@ -6306,13 +6306,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.166015625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009993638042579654,
- "loss": 0.0478,
+ "loss": 0.0473,
"macro_f1": 0.3272727429866791,
"num_tokens": 1071693.0,
"repeat_count": 0.0,
- "routers_loss": 0.03978770971298218,
+ "routers_loss": 0.03777370601892471,
"skip_count": 1.0,
"step": 664,
"text_loss": 0.21811571717262268
@@ -6327,11 +6327,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.203125,
"learning_rate": 0.0009993480997265783,
- "loss": 0.0481,
+ "loss": 0.0475,
"macro_f1": 0.5492662787437439,
"num_tokens": 1074733.0,
"repeat_count": 0.0,
- "routers_loss": 0.051231011748313904,
+ "routers_loss": 0.049949806183576584,
"skip_count": 2.0,
"step": 666,
"text_loss": 0.38410288095474243
@@ -6344,13 +6344,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.10302734375,
"learning_rate": 0.0009993322038440572,
- "loss": 0.0615,
+ "loss": 0.0605,
"macro_f1": 0.3333333432674408,
"num_tokens": 1077993.0,
"repeat_count": 0.0,
- "routers_loss": 0.024917088449001312,
+ "routers_loss": 0.0247171800583601,
"skip_count": 0.0,
"step": 668,
"text_loss": 0.25576895475387573
@@ -6363,13 +6363,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.216796875,
"learning_rate": 0.000999316116616494,
- "loss": 0.0627,
+ "loss": 0.0619,
"macro_f1": 0.3333333432674408,
"num_tokens": 1080491.0,
"repeat_count": 0.0,
- "routers_loss": 0.008834881708025932,
+ "routers_loss": 0.008118715137243271,
"skip_count": 0.0,
"step": 670,
"text_loss": 0.6269792914390564
@@ -6382,13 +6382,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009992998380500527,
"loss": 0.0462,
"macro_f1": 0.3272727429866791,
"num_tokens": 1083817.0,
"repeat_count": 0.0,
- "routers_loss": 0.033405229449272156,
+ "routers_loss": 0.03366057574748993,
"skip_count": 1.0,
"step": 672,
"text_loss": 0.26891493797302246
@@ -6401,13 +6401,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.13671875,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009992833681509716,
- "loss": 0.0523,
+ "loss": 0.0529,
"macro_f1": 0.3333333432674408,
"num_tokens": 1087368.0,
"repeat_count": 0.0,
- "routers_loss": 0.020753704011440277,
+ "routers_loss": 0.020552074536681175,
"skip_count": 0.0,
"step": 674,
"text_loss": 0.14421936869621277
@@ -6420,13 +6420,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.18359375,
"learning_rate": 0.0009992667069255619,
- "loss": 0.0698,
+ "loss": 0.0696,
"macro_f1": 0.31446540355682373,
"num_tokens": 1090452.0,
"repeat_count": 0.0,
- "routers_loss": 0.06932353973388672,
+ "routers_loss": 0.06937336176633835,
"skip_count": 2.0,
"step": 676,
"text_loss": 0.24999259412288666
@@ -6439,13 +6439,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.0009992498543802085,
- "loss": 0.059,
+ "loss": 0.0588,
"macro_f1": 0.3272727429866791,
"num_tokens": 1093996.0,
"repeat_count": 1.0,
- "routers_loss": 0.032903749495744705,
+ "routers_loss": 0.0380021296441555,
"skip_count": 0.0,
"step": 678,
"text_loss": 0.42473849654197693
@@ -6458,32 +6458,32 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.2099609375,
+ "grad_norm": 0.2119140625,
"learning_rate": 0.0009992328105213688,
- "loss": 0.0417,
+ "loss": 0.0411,
"macro_f1": 0.4400000274181366,
"num_tokens": 1096837.0,
"repeat_count": 1.0,
- "routers_loss": 0.19733747839927673,
+ "routers_loss": 0.20885063707828522,
"skip_count": 4.0,
"step": 680,
"text_loss": 0.3829527199268341
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.2019371881420606,
- "f1_execute": 1.0,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.154296875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009992155753555747,
- "loss": 0.0729,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0722,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1100320.0,
"repeat_count": 0.0,
- "routers_loss": 0.013452666811645031,
+ "routers_loss": 0.018230699002742767,
"skip_count": 2.0,
"step": 682,
"text_loss": 0.6190969944000244
@@ -6496,13 +6496,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.30859375,
"learning_rate": 0.0009991981488894303,
"loss": 0.0681,
"macro_f1": 0.32098767161369324,
"num_tokens": 1103682.0,
"repeat_count": 0.0,
- "routers_loss": 0.05302857980132103,
+ "routers_loss": 0.05550144240260124,
"skip_count": 1.0,
"step": 684,
"text_loss": 0.44418027997016907
@@ -6515,13 +6515,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.0009991805311296133,
- "loss": 0.0527,
+ "loss": 0.0507,
"macro_f1": 0.32098764181137085,
"num_tokens": 1106427.0,
"repeat_count": 0.0,
- "routers_loss": 0.08124994486570358,
+ "routers_loss": 0.07990608364343643,
"skip_count": 2.0,
"step": 686,
"text_loss": 0.5577231645584106
@@ -6534,13 +6534,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009991627220828753,
- "loss": 0.0579,
+ "loss": 0.0568,
"macro_f1": 0.32098764181137085,
"num_tokens": 1109314.0,
"repeat_count": 0.0,
- "routers_loss": 0.058633625507354736,
+ "routers_loss": 0.05167485028505325,
"skip_count": 2.0,
"step": 688,
"text_loss": 0.27325430512428284
@@ -6553,13 +6553,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009991447217560408,
- "loss": 0.0533,
+ "loss": 0.0521,
"macro_f1": 0.5492662787437439,
"num_tokens": 1112748.0,
"repeat_count": 0.0,
- "routers_loss": 0.04703643172979355,
+ "routers_loss": 0.04621964320540428,
"skip_count": 2.0,
"step": 690,
"text_loss": 0.5288321375846863
@@ -6572,13 +6572,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.000999126530156007,
- "loss": 0.0485,
+ "loss": 0.0499,
"macro_f1": 0.307692289352417,
"num_tokens": 1116965.0,
"repeat_count": 1.0,
- "routers_loss": 0.11615128815174103,
+ "routers_loss": 0.11950276792049408,
"skip_count": 2.0,
"step": 692,
"text_loss": 0.14215624332427979
@@ -6591,13 +6591,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2314453125,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.0009991081472897454,
- "loss": 0.0718,
+ "loss": 0.0722,
"macro_f1": 0.3333333432674408,
"num_tokens": 1120570.0,
"repeat_count": 0.0,
- "routers_loss": 0.017403846606612206,
+ "routers_loss": 0.01905500330030918,
"skip_count": 0.0,
"step": 694,
"text_loss": 0.41862696409225464
@@ -6610,13 +6610,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.0009990895731643002,
- "loss": 0.0444,
+ "loss": 0.0464,
"macro_f1": 0.3272727429866791,
"num_tokens": 1124009.0,
"repeat_count": 1.0,
- "routers_loss": 0.07067303359508514,
+ "routers_loss": 0.06974572688341141,
"skip_count": 0.0,
"step": 696,
"text_loss": 0.41160130500793457
@@ -6629,13 +6629,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.000999070807786789,
- "loss": 0.0527,
+ "loss": 0.0531,
"macro_f1": 0.3272727429866791,
"num_tokens": 1127370.0,
"repeat_count": 1.0,
- "routers_loss": 0.07131028175354004,
+ "routers_loss": 0.07055293023586273,
"skip_count": 0.0,
"step": 698,
"text_loss": 0.48068273067474365
@@ -6648,13 +6648,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.18359375,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000999051851164403,
- "loss": 0.0629,
+ "loss": 0.0619,
"macro_f1": 0.32098764181137085,
"num_tokens": 1130234.0,
"repeat_count": 1.0,
- "routers_loss": 0.1152748316526413,
+ "routers_loss": 0.12506946921348572,
"skip_count": 1.0,
"step": 700,
"text_loss": 0.47925490140914917
@@ -6667,13 +6667,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000999032703304406,
- "loss": 0.0663,
+ "loss": 0.0674,
"macro_f1": 0.3333333432674408,
"num_tokens": 1132874.0,
"repeat_count": 0.0,
- "routers_loss": 0.0077212234027683735,
+ "routers_loss": 0.00809287466108799,
"skip_count": 0.0,
"step": 702,
"text_loss": 0.47433632612228394
@@ -6686,13 +6686,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.1064453125,
"learning_rate": 0.0009990133642141358,
- "loss": 0.0494,
+ "loss": 0.0497,
"macro_f1": 0.5492662787437439,
"num_tokens": 1136011.0,
"repeat_count": 0.0,
- "routers_loss": 0.02726336568593979,
+ "routers_loss": 0.0319170281291008,
"skip_count": 2.0,
"step": 704,
"text_loss": 0.6574832201004028
@@ -6705,13 +6705,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.32421875,
+ "grad_norm": 0.33984375,
"learning_rate": 0.000998993833901003,
- "loss": 0.0615,
+ "loss": 0.0619,
"macro_f1": 0.32098764181137085,
"num_tokens": 1139674.0,
"repeat_count": 0.0,
- "routers_loss": 0.0958542674779892,
+ "routers_loss": 0.09850362688302994,
"skip_count": 2.0,
"step": 706,
"text_loss": 0.7660127282142639
@@ -6724,13 +6724,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009989741123724919,
- "loss": 0.0583,
+ "loss": 0.0574,
"macro_f1": 0.3333333432674408,
"num_tokens": 1143558.0,
"repeat_count": 0.0,
- "routers_loss": 0.007100600749254227,
+ "routers_loss": 0.006673311349004507,
"skip_count": 0.0,
"step": 708,
"text_loss": 0.5976111888885498
@@ -6743,13 +6743,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009989541996361594,
- "loss": 0.0445,
+ "loss": 0.045,
"macro_f1": 0.3333333432674408,
"num_tokens": 1146122.0,
"repeat_count": 0.0,
- "routers_loss": 0.0047812811098992825,
+ "routers_loss": 0.004988791421055794,
"skip_count": 0.0,
"step": 710,
"text_loss": 0.5256119966506958
@@ -6762,13 +6762,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009989340956996367,
- "loss": 0.052,
+ "loss": 0.0528,
"macro_f1": 0.3333333432674408,
"num_tokens": 1149546.0,
"repeat_count": 0.0,
- "routers_loss": 0.006643407512456179,
+ "routers_loss": 0.0067769973538815975,
"skip_count": 0.0,
"step": 712,
"text_loss": 0.5040497779846191
@@ -6781,13 +6781,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2890625,
+ "grad_norm": 0.26953125,
"learning_rate": 0.0009989138005706273,
- "loss": 0.0719,
+ "loss": 0.0735,
"macro_f1": 0.32098764181137085,
"num_tokens": 1153195.0,
"repeat_count": 0.0,
- "routers_loss": 0.0910436138510704,
+ "routers_loss": 0.09899546951055527,
"skip_count": 2.0,
"step": 714,
"text_loss": 0.20803412795066833
@@ -6800,13 +6800,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1484375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.000998893314256908,
- "loss": 0.0649,
+ "loss": 0.064,
"macro_f1": 0.3333333432674408,
"num_tokens": 1157081.0,
"repeat_count": 0.0,
- "routers_loss": 0.010978946462273598,
+ "routers_loss": 0.010492355562746525,
"skip_count": 0.0,
"step": 716,
"text_loss": 0.23077639937400818
@@ -6819,13 +6819,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009988726367663298,
- "loss": 0.0543,
+ "loss": 0.0539,
"macro_f1": 0.3333333432674408,
"num_tokens": 1160079.0,
"repeat_count": 0.0,
- "routers_loss": 0.009956461377441883,
+ "routers_loss": 0.01063773687928915,
"skip_count": 0.0,
"step": 718,
"text_loss": 0.6085864901542664
@@ -6838,13 +6838,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009988517681068163,
- "loss": 0.0412,
+ "loss": 0.0421,
"macro_f1": 0.3272727429866791,
"num_tokens": 1163249.0,
"repeat_count": 1.0,
- "routers_loss": 0.057210199534893036,
+ "routers_loss": 0.05981874838471413,
"skip_count": 0.0,
"step": 720,
"text_loss": 0.4047050476074219
@@ -6857,32 +6857,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009988307082863638,
- "loss": 0.0364,
+ "loss": 0.0361,
"macro_f1": 0.3333333432674408,
"num_tokens": 1166259.0,
"repeat_count": 0.0,
- "routers_loss": 0.01035996899008751,
+ "routers_loss": 0.009750043973326683,
"skip_count": 0.0,
"step": 722,
"text_loss": 0.5306474566459656
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.3991781626063986,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.240234375,
"learning_rate": 0.0009988094573130434,
- "loss": 0.0661,
- "macro_f1": 0.3076923191547394,
+ "loss": 0.063,
+ "macro_f1": 0.5359477400779724,
"num_tokens": 1168887.0,
"repeat_count": 2.0,
- "routers_loss": 0.18087820708751678,
+ "routers_loss": 0.18601104617118835,
"skip_count": 2.0,
"step": 724,
"text_loss": 0.53528892993927
@@ -6895,32 +6895,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009987880151949974,
- "loss": 0.0505,
+ "loss": 0.0496,
"macro_f1": 0.3272727429866791,
"num_tokens": 1172625.0,
"repeat_count": 0.0,
- "routers_loss": 0.04720238968729973,
+ "routers_loss": 0.02845010720193386,
"skip_count": 1.0,
"step": 726,
"text_loss": 0.4760453701019287
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.417963017317288,
- "f1_execute": 1.0,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.2216796875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.0009987663819404434,
- "loss": 0.0603,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.06,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1176580.0,
"repeat_count": 0.0,
- "routers_loss": 0.015407778322696686,
+ "routers_loss": 0.017596980556845665,
"skip_count": 2.0,
"step": 728,
"text_loss": 0.5146099328994751
@@ -6933,13 +6933,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.000998744557557671,
- "loss": 0.0489,
+ "loss": 0.0484,
"macro_f1": 0.3272727429866791,
"num_tokens": 1179804.0,
"repeat_count": 0.0,
- "routers_loss": 0.060891781002283096,
+ "routers_loss": 0.0625474750995636,
"skip_count": 1.0,
"step": 730,
"text_loss": 0.27738022804260254
@@ -6947,18 +6947,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.436747872028177,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.203125,
"learning_rate": 0.0009987225420550433,
- "loss": 0.0825,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0796,
+ "macro_f1": 0.307692289352417,
"num_tokens": 1182658.0,
"repeat_count": 1.0,
- "routers_loss": 0.1661442220211029,
+ "routers_loss": 0.16188351809978485,
"skip_count": 2.0,
"step": 732,
"text_loss": 0.23231445252895355
@@ -6966,18 +6966,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.446140299383622,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.2001953125,
"learning_rate": 0.0009987003354409965,
- "loss": 0.0634,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0626,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1185451.0,
"repeat_count": 0.0,
- "routers_loss": 0.02108248695731163,
+ "routers_loss": 0.02391529455780983,
"skip_count": 0.0,
"step": 734,
"text_loss": 0.4496627151966095
@@ -6990,13 +6990,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.248046875,
+ "grad_norm": 0.234375,
"learning_rate": 0.0009986779377240405,
- "loss": 0.0534,
+ "loss": 0.0513,
"macro_f1": 0.32098767161369324,
"num_tokens": 1188666.0,
"repeat_count": 0.0,
- "routers_loss": 0.08318125456571579,
+ "routers_loss": 0.08435963839292526,
"skip_count": 1.0,
"step": 736,
"text_loss": 0.4950787127017975
@@ -7009,13 +7009,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11962890625,
+ "grad_norm": 0.1220703125,
"learning_rate": 0.000998655348912758,
- "loss": 0.0514,
+ "loss": 0.0515,
"macro_f1": 0.3333333432674408,
"num_tokens": 1193035.0,
"repeat_count": 0.0,
- "routers_loss": 0.015889234840869904,
+ "routers_loss": 0.01648722216486931,
"skip_count": 0.0,
"step": 738,
"text_loss": 0.24761848151683807
@@ -7028,13 +7028,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1513671875,
"learning_rate": 0.0009986325690158051,
"loss": 0.0435,
"macro_f1": 0.3333333432674408,
"num_tokens": 1196840.0,
"repeat_count": 0.0,
- "routers_loss": 0.01378484908491373,
+ "routers_loss": 0.013143910095095634,
"skip_count": 0.0,
"step": 740,
"text_loss": 0.15662719309329987
@@ -7047,13 +7047,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1787109375,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009986095980419113,
- "loss": 0.076,
+ "loss": 0.0757,
"macro_f1": 0.3333333432674408,
"num_tokens": 1200573.0,
"repeat_count": 0.0,
- "routers_loss": 0.02673683874309063,
+ "routers_loss": 0.026706280186772346,
"skip_count": 0.0,
"step": 742,
"text_loss": 0.16725164651870728
@@ -7066,13 +7066,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.185546875,
+ "grad_norm": 0.1982421875,
"learning_rate": 0.0009985864359998787,
- "loss": 0.0778,
+ "loss": 0.0795,
"macro_f1": 0.3006536364555359,
"num_tokens": 1203589.0,
"repeat_count": 2.0,
- "routers_loss": 0.27776041626930237,
+ "routers_loss": 0.28607678413391113,
"skip_count": 3.0,
"step": 744,
"text_loss": 0.6350882053375244
@@ -7085,13 +7085,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009985630828985835,
- "loss": 0.0575,
+ "loss": 0.0572,
"macro_f1": 0.3272727429866791,
"num_tokens": 1206422.0,
"repeat_count": 0.0,
- "routers_loss": 0.0575483962893486,
+ "routers_loss": 0.05685260891914368,
"skip_count": 1.0,
"step": 746,
"text_loss": 0.33779552578926086
@@ -7104,13 +7104,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009985395387469742,
- "loss": 0.0478,
+ "loss": 0.0458,
"macro_f1": 0.5492662787437439,
"num_tokens": 1211588.0,
"repeat_count": 0.0,
- "routers_loss": 0.0458797849714756,
+ "routers_loss": 0.0437830351293087,
"skip_count": 2.0,
"step": 748,
"text_loss": 0.28664472699165344
@@ -7123,13 +7123,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.15625,
"learning_rate": 0.0009985158035540735,
- "loss": 0.0701,
+ "loss": 0.0714,
"macro_f1": 0.32098764181137085,
"num_tokens": 1214580.0,
"repeat_count": 2.0,
- "routers_loss": 0.07850238680839539,
+ "routers_loss": 0.07074898481369019,
"skip_count": 0.0,
"step": 750,
"text_loss": 0.3939313292503357
@@ -7142,13 +7142,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.21484375,
"learning_rate": 0.0009984918773289762,
- "loss": 0.0702,
+ "loss": 0.0699,
"macro_f1": 0.3333333432674408,
"num_tokens": 1217388.0,
"repeat_count": 0.0,
- "routers_loss": 0.009507967159152031,
+ "routers_loss": 0.009757856838405132,
"skip_count": 0.0,
"step": 752,
"text_loss": 0.37641215324401855
@@ -7161,13 +7161,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1484375,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009984677600808512,
- "loss": 0.0543,
+ "loss": 0.054,
"macro_f1": 0.3333333432674408,
"num_tokens": 1219960.0,
"repeat_count": 0.0,
- "routers_loss": 0.02620997279882431,
+ "routers_loss": 0.02515069581568241,
"skip_count": 0.0,
"step": 754,
"text_loss": 0.155938982963562
@@ -7180,13 +7180,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3359375,
+ "grad_norm": 0.30078125,
"learning_rate": 0.0009984434518189405,
- "loss": 0.0791,
+ "loss": 0.0764,
"macro_f1": 0.3333333432674408,
"num_tokens": 1223234.0,
"repeat_count": 0.0,
- "routers_loss": 0.02798631228506565,
+ "routers_loss": 0.025766927748918533,
"skip_count": 0.0,
"step": 756,
"text_loss": 0.691118061542511
@@ -7201,11 +7201,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1416015625,
"learning_rate": 0.0009984189525525584,
- "loss": 0.046,
+ "loss": 0.0451,
"macro_f1": 0.5359477400779724,
"num_tokens": 1225764.0,
"repeat_count": 2.0,
- "routers_loss": 0.16614431142807007,
+ "routers_loss": 0.1782722771167755,
"skip_count": 2.0,
"step": 758,
"text_loss": 0.3592209219932556
@@ -7218,13 +7218,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.193359375,
+ "grad_norm": 0.189453125,
"learning_rate": 0.0009983942622910935,
- "loss": 0.0669,
+ "loss": 0.0659,
"macro_f1": 0.3333333432674408,
"num_tokens": 1230097.0,
"repeat_count": 0.0,
- "routers_loss": 0.008541896007955074,
+ "routers_loss": 0.00825568474829197,
"skip_count": 0.0,
"step": 760,
"text_loss": 0.4646475315093994
@@ -7237,13 +7237,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009983693810440074,
- "loss": 0.0478,
+ "loss": 0.0477,
"macro_f1": 0.32098764181137085,
"num_tokens": 1233140.0,
"repeat_count": 0.0,
- "routers_loss": 0.045411624014377594,
+ "routers_loss": 0.04156976938247681,
"skip_count": 2.0,
"step": 762,
"text_loss": 0.298682302236557
@@ -7256,13 +7256,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.380859375,
+ "grad_norm": 0.3515625,
"learning_rate": 0.000998344308820834,
- "loss": 0.0689,
+ "loss": 0.0666,
"macro_f1": 0.3272727429866791,
"num_tokens": 1236305.0,
"repeat_count": 0.0,
- "routers_loss": 0.052299100905656815,
+ "routers_loss": 0.05697929114103317,
"skip_count": 1.0,
"step": 764,
"text_loss": 0.5249121189117432
@@ -7275,13 +7275,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.18359375,
"learning_rate": 0.0009983190456311817,
- "loss": 0.0602,
+ "loss": 0.0592,
"macro_f1": 0.3144654333591461,
"num_tokens": 1239673.0,
"repeat_count": 0.0,
- "routers_loss": 0.09140212833881378,
+ "routers_loss": 0.09547408670186996,
"skip_count": 3.0,
"step": 766,
"text_loss": 0.41277334094047546
@@ -7294,13 +7294,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.201171875,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000998293591484731,
- "loss": 0.0475,
+ "loss": 0.0484,
"macro_f1": 0.5492662787437439,
"num_tokens": 1242292.0,
"repeat_count": 0.0,
- "routers_loss": 0.030750583857297897,
+ "routers_loss": 0.030693158507347107,
"skip_count": 2.0,
"step": 768,
"text_loss": 0.1583656519651413
@@ -7313,13 +7313,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000998267946391236,
- "loss": 0.052,
+ "loss": 0.051,
"macro_f1": 0.3333333432674408,
"num_tokens": 1244661.0,
"repeat_count": 0.0,
- "routers_loss": 0.010202950797975063,
+ "routers_loss": 0.01211300864815712,
"skip_count": 0.0,
"step": 770,
"text_loss": 0.4629349112510681
@@ -7332,13 +7332,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0009982421103605238,
- "loss": 0.0434,
+ "loss": 0.0441,
"macro_f1": 0.32098764181137085,
"num_tokens": 1248688.0,
"repeat_count": 0.0,
- "routers_loss": 0.07364192605018616,
+ "routers_loss": 0.0665968507528305,
"skip_count": 2.0,
"step": 772,
"text_loss": 0.4019293785095215
@@ -7353,11 +7353,11 @@
"f1_skip": 0.0,
"grad_norm": 0.2890625,
"learning_rate": 0.000998216083402495,
- "loss": 0.0606,
+ "loss": 0.0613,
"macro_f1": 0.32098764181137085,
"num_tokens": 1251395.0,
"repeat_count": 0.0,
- "routers_loss": 0.06553081423044205,
+ "routers_loss": 0.07186859846115112,
"skip_count": 2.0,
"step": 774,
"text_loss": 0.4659276604652405
@@ -7370,13 +7370,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.302734375,
"learning_rate": 0.0009981898655271235,
- "loss": 0.0475,
+ "loss": 0.0488,
"macro_f1": 0.3333333432674408,
"num_tokens": 1254888.0,
"repeat_count": 0.0,
- "routers_loss": 0.008751659654080868,
+ "routers_loss": 0.007823926396667957,
"skip_count": 0.0,
"step": 776,
"text_loss": 0.5160359740257263
@@ -7389,13 +7389,13 @@
"f1_execute": 0.9130434989929199,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.11962890625,
"learning_rate": 0.0009981634567444557,
- "loss": 0.0777,
+ "loss": 0.0775,
"macro_f1": 0.590062141418457,
"num_tokens": 1258250.0,
"repeat_count": 3.0,
- "routers_loss": 0.24522721767425537,
+ "routers_loss": 0.24624499678611755,
"skip_count": 4.0,
"step": 778,
"text_loss": 0.29319918155670166
@@ -7408,13 +7408,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.0009981368570646115,
"loss": 0.0885,
"macro_f1": 0.3272727429866791,
"num_tokens": 1260916.0,
"repeat_count": 0.0,
- "routers_loss": 0.03767623379826546,
+ "routers_loss": 0.030730176717042923,
"skip_count": 1.0,
"step": 780,
"text_loss": 0.624981164932251
@@ -7427,13 +7427,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009981100664977838,
- "loss": 0.0708,
+ "loss": 0.0699,
"macro_f1": 0.3333333432674408,
"num_tokens": 1264004.0,
"repeat_count": 0.0,
- "routers_loss": 0.006098059006035328,
+ "routers_loss": 0.006829176563769579,
"skip_count": 0.0,
"step": 782,
"text_loss": 0.6137266159057617
@@ -7446,13 +7446,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009980830850542391,
- "loss": 0.0589,
+ "loss": 0.058,
"macro_f1": 0.3333333432674408,
"num_tokens": 1267130.0,
"repeat_count": 0.0,
- "routers_loss": 0.01731623336672783,
+ "routers_loss": 0.018471000716090202,
"skip_count": 0.0,
"step": 784,
"text_loss": 0.15213175117969513
@@ -7465,13 +7465,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.0009980559127443166,
- "loss": 0.0526,
+ "loss": 0.052,
"macro_f1": 0.3333333432674408,
"num_tokens": 1271129.0,
"repeat_count": 0.0,
- "routers_loss": 0.0076471962966024876,
+ "routers_loss": 0.007903140969574451,
"skip_count": 0.0,
"step": 786,
"text_loss": 0.5768613219261169
@@ -7484,13 +7484,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.130859375,
"learning_rate": 0.000998028549578429,
- "loss": 0.0745,
+ "loss": 0.0719,
"macro_f1": 0.307692289352417,
"num_tokens": 1274232.0,
"repeat_count": 0.0,
- "routers_loss": 0.0637628585100174,
+ "routers_loss": 0.06737866252660751,
"skip_count": 3.0,
"step": 788,
"text_loss": 0.2877073585987091
@@ -7503,13 +7503,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009980009955670615,
- "loss": 0.0699,
+ "loss": 0.0698,
"macro_f1": 0.3144654333591461,
"num_tokens": 1277193.0,
"repeat_count": 0.0,
- "routers_loss": 0.10882514715194702,
+ "routers_loss": 0.10194934904575348,
"skip_count": 3.0,
"step": 790,
"text_loss": 0.11860492825508118
@@ -7522,13 +7522,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.126953125,
"learning_rate": 0.000997973250720773,
- "loss": 0.056,
+ "loss": 0.0552,
"macro_f1": 0.32098764181137085,
"num_tokens": 1280960.0,
"repeat_count": 0.0,
- "routers_loss": 0.10924118757247925,
+ "routers_loss": 0.10297708213329315,
"skip_count": 2.0,
"step": 792,
"text_loss": 0.13477706909179688
@@ -7541,13 +7541,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009979453150501954,
- "loss": 0.0664,
+ "loss": 0.0663,
"macro_f1": 0.32098764181137085,
"num_tokens": 1284611.0,
"repeat_count": 1.0,
- "routers_loss": 0.06571807712316513,
+ "routers_loss": 0.06122037023305893,
"skip_count": 1.0,
"step": 794,
"text_loss": 0.40569379925727844
@@ -7560,13 +7560,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1181640625,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.000997917188566034,
- "loss": 0.0616,
+ "loss": 0.062,
"macro_f1": 0.32098764181137085,
"num_tokens": 1287834.0,
"repeat_count": 0.0,
- "routers_loss": 0.058966971933841705,
+ "routers_loss": 0.061135001480579376,
"skip_count": 2.0,
"step": 796,
"text_loss": 0.2829287648200989
@@ -7579,32 +7579,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.109375,
"learning_rate": 0.0009978888712790664,
- "loss": 0.067,
+ "loss": 0.0654,
"macro_f1": 0.3272727429866791,
"num_tokens": 1291666.0,
"repeat_count": 0.0,
- "routers_loss": 0.04844636470079422,
+ "routers_loss": 0.04841872677206993,
"skip_count": 1.0,
"step": 798,
"text_loss": 1.011757254600525
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.4000000059604645,
- "avg_layers": 26.0,
+ "acc_skip": 0.20000000298023224,
+ "avg_layers": 27.0,
"epoch": 3.756090402113296,
- "f1_execute": 0.9166666865348816,
+ "f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
- "f1_skip": 0.5714285969734192,
- "grad_norm": 0.1416015625,
+ "f1_skip": 0.3333333134651184,
+ "grad_norm": 0.14453125,
"learning_rate": 0.0009978603632001444,
- "loss": 0.0634,
- "macro_f1": 0.4960317611694336,
+ "loss": 0.0636,
+ "macro_f1": 0.4104308485984802,
"num_tokens": 1294627.0,
"repeat_count": 1.0,
- "routers_loss": 0.1591777801513672,
+ "routers_loss": 0.15698759257793427,
"skip_count": 5.0,
"step": 800,
"text_loss": 0.4457623362541199
@@ -7617,13 +7617,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0009978316643401916,
- "loss": 0.0694,
+ "loss": 0.0688,
"macro_f1": 0.3333333432674408,
"num_tokens": 1297711.0,
"repeat_count": 0.0,
- "routers_loss": 0.017735568806529045,
+ "routers_loss": 0.018952010199427605,
"skip_count": 0.0,
"step": 802,
"text_loss": 0.2069481462240219
@@ -7636,13 +7636,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.14453125,
"learning_rate": 0.0009978027747102062,
- "loss": 0.0477,
+ "loss": 0.0479,
"macro_f1": 0.3333333432674408,
"num_tokens": 1300569.0,
"repeat_count": 0.0,
- "routers_loss": 0.012401525862514973,
+ "routers_loss": 0.014538386836647987,
"skip_count": 0.0,
"step": 804,
"text_loss": 0.4983852505683899
@@ -7655,13 +7655,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2080078125,
+ "grad_norm": 0.2109375,
"learning_rate": 0.0009977736943212584,
- "loss": 0.0735,
+ "loss": 0.0721,
"macro_f1": 0.32098764181137085,
"num_tokens": 1303969.0,
"repeat_count": 0.0,
- "routers_loss": 0.10736164450645447,
+ "routers_loss": 0.11164087057113647,
"skip_count": 2.0,
"step": 806,
"text_loss": 0.2910642921924591
@@ -7674,13 +7674,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2001953125,
+ "grad_norm": 0.1826171875,
"learning_rate": 0.000997744423184492,
- "loss": 0.0428,
+ "loss": 0.0424,
"macro_f1": 0.3272727429866791,
"num_tokens": 1307263.0,
"repeat_count": 0.0,
- "routers_loss": 0.0595436617732048,
+ "routers_loss": 0.06073406711220741,
"skip_count": 1.0,
"step": 808,
"text_loss": 0.18831779062747955
@@ -7693,13 +7693,13 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.26171875,
"learning_rate": 0.0009977149613111236,
- "loss": 0.0494,
+ "loss": 0.0486,
"macro_f1": 0.4400000274181366,
"num_tokens": 1309953.0,
"repeat_count": 1.0,
- "routers_loss": 0.12617000937461853,
+ "routers_loss": 0.11035524308681488,
"skip_count": 4.0,
"step": 810,
"text_loss": 0.7872759699821472
@@ -7712,13 +7712,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009976853087124433,
- "loss": 0.0537,
+ "loss": 0.0536,
"macro_f1": 0.3333333432674408,
"num_tokens": 1313243.0,
"repeat_count": 0.0,
- "routers_loss": 0.021242506802082062,
+ "routers_loss": 0.021804286167025566,
"skip_count": 0.0,
"step": 812,
"text_loss": 0.22349292039871216
@@ -7731,13 +7731,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.318359375,
+ "grad_norm": 0.28125,
"learning_rate": 0.0009976554653998138,
- "loss": 0.0617,
+ "loss": 0.0612,
"macro_f1": 0.31446540355682373,
"num_tokens": 1316165.0,
"repeat_count": 0.0,
- "routers_loss": 0.10387415438890457,
+ "routers_loss": 0.10715524107217789,
"skip_count": 2.0,
"step": 814,
"text_loss": 0.18035532534122467
@@ -7750,13 +7750,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.000997625431384671,
- "loss": 0.0565,
+ "loss": 0.0564,
"macro_f1": 0.3333333432674408,
"num_tokens": 1319206.0,
"repeat_count": 0.0,
- "routers_loss": 0.007816939614713192,
+ "routers_loss": 0.007173649035394192,
"skip_count": 0.0,
"step": 816,
"text_loss": 0.48928648233413696
@@ -7769,13 +7769,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.0009975952066785243,
- "loss": 0.0654,
+ "loss": 0.0655,
"macro_f1": 0.3006536364555359,
"num_tokens": 1322549.0,
"repeat_count": 1.0,
- "routers_loss": 0.22526368498802185,
+ "routers_loss": 0.22308112680912018,
"skip_count": 4.0,
"step": 818,
"text_loss": 0.5211259722709656
@@ -7788,13 +7788,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.0009975647912929557,
- "loss": 0.056,
+ "loss": 0.0564,
"macro_f1": 0.3333333432674408,
"num_tokens": 1325213.0,
"repeat_count": 0.0,
- "routers_loss": 0.010998851619660854,
+ "routers_loss": 0.00998698640614748,
"skip_count": 0.0,
"step": 820,
"text_loss": 0.7117052674293518
@@ -7807,13 +7807,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009975341852396205,
- "loss": 0.0712,
+ "loss": 0.0723,
"macro_f1": 0.32098764181137085,
"num_tokens": 1328383.0,
"repeat_count": 0.0,
- "routers_loss": 0.07115054875612259,
+ "routers_loss": 0.07454588264226913,
"skip_count": 2.0,
"step": 822,
"text_loss": 0.34539610147476196
@@ -7826,13 +7826,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009975033885302469,
- "loss": 0.0611,
+ "loss": 0.0604,
"macro_f1": 0.3333333432674408,
"num_tokens": 1331406.0,
"repeat_count": 0.0,
- "routers_loss": 0.008062695153057575,
+ "routers_loss": 0.009157589636743069,
"skip_count": 0.0,
"step": 824,
"text_loss": 0.7484824657440186
@@ -7845,13 +7845,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.0009974724011766363,
- "loss": 0.0496,
+ "loss": 0.0474,
"macro_f1": 0.3272727429866791,
"num_tokens": 1334410.0,
"repeat_count": 1.0,
- "routers_loss": 0.16666285693645477,
+ "routers_loss": 0.17149391770362854,
"skip_count": 0.0,
"step": 826,
"text_loss": 0.5913820266723633
@@ -7864,13 +7864,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1708984375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009974412231906632,
- "loss": 0.0567,
+ "loss": 0.058,
"macro_f1": 0.32098764181137085,
"num_tokens": 1337653.0,
"repeat_count": 1.0,
- "routers_loss": 0.0908689796924591,
+ "routers_loss": 0.09743282198905945,
"skip_count": 1.0,
"step": 828,
"text_loss": 0.2505693733692169
@@ -7883,13 +7883,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16015625,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009974098545842748,
- "loss": 0.0648,
+ "loss": 0.0638,
"macro_f1": 0.3272727429866791,
"num_tokens": 1340860.0,
"repeat_count": 0.0,
- "routers_loss": 0.04364728182554245,
+ "routers_loss": 0.041490405797958374,
"skip_count": 1.0,
"step": 830,
"text_loss": 0.5585370063781738
@@ -7897,18 +7897,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.906369239800411,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2060546875,
+ "grad_norm": 0.193359375,
"learning_rate": 0.0009973782953694918,
- "loss": 0.0772,
- "macro_f1": 0.3076923191547394,
+ "loss": 0.0746,
+ "macro_f1": 0.3006536066532135,
"num_tokens": 1344232.0,
"repeat_count": 1.0,
- "routers_loss": 0.15315109491348267,
+ "routers_loss": 0.16080693900585175,
"skip_count": 3.0,
"step": 832,
"text_loss": 0.4782734513282776
@@ -7921,13 +7921,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.000997346545558408,
- "loss": 0.0527,
+ "loss": 0.0522,
"macro_f1": 0.3333333432674408,
"num_tokens": 1347667.0,
"repeat_count": 0.0,
- "routers_loss": 0.01342768594622612,
+ "routers_loss": 0.01173500344157219,
"skip_count": 0.0,
"step": 834,
"text_loss": 0.25036177039146423
@@ -7940,13 +7940,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1748046875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009973146051631895,
- "loss": 0.0513,
+ "loss": 0.0522,
"macro_f1": 0.3333333432674408,
"num_tokens": 1350707.0,
"repeat_count": 0.0,
- "routers_loss": 0.01158806961029768,
+ "routers_loss": 0.011477196589112282,
"skip_count": 0.0,
"step": 836,
"text_loss": 0.5482863187789917
@@ -7959,13 +7959,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009972824741960764,
- "loss": 0.0549,
+ "loss": 0.0536,
"macro_f1": 0.3333333432674408,
"num_tokens": 1353704.0,
"repeat_count": 0.0,
- "routers_loss": 0.01255605649203062,
+ "routers_loss": 0.010528896935284138,
"skip_count": 0.0,
"step": 838,
"text_loss": 0.6732596158981323
@@ -7978,13 +7978,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12255859375,
+ "grad_norm": 0.1181640625,
"learning_rate": 0.000997250152669381,
- "loss": 0.0578,
+ "loss": 0.0573,
"macro_f1": 0.3333333432674408,
"num_tokens": 1356608.0,
"repeat_count": 0.0,
- "routers_loss": 0.010225459933280945,
+ "routers_loss": 0.010678744874894619,
"skip_count": 0.0,
"step": 840,
"text_loss": 0.5479338765144348
@@ -7997,13 +7997,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.181640625,
"learning_rate": 0.000997217640595489,
- "loss": 0.0633,
+ "loss": 0.0631,
"macro_f1": 0.3333333432674408,
"num_tokens": 1359809.0,
"repeat_count": 0.0,
- "routers_loss": 0.007837744429707527,
+ "routers_loss": 0.00835978239774704,
"skip_count": 0.0,
"step": 842,
"text_loss": 0.42543259263038635
@@ -8016,13 +8016,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.0009971849379868593,
- "loss": 0.0674,
+ "loss": 0.0653,
"macro_f1": 0.3333333432674408,
"num_tokens": 1362201.0,
"repeat_count": 0.0,
- "routers_loss": 0.008631376549601555,
+ "routers_loss": 0.009930923581123352,
"skip_count": 0.0,
"step": 844,
"text_loss": 0.720462441444397
@@ -8035,13 +8035,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009971520448560235,
- "loss": 0.0612,
+ "loss": 0.0615,
"macro_f1": 0.3272727429866791,
"num_tokens": 1365790.0,
"repeat_count": 0.0,
- "routers_loss": 0.06206027418375015,
+ "routers_loss": 0.06344373524188995,
"skip_count": 1.0,
"step": 846,
"text_loss": 0.8423607349395752
@@ -8049,18 +8049,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 3.9815086586439685,
- "f1_execute": 0.9411765336990356,
+ "f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
- "f1_skip": 0.5,
- "grad_norm": 0.16015625,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.16796875,
"learning_rate": 0.000997118961215586,
- "loss": 0.0678,
- "macro_f1": 0.480392187833786,
+ "loss": 0.0674,
+ "macro_f1": 0.4533333480358124,
"num_tokens": 1368387.0,
"repeat_count": 1.0,
- "routers_loss": 0.1463794708251953,
+ "routers_loss": 0.14688406884670258,
"skip_count": 3.0,
"step": 848,
"text_loss": 0.3933577537536621
@@ -8073,13 +8073,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000997085687078225,
- "loss": 0.052,
+ "loss": 0.0518,
"macro_f1": 0.3333333432674408,
"num_tokens": 1371189.0,
"repeat_count": 0.0,
- "routers_loss": 0.01140492781996727,
+ "routers_loss": 0.009953443892300129,
"skip_count": 0.0,
"step": 850,
"text_loss": 0.41469162702560425
@@ -8092,13 +8092,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.15625,
"learning_rate": 0.0009970522224566909,
- "loss": 0.0563,
+ "loss": 0.0555,
"macro_f1": 0.32098767161369324,
"num_tokens": 1374008.0,
"repeat_count": 0.0,
- "routers_loss": 0.05136030167341232,
+ "routers_loss": 0.048870690166950226,
"skip_count": 1.0,
"step": 852,
"text_loss": 0.613615870475769
@@ -8111,32 +8111,32 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0009970185673638075,
- "loss": 0.0627,
+ "loss": 0.0629,
"macro_f1": 0.32098764181137085,
"num_tokens": 1376662.0,
"repeat_count": 1.0,
- "routers_loss": 0.07274381071329117,
+ "routers_loss": 0.06865929812192917,
"skip_count": 1.0,
"step": 854,
"text_loss": 0.4392736256122589
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 4.01878485471089,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.162109375,
"learning_rate": 0.0009969847218124716,
- "loss": 0.0503,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0506,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1380049.0,
"repeat_count": 0.0,
- "routers_loss": 0.024335317313671112,
+ "routers_loss": 0.02382219396531582,
"skip_count": 1.0,
"step": 856,
"text_loss": 0.19115346670150757
@@ -8149,13 +8149,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009969506858156527,
- "loss": 0.0359,
+ "loss": 0.0344,
"macro_f1": 0.3272727429866791,
"num_tokens": 1383008.0,
"repeat_count": 0.0,
- "routers_loss": 0.046614740043878555,
+ "routers_loss": 0.03907281160354614,
"skip_count": 1.0,
"step": 858,
"text_loss": 0.34842637181282043
@@ -8168,13 +8168,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.12060546875,
"learning_rate": 0.0009969164593863935,
- "loss": 0.0372,
+ "loss": 0.0365,
"macro_f1": 0.3333333432674408,
"num_tokens": 1387051.0,
"repeat_count": 0.0,
- "routers_loss": 0.006380240898579359,
+ "routers_loss": 0.007645803038030863,
"skip_count": 0.0,
"step": 860,
"text_loss": 0.3810436725616455
@@ -8187,13 +8187,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009968820425378098,
- "loss": 0.0473,
+ "loss": 0.0463,
"macro_f1": 0.3272727429866791,
"num_tokens": 1390244.0,
"repeat_count": 1.0,
- "routers_loss": 0.04770716652274132,
+ "routers_loss": 0.04435238987207413,
"skip_count": 0.0,
"step": 862,
"text_loss": 0.34853485226631165
@@ -8206,32 +8206,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3359375,
+ "grad_norm": 0.28515625,
"learning_rate": 0.00099684743528309,
- "loss": 0.0434,
+ "loss": 0.0424,
"macro_f1": 0.3333333432674408,
"num_tokens": 1392976.0,
"repeat_count": 0.0,
- "routers_loss": 0.006983708590269089,
+ "routers_loss": 0.006071661598980427,
"skip_count": 0.0,
"step": 864,
"text_loss": 0.6395178437232971
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 4.065746991488113,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.080078125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.0009968126376354958,
- "loss": 0.0476,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0477,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1396061.0,
"repeat_count": 0.0,
- "routers_loss": 0.046313900500535965,
+ "routers_loss": 0.05011235550045967,
"skip_count": 2.0,
"step": 866,
"text_loss": 0.09103966504335403
@@ -8244,32 +8244,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009967776496083616,
"loss": 0.0509,
"macro_f1": 0.3272727429866791,
"num_tokens": 1398993.0,
"repeat_count": 1.0,
- "routers_loss": 0.0401870422065258,
+ "routers_loss": 0.03979124873876572,
"skip_count": 0.0,
"step": 868,
"text_loss": 0.27257058024406433
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 4.084531846199002,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.14453125,
"learning_rate": 0.000996742471215095,
- "loss": 0.0505,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0516,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1402080.0,
"repeat_count": 0.0,
- "routers_loss": 0.03313451260328293,
+ "routers_loss": 0.030823837965726852,
"skip_count": 2.0,
"step": 870,
"text_loss": 0.7047103047370911
@@ -8282,13 +8282,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009967071024691763,
- "loss": 0.0468,
+ "loss": 0.0461,
"macro_f1": 0.3333333432674408,
"num_tokens": 1404890.0,
"repeat_count": 0.0,
- "routers_loss": 0.010118982754647732,
+ "routers_loss": 0.009721715934574604,
"skip_count": 0.0,
"step": 872,
"text_loss": 0.959106981754303
@@ -8301,13 +8301,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.000996671543384159,
- "loss": 0.0498,
+ "loss": 0.05,
"macro_f1": 0.3333333432674408,
"num_tokens": 1407853.0,
"repeat_count": 0.0,
- "routers_loss": 0.005856200121343136,
+ "routers_loss": 0.006025883834809065,
"skip_count": 0.0,
"step": 874,
"text_loss": 0.47571972012519836
@@ -8320,13 +8320,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.09765625,
"learning_rate": 0.0009966357939736692,
- "loss": 0.0417,
+ "loss": 0.0416,
"macro_f1": 0.3272727429866791,
"num_tokens": 1410723.0,
"repeat_count": 0.0,
- "routers_loss": 0.02768322452902794,
+ "routers_loss": 0.025964925065636635,
"skip_count": 0.0,
"step": 876,
"text_loss": 0.4964611530303955
@@ -8339,13 +8339,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1025390625,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009965998542514065,
- "loss": 0.0419,
+ "loss": 0.0415,
"macro_f1": 0.32098764181137085,
"num_tokens": 1414008.0,
"repeat_count": 0.0,
- "routers_loss": 0.09382032603025436,
+ "routers_loss": 0.09509637206792831,
"skip_count": 2.0,
"step": 878,
"text_loss": 0.621494710445404
@@ -8358,32 +8358,32 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009965637242311427,
- "loss": 0.0466,
+ "loss": 0.0472,
"macro_f1": 0.542222261428833,
"num_tokens": 1417447.0,
"repeat_count": 0.0,
- "routers_loss": 0.026867631822824478,
+ "routers_loss": 0.02520318515598774,
"skip_count": 4.0,
"step": 880,
"text_loss": 0.40209758281707764
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6666666865348816,
- "avg_layers": 24.0,
+ "acc_skip": 0.5,
+ "avg_layers": 25.0,
"epoch": 4.14088641033167,
- "f1_execute": 0.95652174949646,
+ "f1_execute": 0.936170220375061,
"f1_repeat": 0.0,
- "f1_skip": 0.800000011920929,
- "grad_norm": 0.26171875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000996527403926723,
- "loss": 0.0496,
- "macro_f1": 0.5855072736740112,
+ "loss": 0.0495,
+ "macro_f1": 0.5342789888381958,
"num_tokens": 1419905.0,
"repeat_count": 0.0,
- "routers_loss": 0.12731307744979858,
+ "routers_loss": 0.13183781504631042,
"skip_count": 6.0,
"step": 882,
"text_loss": 0.642185389995575
@@ -8396,13 +8396,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009964908933520655,
- "loss": 0.039,
+ "loss": 0.0375,
"macro_f1": 0.3333333432674408,
"num_tokens": 1423436.0,
"repeat_count": 0.0,
- "routers_loss": 0.008483970537781715,
+ "routers_loss": 0.009429510682821274,
"skip_count": 0.0,
"step": 884,
"text_loss": 0.48232755064964294
@@ -8415,13 +8415,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.18359375,
+ "grad_norm": 0.1669921875,
"learning_rate": 0.0009964541925211613,
- "loss": 0.0348,
+ "loss": 0.0349,
"macro_f1": 0.32098764181137085,
"num_tokens": 1426842.0,
"repeat_count": 0.0,
- "routers_loss": 0.07847871631383896,
+ "routers_loss": 0.07629609107971191,
"skip_count": 2.0,
"step": 886,
"text_loss": 0.16620934009552002
@@ -8434,13 +8434,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0009964173014480738,
- "loss": 0.036,
+ "loss": 0.0348,
"macro_f1": 0.5492662787437439,
"num_tokens": 1430430.0,
"repeat_count": 0.0,
- "routers_loss": 0.04574459046125412,
+ "routers_loss": 0.036814019083976746,
"skip_count": 2.0,
"step": 888,
"text_loss": 0.4866008758544922
@@ -8453,13 +8453,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10595703125,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009963802201469398,
- "loss": 0.0485,
+ "loss": 0.0476,
"macro_f1": 0.3333333432674408,
"num_tokens": 1433821.0,
"repeat_count": 0.0,
- "routers_loss": 0.004683624487370253,
+ "routers_loss": 0.0041250260546803474,
"skip_count": 0.0,
"step": 890,
"text_loss": 0.578216552734375
@@ -8472,13 +8472,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "grad_norm": 0.2373046875,
"learning_rate": 0.0009963429486319693,
- "loss": 0.0476,
+ "loss": 0.0463,
"macro_f1": 0.32098764181137085,
"num_tokens": 1436976.0,
"repeat_count": 0.0,
- "routers_loss": 0.06499828398227692,
+ "routers_loss": 0.06213559955358505,
"skip_count": 2.0,
"step": 892,
"text_loss": 0.221701517701149
@@ -8486,18 +8486,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
- "avg_layers": 25.0,
+ "avg_layers": 26.0,
"epoch": 4.197240974464338,
- "f1_execute": 0.9411764740943909,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.4000000059604645,
- "grad_norm": 0.310546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.361328125,
"learning_rate": 0.0009963054869174446,
- "loss": 0.0326,
- "macro_f1": 0.44705885648727417,
+ "loss": 0.0313,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 1440397.0,
"repeat_count": 0.0,
- "routers_loss": 0.08285653591156006,
+ "routers_loss": 0.07532428950071335,
"skip_count": 2.0,
"step": 894,
"text_loss": 0.6922838091850281
@@ -8510,13 +8510,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.0009962678350177209,
- "loss": 0.0497,
+ "loss": 0.0472,
"macro_f1": 0.3272727429866791,
"num_tokens": 1443604.0,
"repeat_count": 0.0,
- "routers_loss": 0.04252336546778679,
+ "routers_loss": 0.0419243648648262,
"skip_count": 1.0,
"step": 896,
"text_loss": 0.22092342376708984
@@ -8524,18 +8524,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 4.216025829175227,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009962299929472268,
- "loss": 0.0349,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.034,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 1446257.0,
"repeat_count": 2.0,
- "routers_loss": 0.126711905002594,
+ "routers_loss": 0.10849297791719437,
"skip_count": 0.0,
"step": 898,
"text_loss": 0.26394811272621155
@@ -8548,13 +8548,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.000996191960720463,
- "loss": 0.0392,
+ "loss": 0.0394,
"macro_f1": 0.3333333432674408,
"num_tokens": 1449669.0,
"repeat_count": 0.0,
- "routers_loss": 0.00955706462264061,
+ "routers_loss": 0.0092767970636487,
"skip_count": 0.0,
"step": 900,
"text_loss": 0.5338577628135681
@@ -8567,13 +8567,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009961537383520042,
- "loss": 0.0377,
+ "loss": 0.0354,
"macro_f1": 0.3272727429866791,
"num_tokens": 1452450.0,
"repeat_count": 1.0,
- "routers_loss": 0.03127318620681763,
+ "routers_loss": 0.02985367365181446,
"skip_count": 0.0,
"step": 902,
"text_loss": 0.5875228047370911
@@ -8586,13 +8586,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.0009961153258564966,
- "loss": 0.0389,
+ "loss": 0.0378,
"macro_f1": 0.3144654333591461,
"num_tokens": 1456909.0,
"repeat_count": 0.0,
- "routers_loss": 0.06743519753217697,
+ "routers_loss": 0.06794842332601547,
"skip_count": 3.0,
"step": 904,
"text_loss": 0.40959444642066956
@@ -8605,13 +8605,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009960767232486604,
- "loss": 0.0477,
+ "loss": 0.0476,
"macro_f1": 0.3333333432674408,
"num_tokens": 1461712.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025313226506114006,
+ "routers_loss": 0.0023562447167932987,
"skip_count": 0.0,
"step": 906,
"text_loss": 0.3932875096797943
@@ -8624,13 +8624,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.08203125,
"learning_rate": 0.000996037930543288,
- "loss": 0.052,
+ "loss": 0.0505,
"macro_f1": 0.3272727429866791,
"num_tokens": 1464817.0,
"repeat_count": 0.0,
- "routers_loss": 0.037147488445043564,
+ "routers_loss": 0.03880339860916138,
"skip_count": 1.0,
"step": 908,
"text_loss": 0.17482402920722961
@@ -8643,13 +8643,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2119140625,
"learning_rate": 0.000995998947755245,
- "loss": 0.0501,
+ "loss": 0.0479,
"macro_f1": 0.3272727429866791,
"num_tokens": 1467810.0,
"repeat_count": 0.0,
- "routers_loss": 0.021232586354017258,
+ "routers_loss": 0.01736828312277794,
"skip_count": 1.0,
"step": 910,
"text_loss": 0.4140470325946808
@@ -8662,13 +8662,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009959597748994695,
- "loss": 0.0759,
+ "loss": 0.0752,
"macro_f1": 0.3333333432674408,
"num_tokens": 1470802.0,
"repeat_count": 0.0,
- "routers_loss": 0.010563847608864307,
+ "routers_loss": 0.011824851855635643,
"skip_count": 0.0,
"step": 912,
"text_loss": 0.7153383493423462
@@ -8681,13 +8681,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009959204119909726,
- "loss": 0.0425,
+ "loss": 0.0421,
"macro_f1": 0.3272727429866791,
"num_tokens": 1474539.0,
"repeat_count": 0.0,
- "routers_loss": 0.0267612524330616,
+ "routers_loss": 0.025456594303250313,
"skip_count": 0.0,
"step": 914,
"text_loss": 0.42812058329582214
@@ -8700,13 +8700,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009958808590448385,
- "loss": 0.0501,
+ "loss": 0.0489,
"macro_f1": 0.3333333432674408,
"num_tokens": 1477552.0,
"repeat_count": 0.0,
- "routers_loss": 0.005838244222104549,
+ "routers_loss": 0.006795851048082113,
"skip_count": 0.0,
"step": 916,
"text_loss": 0.5402814149856567
@@ -8719,13 +8719,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009958411160762234,
- "loss": 0.0383,
+ "loss": 0.039,
"macro_f1": 0.3333333432674408,
"num_tokens": 1482547.0,
"repeat_count": 0.0,
- "routers_loss": 0.014642171561717987,
+ "routers_loss": 0.015615932643413544,
"skip_count": 0.0,
"step": 918,
"text_loss": 0.3836168050765991
@@ -8738,32 +8738,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009958011831003577,
- "loss": 0.0457,
+ "loss": 0.0448,
"macro_f1": 0.3272727429866791,
"num_tokens": 1485807.0,
"repeat_count": 0.0,
- "routers_loss": 0.04119620472192764,
+ "routers_loss": 0.043541423976421356,
"skip_count": 1.0,
"step": 920,
"text_loss": 0.4333936274051666
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 4.328734957440563,
- "f1_execute": 0.943396270275116,
- "f1_repeat": 0.0,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.000995761060132543,
- "loss": 0.0433,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0418,
+ "macro_f1": 0.6538461446762085,
"num_tokens": 1488941.0,
"repeat_count": 1.0,
- "routers_loss": 0.06713195145130157,
+ "routers_loss": 0.05866432189941406,
"skip_count": 2.0,
"step": 922,
"text_loss": 0.4106994867324829
@@ -8776,13 +8776,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009957207471881552,
- "loss": 0.0533,
+ "loss": 0.0531,
"macro_f1": 0.5492662787437439,
"num_tokens": 1492026.0,
"repeat_count": 0.0,
- "routers_loss": 0.024023180827498436,
+ "routers_loss": 0.02714901603758335,
"skip_count": 2.0,
"step": 924,
"text_loss": 0.542091429233551
@@ -8795,13 +8795,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.17578125,
+ "grad_norm": 0.1796875,
"learning_rate": 0.0009956802442826415,
- "loss": 0.0373,
+ "loss": 0.0386,
"macro_f1": 0.3272727429866791,
"num_tokens": 1494543.0,
"repeat_count": 1.0,
- "routers_loss": 0.05399841442704201,
+ "routers_loss": 0.0563737191259861,
"skip_count": 0.0,
"step": 926,
"text_loss": 0.47209203243255615
@@ -8814,13 +8814,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009956395514315235,
- "loss": 0.0488,
+ "loss": 0.0496,
"macro_f1": 0.3272727429866791,
"num_tokens": 1497831.0,
"repeat_count": 1.0,
- "routers_loss": 0.0299264844506979,
+ "routers_loss": 0.03285066783428192,
"skip_count": 0.0,
"step": 928,
"text_loss": 0.6628931164741516
@@ -8833,13 +8833,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009955986686503943,
- "loss": 0.0467,
+ "loss": 0.0466,
"macro_f1": 0.3272727429866791,
"num_tokens": 1501375.0,
"repeat_count": 0.0,
- "routers_loss": 0.023478010669350624,
+ "routers_loss": 0.024297121912240982,
"skip_count": 1.0,
"step": 930,
"text_loss": 0.495676189661026
@@ -8852,13 +8852,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 1.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009955575959549202,
- "loss": 0.0447,
+ "loss": 0.0424,
"macro_f1": 0.7795917987823486,
"num_tokens": 1504363.0,
"repeat_count": 1.0,
- "routers_loss": 0.12116194516420364,
+ "routers_loss": 0.12196464836597443,
"skip_count": 4.0,
"step": 932,
"text_loss": 0.26123273372650146
@@ -8871,13 +8871,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.0009955163333608408,
- "loss": 0.053,
+ "loss": 0.0538,
"macro_f1": 0.3333333432674408,
"num_tokens": 1507178.0,
"repeat_count": 0.0,
- "routers_loss": 0.011879723519086838,
+ "routers_loss": 0.012947078794240952,
"skip_count": 0.0,
"step": 934,
"text_loss": 0.32552677392959595
@@ -8890,13 +8890,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009954748808839674,
- "loss": 0.0373,
+ "loss": 0.0379,
"macro_f1": 0.3333333432674408,
"num_tokens": 1509910.0,
"repeat_count": 0.0,
- "routers_loss": 0.009245929308235645,
+ "routers_loss": 0.008946365676820278,
"skip_count": 0.0,
"step": 936,
"text_loss": 0.533141016960144
@@ -8909,13 +8909,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.140625,
"learning_rate": 0.000995433238540185,
- "loss": 0.0461,
+ "loss": 0.0466,
"macro_f1": 0.6538461446762085,
"num_tokens": 1512826.0,
"repeat_count": 1.0,
- "routers_loss": 0.032464127987623215,
+ "routers_loss": 0.029975678771734238,
"skip_count": 1.0,
"step": 938,
"text_loss": 0.2953577935695648
@@ -8928,13 +8928,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009953914063454512,
- "loss": 0.0515,
+ "loss": 0.0497,
"macro_f1": 0.3144654333591461,
"num_tokens": 1517230.0,
"repeat_count": 1.0,
- "routers_loss": 0.08835392445325851,
+ "routers_loss": 0.0889134630560875,
"skip_count": 2.0,
"step": 940,
"text_loss": 0.5368834733963013
@@ -8947,13 +8947,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.193359375,
"learning_rate": 0.000995349384315796,
- "loss": 0.0405,
+ "loss": 0.0413,
"macro_f1": 0.3333333432674408,
"num_tokens": 1519876.0,
"repeat_count": 0.0,
- "routers_loss": 0.014307246543467045,
+ "routers_loss": 0.013458753935992718,
"skip_count": 0.0,
"step": 942,
"text_loss": 0.2005518227815628
@@ -8966,13 +8966,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.000995307172467322,
- "loss": 0.0449,
+ "loss": 0.0444,
"macro_f1": 0.31446540355682373,
"num_tokens": 1522998.0,
"repeat_count": 1.0,
- "routers_loss": 0.10261563211679459,
+ "routers_loss": 0.08850377053022385,
"skip_count": 1.0,
"step": 944,
"text_loss": 0.227926567196846
@@ -8985,13 +8985,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009952647708162054,
- "loss": 0.0507,
+ "loss": 0.0503,
"macro_f1": 0.3272727429866791,
"num_tokens": 1527100.0,
"repeat_count": 0.0,
- "routers_loss": 0.03316422924399376,
+ "routers_loss": 0.03199794515967369,
"skip_count": 1.0,
"step": 946,
"text_loss": 0.4859686493873596
@@ -9004,13 +9004,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009952221793786942,
- "loss": 0.0352,
+ "loss": 0.0354,
"macro_f1": 0.3333333432674408,
"num_tokens": 1530028.0,
"repeat_count": 0.0,
- "routers_loss": 0.00902469176799059,
+ "routers_loss": 0.006507779937237501,
"skip_count": 0.0,
"step": 948,
"text_loss": 0.6855354905128479
@@ -9023,13 +9023,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.0009951793981711097,
- "loss": 0.0581,
+ "loss": 0.0584,
"macro_f1": 0.6538461446762085,
"num_tokens": 1533254.0,
"repeat_count": 1.0,
- "routers_loss": 0.06710167229175568,
+ "routers_loss": 0.06175103038549423,
"skip_count": 1.0,
"step": 950,
"text_loss": 0.7590400576591492
@@ -9042,13 +9042,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009951364272098458,
- "loss": 0.0294,
+ "loss": 0.0295,
"macro_f1": 0.5492662787437439,
"num_tokens": 1536239.0,
"repeat_count": 0.0,
- "routers_loss": 0.04208769276738167,
+ "routers_loss": 0.03773383051156998,
"skip_count": 2.0,
"step": 952,
"text_loss": 0.669784665107727
@@ -9061,13 +9061,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009950932665113688,
- "loss": 0.0505,
+ "loss": 0.0507,
"macro_f1": 0.32098764181137085,
"num_tokens": 1539682.0,
"repeat_count": 0.0,
- "routers_loss": 0.06530380249023438,
+ "routers_loss": 0.07280613481998444,
"skip_count": 2.0,
"step": 954,
"text_loss": 0.3365570902824402
@@ -9080,13 +9080,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009950499160922184,
- "loss": 0.0545,
+ "loss": 0.0541,
"macro_f1": 0.3333333432674408,
"num_tokens": 1542875.0,
"repeat_count": 0.0,
- "routers_loss": 0.01803453080356121,
+ "routers_loss": 0.01770266517996788,
"skip_count": 0.0,
"step": 956,
"text_loss": 0.0921545997262001
@@ -9099,13 +9099,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.09375,
"learning_rate": 0.000995006375969006,
- "loss": 0.0481,
+ "loss": 0.0473,
"macro_f1": 0.3272727429866791,
"num_tokens": 1547135.0,
"repeat_count": 1.0,
- "routers_loss": 0.08461762219667435,
+ "routers_loss": 0.07672002166509628,
"skip_count": 0.0,
"step": 958,
"text_loss": 0.5887606739997864
@@ -9120,11 +9120,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1376953125,
"learning_rate": 0.0009949626461584165,
- "loss": 0.0441,
+ "loss": 0.043,
"macro_f1": 0.3333333432674408,
"num_tokens": 1550100.0,
"repeat_count": 0.0,
- "routers_loss": 0.007111486047506332,
+ "routers_loss": 0.006247182376682758,
"skip_count": 0.0,
"step": 960,
"text_loss": 0.5777931213378906
@@ -9137,13 +9137,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.119140625,
"learning_rate": 0.0009949187266772076,
- "loss": 0.0361,
+ "loss": 0.0366,
"macro_f1": 0.5492662787437439,
"num_tokens": 1553192.0,
"repeat_count": 0.0,
- "routers_loss": 0.029776185750961304,
+ "routers_loss": 0.030319908633828163,
"skip_count": 2.0,
"step": 962,
"text_loss": 0.2370252162218094
@@ -9156,13 +9156,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009948746175422088,
- "loss": 0.0506,
+ "loss": 0.0511,
"macro_f1": 0.3333333432674408,
"num_tokens": 1556318.0,
"repeat_count": 0.0,
- "routers_loss": 0.007108999416232109,
+ "routers_loss": 0.006004320923238993,
"skip_count": 0.0,
"step": 964,
"text_loss": 0.6271032094955444
@@ -9175,13 +9175,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000994830318770323,
- "loss": 0.0498,
+ "loss": 0.0514,
"macro_f1": 0.3333333432674408,
"num_tokens": 1559195.0,
"repeat_count": 0.0,
- "routers_loss": 0.01126947533339262,
+ "routers_loss": 0.011544366367161274,
"skip_count": 0.0,
"step": 966,
"text_loss": 0.47256720066070557
@@ -9194,13 +9194,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009947858303785255,
- "loss": 0.0366,
+ "loss": 0.0374,
"macro_f1": 0.6603773832321167,
"num_tokens": 1561813.0,
"repeat_count": 1.0,
- "routers_loss": 0.05142999067902565,
+ "routers_loss": 0.05258861929178238,
"skip_count": 1.0,
"step": 968,
"text_loss": 0.7703132629394531
@@ -9213,13 +9213,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.0009947411523838648,
- "loss": 0.0461,
+ "loss": 0.0453,
"macro_f1": 0.3333333432674408,
"num_tokens": 1564634.0,
"repeat_count": 0.0,
- "routers_loss": 0.010770819149911404,
+ "routers_loss": 0.011216280050575733,
"skip_count": 0.0,
"step": 970,
"text_loss": 0.4666804075241089
@@ -9232,13 +9232,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009946962848034608,
- "loss": 0.0692,
+ "loss": 0.0696,
"macro_f1": 0.3333333432674408,
"num_tokens": 1567959.0,
"repeat_count": 0.0,
- "routers_loss": 0.008775795809924603,
+ "routers_loss": 0.009387624450027943,
"skip_count": 0.0,
"step": 972,
"text_loss": 0.4067264199256897
@@ -9251,13 +9251,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.203125,
"learning_rate": 0.0009946512276545075,
- "loss": 0.0403,
+ "loss": 0.0397,
"macro_f1": 0.3272727429866791,
"num_tokens": 1571221.0,
"repeat_count": 1.0,
- "routers_loss": 0.05100395902991295,
+ "routers_loss": 0.041713520884513855,
"skip_count": 0.0,
"step": 974,
"text_loss": 0.5242366194725037
@@ -9270,13 +9270,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.228515625,
"learning_rate": 0.0009946059809542705,
- "loss": 0.0503,
+ "loss": 0.0487,
"macro_f1": 0.7644445300102234,
"num_tokens": 1575033.0,
"repeat_count": 2.0,
- "routers_loss": 0.06653711199760437,
+ "routers_loss": 0.05748331546783447,
"skip_count": 2.0,
"step": 976,
"text_loss": 0.5704690217971802
@@ -9284,18 +9284,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 4.591722923393014,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0009945605447200887,
- "loss": 0.0435,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0445,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1579050.0,
"repeat_count": 0.0,
- "routers_loss": 0.009865665808320045,
+ "routers_loss": 0.016765203326940536,
"skip_count": 0.0,
"step": 978,
"text_loss": 0.4804173707962036
@@ -9308,13 +9308,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.0009945149189693732,
- "loss": 0.0399,
+ "loss": 0.0406,
"macro_f1": 0.5492662787437439,
"num_tokens": 1582967.0,
"repeat_count": 0.0,
- "routers_loss": 0.021175632253289223,
+ "routers_loss": 0.021518222987651825,
"skip_count": 2.0,
"step": 980,
"text_loss": 0.4138598144054413
@@ -9327,32 +9327,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0009944691037196078,
- "loss": 0.0472,
+ "loss": 0.0456,
"macro_f1": 0.3333333432674408,
"num_tokens": 1586282.0,
"repeat_count": 0.0,
- "routers_loss": 0.011803832836449146,
+ "routers_loss": 0.012246460653841496,
"skip_count": 0.0,
"step": 982,
"text_loss": 0.22561736404895782
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 0.5,
"acc_skip": 0.800000011920929,
- "avg_layers": 23.0,
+ "avg_layers": 24.0,
"epoch": 4.6199002054593485,
- "f1_execute": 0.9090908765792847,
- "f1_repeat": 0.0,
+ "f1_execute": 0.930232584476471,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 0.8000000715255737,
- "grad_norm": 0.142578125,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009944230989883491,
- "loss": 0.0467,
- "macro_f1": 0.5696970224380493,
+ "loss": 0.0456,
+ "macro_f1": 0.7989664077758789,
"num_tokens": 1589279.0,
"repeat_count": 2.0,
- "routers_loss": 0.08856551349163055,
+ "routers_loss": 0.09344895929098129,
"skip_count": 5.0,
"step": 984,
"text_loss": 0.4416656494140625
@@ -9365,13 +9365,13 @@
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.111328125,
"learning_rate": 0.0009943769047932264,
- "loss": 0.0413,
+ "loss": 0.0404,
"macro_f1": 0.5359477400779724,
"num_tokens": 1592398.0,
"repeat_count": 2.0,
- "routers_loss": 0.08593414723873138,
+ "routers_loss": 0.08916857838630676,
"skip_count": 2.0,
"step": 986,
"text_loss": 0.5536438822746277
@@ -9384,13 +9384,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000994330521151941,
- "loss": 0.0399,
+ "loss": 0.039,
"macro_f1": 0.32098764181137085,
"num_tokens": 1596213.0,
"repeat_count": 1.0,
- "routers_loss": 0.07049509882926941,
+ "routers_loss": 0.06114347651600838,
"skip_count": 1.0,
"step": 988,
"text_loss": 0.5835405588150024
@@ -9403,13 +9403,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.1953125,
"learning_rate": 0.000994283948082267,
- "loss": 0.0595,
+ "loss": 0.0573,
"macro_f1": 0.3333333432674408,
"num_tokens": 1598827.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019258069805800915,
+ "routers_loss": 0.0017335431184619665,
"skip_count": 0.0,
"step": 990,
"text_loss": 0.5857380032539368
@@ -9422,13 +9422,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009942371856020522,
- "loss": 0.0335,
+ "loss": 0.0341,
"macro_f1": 0.3333333432674408,
"num_tokens": 1602915.0,
"repeat_count": 0.0,
- "routers_loss": 0.014094089157879353,
+ "routers_loss": 0.014606470242142677,
"skip_count": 0.0,
"step": 992,
"text_loss": 0.6939892768859863
@@ -9436,18 +9436,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 31.0,
"epoch": 4.666862342236572,
- "f1_execute": 0.9583333134651184,
+ "f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009941902337292155,
- "loss": 0.0603,
- "macro_f1": 0.6527777910232544,
+ "loss": 0.06,
+ "macro_f1": 0.6598639488220215,
"num_tokens": 1605776.0,
"repeat_count": 3.0,
- "routers_loss": 0.06360147893428802,
+ "routers_loss": 0.06297315657138824,
"skip_count": 1.0,
"step": 994,
"text_loss": 0.37616831064224243
@@ -9460,13 +9460,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009941430924817487,
- "loss": 0.0573,
+ "loss": 0.0572,
"macro_f1": 0.5492662787437439,
"num_tokens": 1609856.0,
"repeat_count": 0.0,
- "routers_loss": 0.0326208658516407,
+ "routers_loss": 0.03297794610261917,
"skip_count": 2.0,
"step": 996,
"text_loss": 0.2098303586244583
@@ -9479,13 +9479,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09912109375,
+ "grad_norm": 0.10107421875,
"learning_rate": 0.000994095761877717,
- "loss": 0.0502,
+ "loss": 0.0499,
"macro_f1": 0.3333333432674408,
"num_tokens": 1612904.0,
"repeat_count": 0.0,
- "routers_loss": 0.012660752050578594,
+ "routers_loss": 0.012901155278086662,
"skip_count": 0.0,
"step": 998,
"text_loss": 0.20103533565998077
@@ -9498,13 +9498,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.265625,
+ "grad_norm": 0.259765625,
"learning_rate": 0.000994048241935257,
- "loss": 0.0537,
+ "loss": 0.0535,
"macro_f1": 0.3272727429866791,
"num_tokens": 1615540.0,
"repeat_count": 0.0,
- "routers_loss": 0.021756287664175034,
+ "routers_loss": 0.020434845238924026,
"skip_count": 0.0,
"step": 1000,
"text_loss": 0.32709044218063354
diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin
index deeea733277b4031781a5b299881dd8e675e7606..a3d3ae372faf14539639f54454aa52b6ee730c4a 100644
--- a/checkpoint-1000/training_args.bin
+++ b/checkpoint-1000/training_args.bin
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:0b3f5975f57762b552c7ee29776bf32a4dbb125781a0658488d3884fb25c5296
+oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8
size 5880
diff --git a/checkpoint-10000/chat_template.jinja b/checkpoint-10000/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0
--- /dev/null
+++ b/checkpoint-10000/chat_template.jinja
@@ -0,0 +1,93 @@
+{{- bos_token }}
+{%- if custom_tools is defined %}
+ {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+ {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+ {%- if strftime_now is defined %}
+ {%- set date_string = strftime_now("%d %b %Y") %}
+ {%- else %}
+ {%- set date_string = "26 Jul 2024" %}
+ {%- endif %}
+{%- endif %}
+{%- if not tools is defined %}
+ {%- set tools = none %}
+{%- endif %}
+
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+ {%- set system_message = messages[0]['content']|trim %}
+ {%- set messages = messages[1:] %}
+{%- else %}
+ {%- set system_message = "" %}
+{%- endif %}
+
+{#- System message #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if tools is not none %}
+ {{- "Environment: ipython\n" }}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+ {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+ {{- "Do not use variables.\n\n" }}
+ {%- for t in tools %}
+ {{- t | tojson(indent=4) }}
+ {{- "\n\n" }}
+ {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+ {#- Extract the first user message so we can plug it in here #}
+ {%- if messages | length != 0 %}
+ {%- set first_user_message = messages[0]['content']|trim %}
+ {%- set messages = messages[1:] %}
+ {%- else %}
+ {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+ {{- "Given the following functions, please respond with a JSON for a function call " }}
+ {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+ {{- "Do not use variables.\n\n" }}
+ {%- for t in tools %}
+ {{- t | tojson(indent=4) }}
+ {{- "\n\n" }}
+ {%- endfor %}
+ {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+
+{%- for message in messages %}
+ {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+ {%- elif 'tool_calls' in message %}
+ {%- if not message.tool_calls|length == 1 %}
+ {{- raise_exception("This model only supports single tool-calls at once!") }}
+ {%- endif %}
+ {%- set tool_call = message.tool_calls[0].function %}
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+ {{- '{"name": "' + tool_call.name + '", ' }}
+ {{- '"parameters": ' }}
+ {{- tool_call.arguments | tojson }}
+ {{- "}" }}
+ {{- "<|eot_id|>" }}
+ {%- elif message.role == "tool" or message.role == "ipython" %}
+ {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+ {%- if message.content is mapping or message.content is iterable %}
+ {{- message.content | tojson }}
+ {%- else %}
+ {{- message.content }}
+ {%- endif %}
+ {{- "<|eot_id|>" }}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}
diff --git a/checkpoint-10000/config.json b/checkpoint-10000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3552bd1c531626bd125241ad5dfcd7fb677462cd
--- /dev/null
+++ b/checkpoint-10000/config.json
@@ -0,0 +1,39 @@
+{
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 128000,
+ "eos_token_id": [
+ 128001,
+ 128008,
+ 128009
+ ],
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 3072,
+ "initializer_range": 0.02,
+ "intermediate_size": 8192,
+ "max_position_embeddings": 131072,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 8,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "factor": 32.0,
+ "high_freq_factor": 4.0,
+ "low_freq_factor": 1.0,
+ "original_max_position_embeddings": 8192,
+ "rope_type": "llama3"
+ },
+ "rope_theta": 500000.0,
+ "tie_word_embeddings": true,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.55.2",
+ "use_cache": true,
+ "vocab_size": 128256
+}
diff --git a/checkpoint-10000/generation_config.json b/checkpoint-10000/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b513e54e3195b917260c9a8a04c9f3683f19de35
--- /dev/null
+++ b/checkpoint-10000/generation_config.json
@@ -0,0 +1,12 @@
+{
+ "bos_token_id": 128000,
+ "do_sample": true,
+ "eos_token_id": [
+ 128001,
+ 128008,
+ 128009
+ ],
+ "temperature": 0.6,
+ "top_p": 0.9,
+ "transformers_version": "4.55.2"
+}
diff --git a/checkpoint-10000/model-00001-of-00002.safetensors b/checkpoint-10000/model-00001-of-00002.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..08a01e1ba553cdcb2222f034a209861d7b54e284
--- /dev/null
+++ b/checkpoint-10000/model-00001-of-00002.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13cbd6d16e927a0c5bad54102514e6e18b4a47b3a6eb911e39d678d328d19f55
+size 4965799096
diff --git a/checkpoint-10000/model-00002-of-00002.safetensors b/checkpoint-10000/model-00002-of-00002.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..22312c108c4857773753d52c1f1a230315388e35
--- /dev/null
+++ b/checkpoint-10000/model-00002-of-00002.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b58dd61aa65becc607555e6f23c5942f6e74879d50af451a1fa3137e6aca6ea
+size 1481790520
diff --git a/checkpoint-10000/model.safetensors.index.json b/checkpoint-10000/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..21bb567761d75ade0c0eef6495c450697dd3ff18
--- /dev/null
+++ b/checkpoint-10000/model.safetensors.index.json
@@ -0,0 +1,374 @@
+{
+ "metadata": {
+ "total_parameters": 3223774292,
+ "total_size": 6447548584
+ },
+ "weight_map": {
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.norm.weight": "model-00002-of-00002.safetensors",
+ "model.routers.0.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.0.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.0.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.0.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.1.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.1.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.1.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.1.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.10.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.10.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.10.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.10.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.11.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.11.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.11.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.11.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.12.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.12.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.12.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.12.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.13.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.13.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.13.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.13.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.14.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.14.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.14.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.14.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.15.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.15.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.15.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.15.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.16.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.16.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.16.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.16.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.17.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.17.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.17.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.17.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.18.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.18.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.18.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.18.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.19.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.19.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.19.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.19.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.2.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.2.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.2.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.2.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.20.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.20.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.20.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.20.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.21.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.21.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.21.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.21.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.22.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.22.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.22.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.22.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.23.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.23.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.23.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.23.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.24.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.24.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.24.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.24.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.25.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.25.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.25.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.25.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.26.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.26.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.26.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.26.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.27.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.27.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.27.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.27.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.3.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.3.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.3.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.3.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.4.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.4.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.4.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.4.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.5.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.5.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.5.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.5.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.6.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.6.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.6.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.6.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.7.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.7.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.7.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.7.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.8.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.8.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.8.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.8.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.9.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.9.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.9.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.9.linear2.weight": "model-00002-of-00002.safetensors"
+ }
+}
diff --git a/checkpoint-10000/optimizer.pt b/checkpoint-10000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2ec455b471f1a42016d92c425d6a270ff218ceea
--- /dev/null
+++ b/checkpoint-10000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:840250979ca530333499796a7bf10f85f6e6db757e225d1fa8ebc9adffb26459
+size 44191162
diff --git a/checkpoint-10000/rng_state.pth b/checkpoint-10000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ec62bcc28f4b60e6182ac81ef0e159a2cf3e7183
--- /dev/null
+++ b/checkpoint-10000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dec9f38518272c3edcef2f4b76e7cf3ba41857ee958849b9cec81d28afbeefdc
+size 14244
diff --git a/checkpoint-10000/scheduler.pt b/checkpoint-10000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a7b2696f98af8e73f5f371279422f32926e8d228
--- /dev/null
+++ b/checkpoint-10000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f553e44a7bafda5633bddec9889354deb82b5be5f750b2e4c9bae414b2b61fd3
+size 1064
diff --git a/checkpoint-10000/special_tokens_map.json b/checkpoint-10000/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..165b36bc2293dda9a2fb3c0daf6577d9eba9df7a
--- /dev/null
+++ b/checkpoint-10000/special_tokens_map.json
@@ -0,0 +1,17 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<|finetune_right_pad_id|>"
+}
diff --git a/checkpoint-10000/tokenizer.json b/checkpoint-10000/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-10000/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-10000/tokenizer_config.json b/checkpoint-10000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c68051fe3c4d23234a59316bc52d21f6e3a4182c
--- /dev/null
+++ b/checkpoint-10000/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|finetune_right_pad_id|>",
+ "tokenizer_class": "PreTrainedTokenizerFast"
+}
diff --git a/checkpoint-10000/trainer_state.json b/checkpoint-10000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ceed09df682bd89d0877b30a2e15e27091dbc1d8
--- /dev/null
+++ b/checkpoint-10000/trainer_state.json
@@ -0,0 +1,95034 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 46.94863516289991,
+ "eval_steps": 500,
+ "global_step": 10000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 25.0,
+ "epoch": 0.009392427355444672,
+ "f1_execute": 0.6976743936538696,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 2.25,
+ "learning_rate": 2e-06,
+ "loss": 0.4974,
+ "macro_f1": 0.23255813121795654,
+ "num_tokens": 3175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.4339469373226166,
+ "skip_count": 0.0,
+ "step": 2,
+ "text_loss": 0.3330848515033722
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 23.0,
+ "epoch": 0.018784854710889344,
+ "f1_execute": 0.7272726893424988,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.8359375,
+ "learning_rate": 6e-06,
+ "loss": 0.4988,
+ "macro_f1": 0.24242423474788666,
+ "num_tokens": 5816.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.4511934816837311,
+ "skip_count": 1.0,
+ "step": 4,
+ "text_loss": 0.4571273922920227
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.02817728206633402,
+ "f1_execute": 0.6666666865348816,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 2.234375,
+ "learning_rate": 1e-05,
+ "loss": 0.5113,
+ "macro_f1": 0.222222238779068,
+ "num_tokens": 9739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.49306994676589966,
+ "skip_count": 0.0,
+ "step": 6,
+ "text_loss": 0.41060560941696167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.03756970942177869,
+ "f1_execute": 0.5641025900840759,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.7265625,
+ "learning_rate": 1.4e-05,
+ "loss": 0.4766,
+ "macro_f1": 0.18803420662879944,
+ "num_tokens": 12869.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.48872503638267517,
+ "skip_count": 2.0,
+ "step": 8,
+ "text_loss": 0.36678561568260193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.046962136777223364,
+ "f1_execute": 0.6976743936538696,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.78125,
+ "learning_rate": 1.8e-05,
+ "loss": 0.4806,
+ "macro_f1": 0.23255813121795654,
+ "num_tokens": 15845.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.45077216625213623,
+ "skip_count": 0.0,
+ "step": 10,
+ "text_loss": 0.5597779154777527
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 26.0,
+ "epoch": 0.05635456413266804,
+ "f1_execute": 0.7179487347602844,
+ "f1_repeat": 0.2857142984867096,
+ "f1_skip": 0.20000000298023224,
+ "grad_norm": 1.5390625,
+ "learning_rate": 2.2e-05,
+ "loss": 0.4557,
+ "macro_f1": 0.40122103691101074,
+ "num_tokens": 19353.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.4130440056324005,
+ "skip_count": 3.0,
+ "step": 12,
+ "text_loss": 0.2056603729724884
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.06574699148811271,
+ "f1_execute": 0.6976743936538696,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 2.4375,
+ "learning_rate": 2.6e-05,
+ "loss": 0.5129,
+ "macro_f1": 0.23255813121795654,
+ "num_tokens": 22675.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.4582902193069458,
+ "skip_count": 0.0,
+ "step": 14,
+ "text_loss": 0.32989829778671265
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 0.07513941884355738,
+ "f1_execute": 0.6829268336296082,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.2222222238779068,
+ "grad_norm": 1.7421875,
+ "learning_rate": 3e-05,
+ "loss": 0.4729,
+ "macro_f1": 0.3017163574695587,
+ "num_tokens": 26022.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.42910993099212646,
+ "skip_count": 1.0,
+ "step": 16,
+ "text_loss": 0.1353905349969864
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.08453184619900206,
+ "f1_execute": 0.7555555105209351,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.4765625,
+ "learning_rate": 3.4000000000000007e-05,
+ "loss": 0.4274,
+ "macro_f1": 0.2518518567085266,
+ "num_tokens": 29251.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.3990713059902191,
+ "skip_count": 0.0,
+ "step": 18,
+ "text_loss": 0.3806765377521515
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 26.0,
+ "epoch": 0.09392427355444673,
+ "f1_execute": 0.6829268336296082,
+ "f1_repeat": 0.2857142984867096,
+ "f1_skip": 0.0,
+ "grad_norm": 1.3125,
+ "learning_rate": 3.8e-05,
+ "loss": 0.4261,
+ "macro_f1": 0.3228803873062134,
+ "num_tokens": 32545.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.40146592259407043,
+ "skip_count": 0.0,
+ "step": 20,
+ "text_loss": 0.25648367404937744
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 26.0,
+ "epoch": 0.1033167009098914,
+ "f1_execute": 0.7272727489471436,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.625,
+ "learning_rate": 4.2000000000000004e-05,
+ "loss": 0.404,
+ "macro_f1": 0.24242424964904785,
+ "num_tokens": 36560.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.372715026140213,
+ "skip_count": 0.0,
+ "step": 22,
+ "text_loss": 0.2799522578716278
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.11270912826533608,
+ "f1_execute": 0.7555555105209351,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.6328125,
+ "learning_rate": 4.6e-05,
+ "loss": 0.4218,
+ "macro_f1": 0.2518518567085266,
+ "num_tokens": 39597.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.4504941403865814,
+ "skip_count": 0.0,
+ "step": 24,
+ "text_loss": 0.6635695695877075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.12210155562078075,
+ "f1_execute": 0.8085106015205383,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.7109375,
+ "learning_rate": 5e-05,
+ "loss": 0.3886,
+ "macro_f1": 0.26950353384017944,
+ "num_tokens": 43080.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.3498791456222534,
+ "skip_count": 0.0,
+ "step": 26,
+ "text_loss": 0.7035041451454163
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.13149398297622542,
+ "f1_execute": 0.8085106015205383,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.34375,
+ "learning_rate": 5.4e-05,
+ "loss": 0.3724,
+ "macro_f1": 0.26950353384017944,
+ "num_tokens": 46406.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.31265875697135925,
+ "skip_count": 0.0,
+ "step": 28,
+ "text_loss": 0.6388277411460876
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.1408864103316701,
+ "f1_execute": 0.8571428060531616,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.2578125,
+ "learning_rate": 5.800000000000001e-05,
+ "loss": 0.341,
+ "macro_f1": 0.2857142686843872,
+ "num_tokens": 49966.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.3200918138027191,
+ "skip_count": 2.0,
+ "step": 30,
+ "text_loss": 0.17372547090053558
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 25.0,
+ "epoch": 0.15027883768711475,
+ "f1_execute": 0.8571428060531616,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.4140625,
+ "learning_rate": 6.2e-05,
+ "loss": 0.3207,
+ "macro_f1": 0.2857142686843872,
+ "num_tokens": 53378.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.32304447889328003,
+ "skip_count": 1.0,
+ "step": 32,
+ "text_loss": 0.18196581304073334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 25.0,
+ "epoch": 0.15967126504255943,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.46875,
+ "learning_rate": 6.6e-05,
+ "loss": 0.3304,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 56933.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.24814388155937195,
+ "skip_count": 0.0,
+ "step": 34,
+ "text_loss": 0.28823015093803406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 26.0,
+ "epoch": 0.16906369239800412,
+ "f1_execute": 0.9019607901573181,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.1171875,
+ "learning_rate": 7.000000000000001e-05,
+ "loss": 0.2778,
+ "macro_f1": 0.3006536066532135,
+ "num_tokens": 60744.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.22411039471626282,
+ "skip_count": 0.0,
+ "step": 36,
+ "text_loss": 0.5260357856750488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.17845611975344877,
+ "f1_execute": 0.8571428656578064,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.484375,
+ "learning_rate": 7.4e-05,
+ "loss": 0.2738,
+ "macro_f1": 0.2857142984867096,
+ "num_tokens": 64900.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.44355395436286926,
+ "skip_count": 0.0,
+ "step": 38,
+ "text_loss": 0.5382097363471985
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 26.0,
+ "epoch": 0.18784854710889345,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.3828125,
+ "learning_rate": 7.8e-05,
+ "loss": 0.2137,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 68000.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.202330082654953,
+ "skip_count": 0.0,
+ "step": 40,
+ "text_loss": 0.5946118831634521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 25.0,
+ "epoch": 0.19724097446433814,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.78125,
+ "learning_rate": 8.2e-05,
+ "loss": 0.21,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 70529.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.18023855984210968,
+ "skip_count": 0.0,
+ "step": 42,
+ "text_loss": 0.5550904273986816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.2066334018197828,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.609375,
+ "learning_rate": 8.599999999999999e-05,
+ "loss": 0.1918,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 73427.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.2101590931415558,
+ "skip_count": 0.0,
+ "step": 44,
+ "text_loss": 0.4636923372745514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.21602582917522747,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.53125,
+ "learning_rate": 8.999999999999999e-05,
+ "loss": 0.1881,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 76472.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.11800424009561539,
+ "skip_count": 0.0,
+ "step": 46,
+ "text_loss": 0.4187001883983612
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.22541825653067216,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.953125,
+ "learning_rate": 9.400000000000001e-05,
+ "loss": 0.1446,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 79124.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11632519960403442,
+ "skip_count": 0.0,
+ "step": 48,
+ "text_loss": 0.2253919243812561
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.2348106838861168,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.58984375,
+ "learning_rate": 9.800000000000001e-05,
+ "loss": 0.1543,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 81980.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09669367223978043,
+ "skip_count": 0.0,
+ "step": 50,
+ "text_loss": 0.6053179502487183
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 26.0,
+ "epoch": 0.2442031112415615,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.8515625,
+ "learning_rate": 0.000102,
+ "loss": 0.1393,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 85236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.12471720576286316,
+ "skip_count": 0.0,
+ "step": 52,
+ "text_loss": 0.6027331948280334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.2535955385970062,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.421875,
+ "learning_rate": 0.000106,
+ "loss": 0.1473,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 88238.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.1376056969165802,
+ "skip_count": 2.0,
+ "step": 54,
+ "text_loss": 0.2861751616001129
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.26298796595245083,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.35546875,
+ "learning_rate": 0.00011,
+ "loss": 0.1082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 91056.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07449393719434738,
+ "skip_count": 0.0,
+ "step": 56,
+ "text_loss": 0.48106974363327026
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 26.0,
+ "epoch": 0.2723803933078955,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.271484375,
+ "learning_rate": 0.000114,
+ "loss": 0.1123,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 94987.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07064720243215561,
+ "skip_count": 0.0,
+ "step": 58,
+ "text_loss": 0.3554874658584595
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.2817728206633402,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.5390625,
+ "learning_rate": 0.000118,
+ "loss": 0.1234,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 97909.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.16835889220237732,
+ "skip_count": 2.0,
+ "step": 60,
+ "text_loss": 0.5475804805755615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.29116524801878485,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2353515625,
+ "learning_rate": 0.000122,
+ "loss": 0.1224,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 101043.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06127442046999931,
+ "skip_count": 0.0,
+ "step": 62,
+ "text_loss": 0.5966938734054565
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.3005576753742295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.212890625,
+ "learning_rate": 0.000126,
+ "loss": 0.0931,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 104103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.047825805842876434,
+ "skip_count": 0.0,
+ "step": 64,
+ "text_loss": 0.5480486750602722
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.3099501027296742,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.00013000000000000002,
+ "loss": 0.1088,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 107009.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.275174081325531,
+ "skip_count": 4.0,
+ "step": 66,
+ "text_loss": 0.41714492440223694
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.31934253008511887,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1923828125,
+ "learning_rate": 0.000134,
+ "loss": 0.1123,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 110486.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.029025178402662277,
+ "skip_count": 0.0,
+ "step": 68,
+ "text_loss": 0.6775627732276917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.3287349574405635,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.314453125,
+ "learning_rate": 0.00013800000000000002,
+ "loss": 0.1049,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 113878.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.10141710191965103,
+ "skip_count": 1.0,
+ "step": 70,
+ "text_loss": 0.6678873896598816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.33812738479600823,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.248046875,
+ "learning_rate": 0.00014199999999999998,
+ "loss": 0.1119,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 116989.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08002066612243652,
+ "skip_count": 1.0,
+ "step": 72,
+ "text_loss": 0.405692994594574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.3475198121514529,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1787109375,
+ "learning_rate": 0.000146,
+ "loss": 0.0944,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 119883.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.1867009848356247,
+ "skip_count": 3.0,
+ "step": 74,
+ "text_loss": 0.44616150856018066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.35691223950689754,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.333984375,
+ "learning_rate": 0.00015,
+ "loss": 0.1003,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 123325.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07042168825864792,
+ "skip_count": 2.0,
+ "step": 76,
+ "text_loss": 0.11340200901031494
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.36630466686234225,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.26171875,
+ "learning_rate": 0.000154,
+ "loss": 0.1066,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 126131.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.11535373330116272,
+ "skip_count": 2.0,
+ "step": 78,
+ "text_loss": 0.3269135355949402
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.3756970942177869,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.255859375,
+ "learning_rate": 0.000158,
+ "loss": 0.0891,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 130349.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09497501701116562,
+ "skip_count": 1.0,
+ "step": 80,
+ "text_loss": 0.15273472666740417
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.38508952157323156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 0.000162,
+ "loss": 0.0929,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 133607.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030639523640275,
+ "skip_count": 0.0,
+ "step": 82,
+ "text_loss": 0.282884806394577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.3944819489286763,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
+ "learning_rate": 0.00016600000000000002,
+ "loss": 0.1254,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 136694.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07906441390514374,
+ "skip_count": 1.0,
+ "step": 84,
+ "text_loss": 0.459094375371933
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.40387437628412093,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.212890625,
+ "learning_rate": 0.00017,
+ "loss": 0.1071,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 139966.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.1124570444226265,
+ "skip_count": 2.0,
+ "step": 86,
+ "text_loss": 0.29985448718070984
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.4132668036395656,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25390625,
+ "learning_rate": 0.000174,
+ "loss": 0.1031,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 142788.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.1966402679681778,
+ "skip_count": 0.0,
+ "step": 88,
+ "text_loss": 0.6435291767120361
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.4226592309950103,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.349609375,
+ "learning_rate": 0.000178,
+ "loss": 0.0963,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 146192.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0325632207095623,
+ "skip_count": 0.0,
+ "step": 90,
+ "text_loss": 0.35170626640319824
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.43205165835045495,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2265625,
+ "learning_rate": 0.000182,
+ "loss": 0.1073,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 149792.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.15115146338939667,
+ "skip_count": 1.0,
+ "step": 92,
+ "text_loss": 0.83159339427948
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.4414440857058996,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.205078125,
+ "learning_rate": 0.000186,
+ "loss": 0.1073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 152766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.043313540518283844,
+ "skip_count": 0.0,
+ "step": 94,
+ "text_loss": 0.49707934260368347
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.4508365130613443,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.00019,
+ "loss": 0.0947,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 156112.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.032021280378103256,
+ "skip_count": 0.0,
+ "step": 96,
+ "text_loss": 0.27608928084373474
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.46022894041678897,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2099609375,
+ "learning_rate": 0.000194,
+ "loss": 0.0846,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 159454.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.24473154544830322,
+ "skip_count": 2.0,
+ "step": 98,
+ "text_loss": 0.6026689410209656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.4696213677722336,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.271484375,
+ "learning_rate": 0.00019800000000000002,
+ "loss": 0.1028,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 163661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.11468276381492615,
+ "skip_count": 2.0,
+ "step": 100,
+ "text_loss": 0.46733155846595764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.47901379512767833,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
+ "learning_rate": 0.000202,
+ "loss": 0.1089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 167134.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021144939586520195,
+ "skip_count": 0.0,
+ "step": 102,
+ "text_loss": 0.6362994909286499
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.488406222483123,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 0.000206,
+ "loss": 0.0621,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 170433.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06594710797071457,
+ "skip_count": 1.0,
+ "step": 104,
+ "text_loss": 0.4515477120876312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.49779864983856764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1591796875,
+ "learning_rate": 0.00021,
+ "loss": 0.0929,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 173387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.032923027873039246,
+ "skip_count": 0.0,
+ "step": 106,
+ "text_loss": 0.6638453006744385
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.5071910771940124,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.240234375,
+ "learning_rate": 0.000214,
+ "loss": 0.0883,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 176170.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08034781366586685,
+ "skip_count": 0.0,
+ "step": 108,
+ "text_loss": 1.186936855316162
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.516583504549457,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.267578125,
+ "learning_rate": 0.000218,
+ "loss": 0.0794,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 179877.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07814185321331024,
+ "skip_count": 1.0,
+ "step": 110,
+ "text_loss": 0.5488709211349487
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.5259759319049017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2353515625,
+ "learning_rate": 0.000222,
+ "loss": 0.0946,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 182726.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01884695515036583,
+ "skip_count": 0.0,
+ "step": 112,
+ "text_loss": 0.5195863842964172
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.5353683592603463,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.19921875,
+ "learning_rate": 0.00022600000000000002,
+ "loss": 0.0974,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 185624.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09657823294401169,
+ "skip_count": 2.0,
+ "step": 114,
+ "text_loss": 0.43858134746551514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.544760786615791,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.3046875,
+ "learning_rate": 0.00023,
+ "loss": 0.0753,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 188155.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01463601179420948,
+ "skip_count": 0.0,
+ "step": 116,
+ "text_loss": 0.392981618642807
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.5541532139712357,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.439453125,
+ "learning_rate": 0.00023400000000000002,
+ "loss": 0.0843,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 190970.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03859659656882286,
+ "skip_count": 0.0,
+ "step": 118,
+ "text_loss": 0.309179425239563
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.5635456413266804,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2255859375,
+ "learning_rate": 0.00023799999999999998,
+ "loss": 0.053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 193988.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019092386588454247,
+ "skip_count": 0.0,
+ "step": 120,
+ "text_loss": 0.48543134331703186
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.572938068682125,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.35546875,
+ "learning_rate": 0.000242,
+ "loss": 0.1203,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 196475.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0619138665497303,
+ "skip_count": 1.0,
+ "step": 122,
+ "text_loss": 0.4615364074707031
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.5823304960375697,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1875,
+ "learning_rate": 0.000246,
+ "loss": 0.1002,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 200045.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09752107411623001,
+ "skip_count": 0.0,
+ "step": 124,
+ "text_loss": 0.15802054107189178
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.5917229233930144,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 0.00025,
+ "loss": 0.0773,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 203214.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02896115928888321,
+ "skip_count": 0.0,
+ "step": 126,
+ "text_loss": 0.4543360471725464
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.601115350748459,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.4296875,
+ "learning_rate": 0.000254,
+ "loss": 0.0973,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 206168.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011423567309975624,
+ "skip_count": 0.0,
+ "step": 128,
+ "text_loss": 0.4730179011821747
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6105077781039038,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.365234375,
+ "learning_rate": 0.00025800000000000004,
+ "loss": 0.099,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 209907.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01957600563764572,
+ "skip_count": 0.0,
+ "step": 130,
+ "text_loss": 0.45122358202934265
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6199002054593484,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2060546875,
+ "learning_rate": 0.000262,
+ "loss": 0.0868,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 213521.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04882373288273811,
+ "skip_count": 1.0,
+ "step": 132,
+ "text_loss": 0.4341491758823395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6292926328147931,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1708984375,
+ "learning_rate": 0.000266,
+ "loss": 0.0834,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 216484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016083380207419395,
+ "skip_count": 0.0,
+ "step": 134,
+ "text_loss": 0.46990111470222473
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6386850601702377,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.220703125,
+ "learning_rate": 0.00027,
+ "loss": 0.0863,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 219398.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01733536459505558,
+ "skip_count": 0.0,
+ "step": 136,
+ "text_loss": 0.4455361068248749
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6480774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
+ "learning_rate": 0.00027400000000000005,
+ "loss": 0.0997,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 222430.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01332803163677454,
+ "skip_count": 0.0,
+ "step": 138,
+ "text_loss": 0.47699397802352905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.657469914881127,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.333984375,
+ "learning_rate": 0.00027800000000000004,
+ "loss": 0.0922,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 225458.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.14924728870391846,
+ "skip_count": 2.0,
+ "step": 140,
+ "text_loss": 0.5858222842216492
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6668623422365718,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25,
+ "learning_rate": 0.00028199999999999997,
+ "loss": 0.0798,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 229365.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.1860177218914032,
+ "skip_count": 2.0,
+ "step": 142,
+ "text_loss": 0.5003137588500977
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6762547695920165,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.00028599999999999996,
+ "loss": 0.054,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 231787.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.16498211026191711,
+ "skip_count": 1.0,
+ "step": 144,
+ "text_loss": 0.5026470422744751
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6856471969474611,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.306640625,
+ "learning_rate": 0.00029,
+ "loss": 0.0936,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 235014.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11801310628652573,
+ "skip_count": 1.0,
+ "step": 146,
+ "text_loss": 0.611888587474823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6950396243029058,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
+ "learning_rate": 0.000294,
+ "loss": 0.0878,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 238210.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02422776259481907,
+ "skip_count": 0.0,
+ "step": 148,
+ "text_loss": 0.2876914143562317
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7044320516583504,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 0.000298,
+ "loss": 0.0858,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 241582.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07282499223947525,
+ "skip_count": 2.0,
+ "step": 150,
+ "text_loss": 0.3919292390346527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7138244790137951,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.37890625,
+ "learning_rate": 0.000302,
+ "loss": 0.0797,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 244621.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.20659038424491882,
+ "skip_count": 1.0,
+ "step": 152,
+ "text_loss": 0.4294498860836029
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7232169063692399,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1787109375,
+ "learning_rate": 0.000306,
+ "loss": 0.072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 247833.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02428400330245495,
+ "skip_count": 0.0,
+ "step": 154,
+ "text_loss": 0.5930765867233276
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7326093337246845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1533203125,
+ "learning_rate": 0.00031,
+ "loss": 0.0772,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 251349.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0167869683355093,
+ "skip_count": 0.0,
+ "step": 156,
+ "text_loss": 0.41063904762268066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7420017610801292,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1572265625,
+ "learning_rate": 0.000314,
+ "loss": 0.0821,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 254886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02531604655086994,
+ "skip_count": 0.0,
+ "step": 158,
+ "text_loss": 0.6739020347595215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7513941884355738,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.201171875,
+ "learning_rate": 0.00031800000000000003,
+ "loss": 0.09,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 258260.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017772775143384933,
+ "skip_count": 0.0,
+ "step": 160,
+ "text_loss": 0.46873849630355835
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7607866157910185,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.224609375,
+ "learning_rate": 0.000322,
+ "loss": 0.0893,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 261846.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.034902360290288925,
+ "skip_count": 1.0,
+ "step": 162,
+ "text_loss": 0.3727971017360687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7701790431464631,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.212890625,
+ "learning_rate": 0.000326,
+ "loss": 0.076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 264348.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013553355820477009,
+ "skip_count": 0.0,
+ "step": 164,
+ "text_loss": 0.5798237323760986
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7795714705019078,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.408203125,
+ "learning_rate": 0.00033,
+ "loss": 0.0926,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 267479.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.13571743667125702,
+ "skip_count": 1.0,
+ "step": 166,
+ "text_loss": 0.8084776997566223
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7889638978573525,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 0.00033400000000000004,
+ "loss": 0.0817,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 270268.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.19884146749973297,
+ "skip_count": 0.0,
+ "step": 168,
+ "text_loss": 0.7366134524345398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7983563252127972,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.267578125,
+ "learning_rate": 0.00033800000000000003,
+ "loss": 0.1022,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 273518.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.15469175577163696,
+ "skip_count": 1.0,
+ "step": 170,
+ "text_loss": 0.27204006910324097
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8077487525682419,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.17578125,
+ "learning_rate": 0.000342,
+ "loss": 0.0865,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 277210.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08603330701589584,
+ "skip_count": 2.0,
+ "step": 172,
+ "text_loss": 0.7137667536735535
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8171411799236865,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.189453125,
+ "learning_rate": 0.000346,
+ "loss": 0.0902,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 280389.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.17851492762565613,
+ "skip_count": 4.0,
+ "step": 174,
+ "text_loss": 0.5148105621337891
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8265336072791312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1494140625,
+ "learning_rate": 0.00035,
+ "loss": 0.0853,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 283501.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021331604570150375,
+ "skip_count": 0.0,
+ "step": 176,
+ "text_loss": 0.301013320684433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8359260346345758,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2158203125,
+ "learning_rate": 0.000354,
+ "loss": 0.0911,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 287154.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.057273946702480316,
+ "skip_count": 2.0,
+ "step": 178,
+ "text_loss": 0.4740981459617615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8453184619900206,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.240234375,
+ "learning_rate": 0.000358,
+ "loss": 0.0904,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 289929.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04116598889231682,
+ "skip_count": 1.0,
+ "step": 180,
+ "text_loss": 0.4838573932647705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8547108893454652,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.14453125,
+ "learning_rate": 0.000362,
+ "loss": 0.0991,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 294293.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.027111956849694252,
+ "skip_count": 0.0,
+ "step": 182,
+ "text_loss": 0.7495553493499756
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8641033167009099,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.158203125,
+ "learning_rate": 0.000366,
+ "loss": 0.1038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 297730.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019166452810168266,
+ "skip_count": 0.0,
+ "step": 184,
+ "text_loss": 0.534831166267395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 0.8734957440563546,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2236328125,
+ "learning_rate": 0.00037,
+ "loss": 0.0784,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 300593.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.2349659502506256,
+ "skip_count": 2.0,
+ "step": 186,
+ "text_loss": 0.3549048602581024
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8828881714117992,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2041015625,
+ "learning_rate": 0.000374,
+ "loss": 0.0827,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 303456.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.22502389550209045,
+ "skip_count": 2.0,
+ "step": 188,
+ "text_loss": 0.8837642073631287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8922805987672439,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.271484375,
+ "learning_rate": 0.000378,
+ "loss": 0.1085,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 306241.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.12291611731052399,
+ "skip_count": 0.0,
+ "step": 190,
+ "text_loss": 0.73353511095047
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9016730261226886,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15625,
+ "learning_rate": 0.000382,
+ "loss": 0.0969,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 310606.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.055988848209381104,
+ "skip_count": 1.0,
+ "step": 192,
+ "text_loss": 0.6261917352676392
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9110654534781333,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.34375,
+ "learning_rate": 0.000386,
+ "loss": 0.1055,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 313564.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.12363404780626297,
+ "skip_count": 3.0,
+ "step": 194,
+ "text_loss": 0.2790874242782593
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9204578808335779,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.27734375,
+ "learning_rate": 0.00039000000000000005,
+ "loss": 0.0964,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 316958.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.2718356251716614,
+ "skip_count": 2.0,
+ "step": 196,
+ "text_loss": 0.14428086578845978
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9298503081890226,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2021484375,
+ "learning_rate": 0.00039400000000000004,
+ "loss": 0.0917,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 320103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07188102602958679,
+ "skip_count": 2.0,
+ "step": 198,
+ "text_loss": 0.27155816555023193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9392427355444672,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.201171875,
+ "learning_rate": 0.000398,
+ "loss": 0.0809,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 323566.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.18038256466388702,
+ "skip_count": 1.0,
+ "step": 200,
+ "text_loss": 0.8453494310379028
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9486351628999119,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.000402,
+ "loss": 0.0801,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 326385.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014639763161540031,
+ "skip_count": 0.0,
+ "step": 202,
+ "text_loss": 0.5733131766319275
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9580275902553567,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.21875,
+ "learning_rate": 0.00040600000000000006,
+ "loss": 0.104,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 329266.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015269627794623375,
+ "skip_count": 0.0,
+ "step": 204,
+ "text_loss": 0.7355639934539795
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9674200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.27734375,
+ "learning_rate": 0.00041,
+ "loss": 0.0833,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 332984.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018046971410512924,
+ "skip_count": 0.0,
+ "step": 206,
+ "text_loss": 0.587641179561615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.976812444966246,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.185546875,
+ "learning_rate": 0.000414,
+ "loss": 0.0588,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 335739.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.12791286408901215,
+ "skip_count": 0.0,
+ "step": 208,
+ "text_loss": 0.6538406610488892
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9862048723216906,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.24609375,
+ "learning_rate": 0.00041799999999999997,
+ "loss": 0.0732,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 338966.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.050490595400333405,
+ "skip_count": 1.0,
+ "step": 210,
+ "text_loss": 0.4188295602798462
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9955972996771353,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.271484375,
+ "learning_rate": 0.000422,
+ "loss": 0.0588,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 342063.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.11652113497257233,
+ "skip_count": 3.0,
+ "step": 212,
+ "text_loss": 0.21822240948677063
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.0046962136777224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2060546875,
+ "learning_rate": 0.000426,
+ "loss": 0.0621,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 344887.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023898238316178322,
+ "skip_count": 0.0,
+ "step": 214,
+ "text_loss": 0.24692800641059875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.014088641033167,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.3671875,
+ "learning_rate": 0.00043,
+ "loss": 0.1005,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 348700.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06414655596017838,
+ "skip_count": 0.0,
+ "step": 216,
+ "text_loss": 0.4744548797607422
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.0234810683886117,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1962890625,
+ "learning_rate": 0.00043400000000000003,
+ "loss": 0.0753,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 351507.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11702914535999298,
+ "skip_count": 1.0,
+ "step": 218,
+ "text_loss": 0.5614864826202393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.0328734957440564,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.189453125,
+ "learning_rate": 0.000438,
+ "loss": 0.0792,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 354484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014991643838584423,
+ "skip_count": 0.0,
+ "step": 220,
+ "text_loss": 0.47209832072257996
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.042265923099501,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.251953125,
+ "learning_rate": 0.000442,
+ "loss": 0.106,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 357954.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04747112840414047,
+ "skip_count": 1.0,
+ "step": 222,
+ "text_loss": 0.2968728244304657
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.0516583504549457,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.40234375,
+ "learning_rate": 0.000446,
+ "loss": 0.0853,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 360547.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06754162162542343,
+ "skip_count": 2.0,
+ "step": 224,
+ "text_loss": 0.2364148646593094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.0610507778103904,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2412109375,
+ "learning_rate": 0.00045000000000000004,
+ "loss": 0.1016,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 364529.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07830183953046799,
+ "skip_count": 1.0,
+ "step": 226,
+ "text_loss": 0.4787476360797882
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.070443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1953125,
+ "learning_rate": 0.00045400000000000003,
+ "loss": 0.0792,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 367683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015735948458313942,
+ "skip_count": 0.0,
+ "step": 228,
+ "text_loss": 0.37148505449295044
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.0798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25,
+ "learning_rate": 0.000458,
+ "loss": 0.0995,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 371402.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013354359194636345,
+ "skip_count": 0.0,
+ "step": 230,
+ "text_loss": 0.7464763522148132
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.0892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1494140625,
+ "learning_rate": 0.000462,
+ "loss": 0.0731,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 374587.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013763721100986004,
+ "skip_count": 0.0,
+ "step": 232,
+ "text_loss": 0.8754443526268005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.098620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.3984375,
+ "learning_rate": 0.00046600000000000005,
+ "loss": 0.0861,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 377513.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010075435042381287,
+ "skip_count": 0.0,
+ "step": 234,
+ "text_loss": 0.31534913182258606
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1080129145876136,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.17578125,
+ "learning_rate": 0.00047,
+ "loss": 0.0791,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 380736.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.059825167059898376,
+ "skip_count": 1.0,
+ "step": 236,
+ "text_loss": 0.5936337113380432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1174053419430585,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.267578125,
+ "learning_rate": 0.000474,
+ "loss": 0.0514,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 383236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09134846180677414,
+ "skip_count": 2.0,
+ "step": 238,
+ "text_loss": 0.5976157784461975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1267977692985032,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.208984375,
+ "learning_rate": 0.00047799999999999996,
+ "loss": 0.0858,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 385778.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11989791691303253,
+ "skip_count": 1.0,
+ "step": 240,
+ "text_loss": 0.3554210960865021
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1361901966539478,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.171875,
+ "learning_rate": 0.000482,
+ "loss": 0.0734,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 388777.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013591105118393898,
+ "skip_count": 0.0,
+ "step": 242,
+ "text_loss": 0.4829460382461548
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1455826240093925,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12060546875,
+ "learning_rate": 0.000486,
+ "loss": 0.0625,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 391797.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0920003354549408,
+ "skip_count": 2.0,
+ "step": 244,
+ "text_loss": 0.3085818886756897
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1549750513648371,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.00049,
+ "loss": 0.0501,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 396485.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0129330949857831,
+ "skip_count": 0.0,
+ "step": 246,
+ "text_loss": 0.42803969979286194
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1643674787202818,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.296875,
+ "learning_rate": 0.000494,
+ "loss": 0.0945,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 399923.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.10677755624055862,
+ "skip_count": 3.0,
+ "step": 248,
+ "text_loss": 0.2908555567264557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1737599060757264,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.203125,
+ "learning_rate": 0.000498,
+ "loss": 0.0812,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 403647.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.1504337340593338,
+ "skip_count": 3.0,
+ "step": 250,
+ "text_loss": 0.333095908164978
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.183152333431171,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.22265625,
+ "learning_rate": 0.0005020000000000001,
+ "loss": 0.0828,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 409147.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06503184884786606,
+ "skip_count": 2.0,
+ "step": 252,
+ "text_loss": 0.16117942333221436
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1925447607866158,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.287109375,
+ "learning_rate": 0.000506,
+ "loss": 0.0995,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 412072.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016280122101306915,
+ "skip_count": 0.0,
+ "step": 254,
+ "text_loss": 0.4217492640018463
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2019371881420604,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.21484375,
+ "learning_rate": 0.00051,
+ "loss": 0.0803,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 415052.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.2117508500814438,
+ "skip_count": 1.0,
+ "step": 256,
+ "text_loss": 0.5795308947563171
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.211329615497505,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2421875,
+ "learning_rate": 0.000514,
+ "loss": 0.0668,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 418099.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.15002092719078064,
+ "skip_count": 0.0,
+ "step": 258,
+ "text_loss": 0.4840938448905945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2207220428529497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1533203125,
+ "learning_rate": 0.000518,
+ "loss": 0.0538,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 422526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012834074907004833,
+ "skip_count": 0.0,
+ "step": 260,
+ "text_loss": 0.36141225695610046
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2301144702083944,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.000522,
+ "loss": 0.085,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 425765.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.23808011412620544,
+ "skip_count": 2.0,
+ "step": 262,
+ "text_loss": 0.27572691440582275
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2395068975638392,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.17578125,
+ "learning_rate": 0.000526,
+ "loss": 0.0708,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 429048.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.055687375366687775,
+ "skip_count": 1.0,
+ "step": 264,
+ "text_loss": 0.37020301818847656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.248899324919284,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2080078125,
+ "learning_rate": 0.0005300000000000001,
+ "loss": 0.0839,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 431784.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0872957780957222,
+ "skip_count": 1.0,
+ "step": 266,
+ "text_loss": 0.5937283039093018
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2582917522747286,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.0005340000000000001,
+ "loss": 0.0733,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 434297.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.23507654666900635,
+ "skip_count": 0.0,
+ "step": 268,
+ "text_loss": 0.3367372453212738
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2676841796301732,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 0.0005380000000000001,
+ "loss": 0.0708,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 437586.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.12860390543937683,
+ "skip_count": 2.0,
+ "step": 270,
+ "text_loss": 0.7149854302406311
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2770766069856179,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2451171875,
+ "learning_rate": 0.0005420000000000001,
+ "loss": 0.1072,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 440649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.044308312237262726,
+ "skip_count": 1.0,
+ "step": 272,
+ "text_loss": 0.26778292655944824
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2864690343410625,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.44921875,
+ "learning_rate": 0.000546,
+ "loss": 0.0938,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 443907.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.11514109373092651,
+ "skip_count": 3.0,
+ "step": 274,
+ "text_loss": 0.23578761518001556
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 1.2958614616965072,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2578125,
+ "learning_rate": 0.00055,
+ "loss": 0.0932,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 447147.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.055705297738313675,
+ "skip_count": 2.0,
+ "step": 276,
+ "text_loss": 0.2513524889945984
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3052538890519518,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.29296875,
+ "learning_rate": 0.000554,
+ "loss": 0.0667,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 450032.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.13778971135616302,
+ "skip_count": 2.0,
+ "step": 278,
+ "text_loss": 0.4857243597507477
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3146463164073965,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.185546875,
+ "learning_rate": 0.000558,
+ "loss": 0.0672,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 453195.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0700262188911438,
+ "skip_count": 0.0,
+ "step": 280,
+ "text_loss": 0.7589789628982544
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3240387437628411,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25,
+ "learning_rate": 0.0005620000000000001,
+ "loss": 0.0603,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 455942.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11706235259771347,
+ "skip_count": 2.0,
+ "step": 282,
+ "text_loss": 0.4783432185649872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3334311711182858,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.000566,
+ "loss": 0.0793,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 458932.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07073967158794403,
+ "skip_count": 1.0,
+ "step": 284,
+ "text_loss": 0.7117193937301636
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3428235984737307,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1650390625,
+ "learning_rate": 0.00057,
+ "loss": 0.0915,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 462650.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05301115661859512,
+ "skip_count": 1.0,
+ "step": 286,
+ "text_loss": 0.4175460636615753
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.352216025829175,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2158203125,
+ "learning_rate": 0.000574,
+ "loss": 0.0675,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 466290.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06356479972600937,
+ "skip_count": 1.0,
+ "step": 288,
+ "text_loss": 0.5832946300506592
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 1.36160845318462,
+ "f1_execute": 0.9019607901573181,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.28515625,
+ "learning_rate": 0.000578,
+ "loss": 0.0805,
+ "macro_f1": 0.3006536066532135,
+ "num_tokens": 469296.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.21032999455928802,
+ "skip_count": 3.0,
+ "step": 290,
+ "text_loss": 0.36023473739624023
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3710008805400646,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.27734375,
+ "learning_rate": 0.0005819999999999999,
+ "loss": 0.0685,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 472272.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08062280714511871,
+ "skip_count": 1.0,
+ "step": 292,
+ "text_loss": 0.37197956442832947
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3803933078955093,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.28125,
+ "learning_rate": 0.0005859999999999999,
+ "loss": 0.0878,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 475864.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05023600533604622,
+ "skip_count": 2.0,
+ "step": 294,
+ "text_loss": 0.4765273630619049
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.389785735250954,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2177734375,
+ "learning_rate": 0.00059,
+ "loss": 0.0728,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 478916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011689410544931889,
+ "skip_count": 0.0,
+ "step": 296,
+ "text_loss": 0.5878773927688599
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3991781626063986,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15625,
+ "learning_rate": 0.000594,
+ "loss": 0.0727,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 482369.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010772093199193478,
+ "skip_count": 0.0,
+ "step": 298,
+ "text_loss": 0.4424116313457489
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4085705899618433,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.181640625,
+ "learning_rate": 0.000598,
+ "loss": 0.0787,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 486049.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.23482851684093475,
+ "skip_count": 2.0,
+ "step": 300,
+ "text_loss": 0.21217775344848633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.417963017317288,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2080078125,
+ "learning_rate": 0.000602,
+ "loss": 0.073,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 488683.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.18843084573745728,
+ "skip_count": 3.0,
+ "step": 302,
+ "text_loss": 0.2109498232603073
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4273554446727326,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.279296875,
+ "learning_rate": 0.000606,
+ "loss": 0.0945,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 492010.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.17861786484718323,
+ "skip_count": 3.0,
+ "step": 304,
+ "text_loss": 0.8446305394172668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4367478720281772,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 0.00061,
+ "loss": 0.0827,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 494764.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014124520123004913,
+ "skip_count": 0.0,
+ "step": 306,
+ "text_loss": 0.742735743522644
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4461402993836219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.26953125,
+ "learning_rate": 0.000614,
+ "loss": 0.1071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 497820.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017968112602829933,
+ "skip_count": 0.0,
+ "step": 308,
+ "text_loss": 0.28305482864379883
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4555327267390665,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1689453125,
+ "learning_rate": 0.0006180000000000001,
+ "loss": 0.0775,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 500694.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08593655377626419,
+ "skip_count": 2.0,
+ "step": 310,
+ "text_loss": 0.3496848940849304
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4649251540945114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.19140625,
+ "learning_rate": 0.000622,
+ "loss": 0.061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 503871.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016449492424726486,
+ "skip_count": 0.0,
+ "step": 312,
+ "text_loss": 0.6691372990608215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4743175814499558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.205078125,
+ "learning_rate": 0.000626,
+ "loss": 0.0815,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 506730.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014532964676618576,
+ "skip_count": 0.0,
+ "step": 314,
+ "text_loss": 0.6118118166923523
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4837100088054007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2216796875,
+ "learning_rate": 0.00063,
+ "loss": 0.0742,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 510323.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013093139044940472,
+ "skip_count": 0.0,
+ "step": 316,
+ "text_loss": 0.38126271963119507
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4931024361608454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.400390625,
+ "learning_rate": 0.000634,
+ "loss": 0.0915,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 514075.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008627045899629593,
+ "skip_count": 0.0,
+ "step": 318,
+ "text_loss": 0.5983037948608398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.50249486351629,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15234375,
+ "learning_rate": 0.000638,
+ "loss": 0.1008,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 517418.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04561378434300423,
+ "skip_count": 1.0,
+ "step": 320,
+ "text_loss": 0.767257034778595
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 1.5118872908717347,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.259765625,
+ "learning_rate": 0.000642,
+ "loss": 0.0926,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 520443.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.024372953921556473,
+ "skip_count": 0.0,
+ "step": 322,
+ "text_loss": 0.6572105884552002
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.5212797182271793,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.30078125,
+ "learning_rate": 0.000646,
+ "loss": 0.0822,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 523317.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08099937438964844,
+ "skip_count": 0.0,
+ "step": 324,
+ "text_loss": 0.205499529838562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 1.530672145582624,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.0006500000000000001,
+ "loss": 0.0809,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 526355.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0657225176692009,
+ "skip_count": 1.0,
+ "step": 326,
+ "text_loss": 0.2587239742279053
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.5400645729380686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.111328125,
+ "learning_rate": 0.0006540000000000001,
+ "loss": 0.0779,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 529689.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01849208027124405,
+ "skip_count": 0.0,
+ "step": 328,
+ "text_loss": 0.2172023057937622
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.5494570002935135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1845703125,
+ "learning_rate": 0.0006580000000000001,
+ "loss": 0.0758,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 532603.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016184113919734955,
+ "skip_count": 0.0,
+ "step": 330,
+ "text_loss": 0.5980568528175354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.558849427648958,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.220703125,
+ "learning_rate": 0.000662,
+ "loss": 0.0439,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 536056.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01303898449987173,
+ "skip_count": 0.0,
+ "step": 332,
+ "text_loss": 0.5421966314315796
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 1.5682418550044028,
+ "f1_execute": 0.8979591727256775,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.296875,
+ "learning_rate": 0.000666,
+ "loss": 0.0963,
+ "macro_f1": 0.465986430644989,
+ "num_tokens": 539231.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.3075675964355469,
+ "skip_count": 3.0,
+ "step": 334,
+ "text_loss": 0.19719554483890533
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.5776342823598473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.173828125,
+ "learning_rate": 0.00067,
+ "loss": 0.0706,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 542038.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009116224013268948,
+ "skip_count": 0.0,
+ "step": 336,
+ "text_loss": 0.3407036066055298
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.5870267097152921,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2421875,
+ "learning_rate": 0.000674,
+ "loss": 0.0768,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 545019.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021463042125105858,
+ "skip_count": 0.0,
+ "step": 338,
+ "text_loss": 0.24486012756824493
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.5964191370707366,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1708984375,
+ "learning_rate": 0.0006780000000000001,
+ "loss": 0.0889,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 548036.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01857556402683258,
+ "skip_count": 0.0,
+ "step": 340,
+ "text_loss": 0.28140124678611755
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.6058115644261814,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.130859375,
+ "learning_rate": 0.0006820000000000001,
+ "loss": 0.0617,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 551419.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.27090007066726685,
+ "skip_count": 3.0,
+ "step": 342,
+ "text_loss": 0.20690307021141052
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.615203991781626,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.3046875,
+ "learning_rate": 0.0006860000000000001,
+ "loss": 0.1047,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 554037.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09231195598840714,
+ "skip_count": 2.0,
+ "step": 344,
+ "text_loss": 0.4479128420352936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.6245964191370708,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.255859375,
+ "learning_rate": 0.00069,
+ "loss": 0.0883,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 556672.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00935924518853426,
+ "skip_count": 0.0,
+ "step": 346,
+ "text_loss": 0.6377320289611816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.6339888464925154,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.000694,
+ "loss": 0.0781,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 559756.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.17641772329807281,
+ "skip_count": 2.0,
+ "step": 348,
+ "text_loss": 0.6097636222839355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 1.64338127384796,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.30078125,
+ "learning_rate": 0.0006979999999999999,
+ "loss": 0.0616,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 563415.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06240406632423401,
+ "skip_count": 2.0,
+ "step": 350,
+ "text_loss": 0.5291631817817688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.6527737012034047,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.296875,
+ "learning_rate": 0.0007019999999999999,
+ "loss": 0.1026,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 566357.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012269247323274612,
+ "skip_count": 0.0,
+ "step": 352,
+ "text_loss": 0.5170195698738098
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.6621661285588494,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1435546875,
+ "learning_rate": 0.0007059999999999999,
+ "loss": 0.0815,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 569449.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07515309751033783,
+ "skip_count": 2.0,
+ "step": 354,
+ "text_loss": 0.34507250785827637
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.6715585559142943,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.00071,
+ "loss": 0.0791,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 572761.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.20768006145954132,
+ "skip_count": 2.0,
+ "step": 356,
+ "text_loss": 0.3158532381057739
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.6809509832697387,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1884765625,
+ "learning_rate": 0.000714,
+ "loss": 0.0682,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 575909.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025329967960715294,
+ "skip_count": 0.0,
+ "step": 358,
+ "text_loss": 0.21455390751361847
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 1.6903434106251836,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.21484375,
+ "learning_rate": 0.000718,
+ "loss": 0.0775,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 579186.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.07676175981760025,
+ "skip_count": 0.0,
+ "step": 360,
+ "text_loss": 0.61895352602005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 1.699735837980628,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.197265625,
+ "learning_rate": 0.000722,
+ "loss": 0.0781,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 582437.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08070661872625351,
+ "skip_count": 1.0,
+ "step": 362,
+ "text_loss": 0.20557661354541779
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.7091282653360729,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2216796875,
+ "learning_rate": 0.000726,
+ "loss": 0.11,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 586096.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015891313552856445,
+ "skip_count": 0.0,
+ "step": 364,
+ "text_loss": 0.597991943359375
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.7185206926915173,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15625,
+ "learning_rate": 0.00073,
+ "loss": 0.0573,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 589520.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.12844261527061462,
+ "skip_count": 3.0,
+ "step": 366,
+ "text_loss": 0.2944789230823517
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.7279131200469622,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.150390625,
+ "learning_rate": 0.000734,
+ "loss": 0.1005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 592691.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02382199838757515,
+ "skip_count": 0.0,
+ "step": 368,
+ "text_loss": 0.23989969491958618
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.7373055474024068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1796875,
+ "learning_rate": 0.000738,
+ "loss": 0.0661,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 596004.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018812084570527077,
+ "skip_count": 0.0,
+ "step": 370,
+ "text_loss": 0.22111408412456512
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.7466979747578515,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2412109375,
+ "learning_rate": 0.000742,
+ "loss": 0.0666,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 599087.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08290331065654755,
+ "skip_count": 1.0,
+ "step": 372,
+ "text_loss": 0.2567356526851654
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.7560904021132961,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2412109375,
+ "learning_rate": 0.000746,
+ "loss": 0.0941,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 602330.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11482042074203491,
+ "skip_count": 1.0,
+ "step": 374,
+ "text_loss": 0.7217292785644531
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.7654828294687408,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2265625,
+ "learning_rate": 0.00075,
+ "loss": 0.0728,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 605503.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11849870532751083,
+ "skip_count": 0.0,
+ "step": 376,
+ "text_loss": 0.5122153759002686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 1.7748752568241855,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2333984375,
+ "learning_rate": 0.000754,
+ "loss": 0.0835,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 608505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07090992480516434,
+ "skip_count": 1.0,
+ "step": 378,
+ "text_loss": 0.2204965502023697
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.78426768417963,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1826171875,
+ "learning_rate": 0.000758,
+ "loss": 0.0794,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 611193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03812089189887047,
+ "skip_count": 1.0,
+ "step": 380,
+ "text_loss": 0.44909021258354187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.793660111535075,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1689453125,
+ "learning_rate": 0.000762,
+ "loss": 0.0882,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 614231.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.10270529240369797,
+ "skip_count": 0.0,
+ "step": 382,
+ "text_loss": 0.13624964654445648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8030525388905194,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.330078125,
+ "learning_rate": 0.0007660000000000001,
+ "loss": 0.1107,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 617090.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11624004691839218,
+ "skip_count": 1.0,
+ "step": 384,
+ "text_loss": 0.7314052581787109
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8124449662459643,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1396484375,
+ "learning_rate": 0.0007700000000000001,
+ "loss": 0.0628,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 620596.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07114322483539581,
+ "skip_count": 2.0,
+ "step": 386,
+ "text_loss": 0.503322958946228
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8218373936014087,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.306640625,
+ "learning_rate": 0.0007740000000000001,
+ "loss": 0.0829,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 624108.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06061873584985733,
+ "skip_count": 2.0,
+ "step": 388,
+ "text_loss": 0.11481904983520508
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8312298209568536,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2099609375,
+ "learning_rate": 0.000778,
+ "loss": 0.0791,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 626895.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.2921771705150604,
+ "skip_count": 4.0,
+ "step": 390,
+ "text_loss": 0.3069624602794647
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8406222483122983,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.30859375,
+ "learning_rate": 0.000782,
+ "loss": 0.0605,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 630204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.202707901597023,
+ "skip_count": 4.0,
+ "step": 392,
+ "text_loss": 0.6022785305976868
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.850014675667743,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.29296875,
+ "learning_rate": 0.000786,
+ "loss": 0.0877,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 634373.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0221510399132967,
+ "skip_count": 0.0,
+ "step": 394,
+ "text_loss": 0.26787394285202026
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8594071030231876,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.37890625,
+ "learning_rate": 0.00079,
+ "loss": 0.0805,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 637442.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.12636390328407288,
+ "skip_count": 0.0,
+ "step": 396,
+ "text_loss": 0.2799781560897827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8687995303786322,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2080078125,
+ "learning_rate": 0.0007940000000000001,
+ "loss": 0.0724,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 641231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07933453470468521,
+ "skip_count": 2.0,
+ "step": 398,
+ "text_loss": 0.2507784366607666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8781919577340769,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.0007980000000000001,
+ "loss": 0.0909,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 644560.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.10324911028146744,
+ "skip_count": 0.0,
+ "step": 400,
+ "text_loss": 0.7756280303001404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8875843850895215,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2275390625,
+ "learning_rate": 0.0008020000000000001,
+ "loss": 0.0783,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 647393.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.18546262383460999,
+ "skip_count": 2.0,
+ "step": 402,
+ "text_loss": 0.5013328194618225
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8969768124449664,
+ "f1_execute": 0.8571428656578064,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.0008060000000000001,
+ "loss": 0.0787,
+ "macro_f1": 0.2857142984867096,
+ "num_tokens": 650355.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.3280293643474579,
+ "skip_count": 4.0,
+ "step": 404,
+ "text_loss": 0.2842077314853668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.9063692398004108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.0008100000000000001,
+ "loss": 0.0901,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 654280.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02623247355222702,
+ "skip_count": 0.0,
+ "step": 406,
+ "text_loss": 0.46742817759513855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.9157616671558557,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.216796875,
+ "learning_rate": 0.0008139999999999999,
+ "loss": 0.0945,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 657568.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009744114242494106,
+ "skip_count": 0.0,
+ "step": 408,
+ "text_loss": 0.7168047428131104
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.9251540945113002,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2158203125,
+ "learning_rate": 0.0008179999999999999,
+ "loss": 0.1065,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 660593.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07591600716114044,
+ "skip_count": 2.0,
+ "step": 410,
+ "text_loss": 0.449823260307312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.934546521866745,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1396484375,
+ "learning_rate": 0.0008219999999999999,
+ "loss": 0.0795,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 663916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02076602540910244,
+ "skip_count": 0.0,
+ "step": 412,
+ "text_loss": 0.4764713943004608
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.9439389492221895,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1650390625,
+ "learning_rate": 0.000826,
+ "loss": 0.0836,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 667502.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.049170155078172684,
+ "skip_count": 1.0,
+ "step": 414,
+ "text_loss": 0.30333325266838074
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.9533313765776343,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1513671875,
+ "learning_rate": 0.00083,
+ "loss": 0.1021,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 670510.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.15554003417491913,
+ "skip_count": 0.0,
+ "step": 416,
+ "text_loss": 0.3691870868206024
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.962723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.000834,
+ "loss": 0.1013,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 674761.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.024516675621271133,
+ "skip_count": 0.0,
+ "step": 418,
+ "text_loss": 0.32850381731987
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.9721162312885236,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10888671875,
+ "learning_rate": 0.000838,
+ "loss": 0.0649,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 678055.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011026890948414803,
+ "skip_count": 0.0,
+ "step": 420,
+ "text_loss": 0.6637290716171265
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.9815086586439683,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.000842,
+ "loss": 0.0771,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 680979.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07451887428760529,
+ "skip_count": 1.0,
+ "step": 422,
+ "text_loss": 0.27131685614585876
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.990901085999413,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1318359375,
+ "learning_rate": 0.000846,
+ "loss": 0.0714,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 684144.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11341800540685654,
+ "skip_count": 1.0,
+ "step": 424,
+ "text_loss": 0.652126669883728
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.0,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2158203125,
+ "learning_rate": 0.00085,
+ "loss": 0.0754,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 687004.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08985847979784012,
+ "skip_count": 0.0,
+ "step": 426,
+ "text_loss": 0.2589428424835205
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.009392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.23828125,
+ "learning_rate": 0.000854,
+ "loss": 0.0866,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 689702.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011355436407029629,
+ "skip_count": 0.0,
+ "step": 428,
+ "text_loss": 0.8909716010093689
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.0187848547108893,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1435546875,
+ "learning_rate": 0.000858,
+ "loss": 0.0623,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 692698.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013788948766887188,
+ "skip_count": 0.0,
+ "step": 430,
+ "text_loss": 0.19141142070293427
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.028177282066334,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.000862,
+ "loss": 0.0499,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 696007.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07998392730951309,
+ "skip_count": 2.0,
+ "step": 432,
+ "text_loss": 0.1611809879541397
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.0375697094217786,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.173828125,
+ "learning_rate": 0.000866,
+ "loss": 0.0541,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 700271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06988382339477539,
+ "skip_count": 2.0,
+ "step": 434,
+ "text_loss": 0.37254223227500916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.0469621367772235,
+ "f1_execute": 0.8333333730697632,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 0.00087,
+ "loss": 0.0834,
+ "macro_f1": 0.2777777910232544,
+ "num_tokens": 703519.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.28240787982940674,
+ "skip_count": 5.0,
+ "step": 436,
+ "text_loss": 0.29636648297309875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.056354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.423828125,
+ "learning_rate": 0.000874,
+ "loss": 0.0657,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 706826.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013924967497587204,
+ "skip_count": 0.0,
+ "step": 438,
+ "text_loss": 0.20867908000946045
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.065746991488113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2353515625,
+ "learning_rate": 0.000878,
+ "loss": 0.0657,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 710530.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01170142088085413,
+ "skip_count": 0.0,
+ "step": 440,
+ "text_loss": 0.7273373007774353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.0751394188435572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.171875,
+ "learning_rate": 0.000882,
+ "loss": 0.076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 713503.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011930872686207294,
+ "skip_count": 0.0,
+ "step": 442,
+ "text_loss": 0.39314430952072144
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.084531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.0008860000000000001,
+ "loss": 0.0592,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 716582.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008630385622382164,
+ "skip_count": 0.0,
+ "step": 444,
+ "text_loss": 0.5925271511077881
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.0939242735544465,
+ "f1_execute": 0.9019607901573181,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.23046875,
+ "learning_rate": 0.0008900000000000001,
+ "loss": 0.0811,
+ "macro_f1": 0.3006536066532135,
+ "num_tokens": 719941.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.3015584945678711,
+ "skip_count": 1.0,
+ "step": 446,
+ "text_loss": 0.5059905052185059
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.1033167009098914,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.203125,
+ "learning_rate": 0.000894,
+ "loss": 0.0822,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 723113.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.10897493362426758,
+ "skip_count": 1.0,
+ "step": 448,
+ "text_loss": 0.19616436958312988
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.112709128265336,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.33984375,
+ "learning_rate": 0.000898,
+ "loss": 0.0782,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 726193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07236456125974655,
+ "skip_count": 2.0,
+ "step": 450,
+ "text_loss": 0.1773054152727127
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.1221015556207807,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.3203125,
+ "learning_rate": 0.000902,
+ "loss": 0.058,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 729275.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08184371143579483,
+ "skip_count": 0.0,
+ "step": 452,
+ "text_loss": 0.4927310049533844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.1314939829762256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1953125,
+ "learning_rate": 0.000906,
+ "loss": 0.0607,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 731948.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014033539220690727,
+ "skip_count": 0.0,
+ "step": 454,
+ "text_loss": 0.4745742678642273
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.00091,
+ "loss": 0.0651,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 735351.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0071774693205952644,
+ "skip_count": 0.0,
+ "step": 456,
+ "text_loss": 0.18523462116718292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 2.150278837687115,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.400390625,
+ "learning_rate": 0.0009140000000000001,
+ "loss": 0.0738,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 738587.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07781517505645752,
+ "skip_count": 2.0,
+ "step": 458,
+ "text_loss": 0.3459635376930237
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 26.0,
+ "epoch": 2.1596712650425594,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.28125,
+ "learning_rate": 0.0009180000000000001,
+ "loss": 0.0723,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 741779.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09529037028551102,
+ "skip_count": 2.0,
+ "step": 460,
+ "text_loss": 0.20197433233261108
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.1690636923980042,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1865234375,
+ "learning_rate": 0.0009220000000000001,
+ "loss": 0.0519,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 745355.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009765669703483582,
+ "skip_count": 0.0,
+ "step": 462,
+ "text_loss": 0.7031404376029968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.1784561197534487,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1298828125,
+ "learning_rate": 0.0009260000000000001,
+ "loss": 0.0527,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 748628.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03344850242137909,
+ "skip_count": 1.0,
+ "step": 464,
+ "text_loss": 0.21274663507938385
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.1878485471088935,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.173828125,
+ "learning_rate": 0.00093,
+ "loss": 0.0534,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 751472.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.1354292333126068,
+ "skip_count": 2.0,
+ "step": 466,
+ "text_loss": 0.5350717306137085
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.197240974464338,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.142578125,
+ "learning_rate": 0.000934,
+ "loss": 0.0598,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 754479.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.056420840322971344,
+ "skip_count": 1.0,
+ "step": 468,
+ "text_loss": 0.28153330087661743
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.206633401819783,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.234375,
+ "learning_rate": 0.0009379999999999999,
+ "loss": 0.0597,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 757872.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.1622387170791626,
+ "skip_count": 1.0,
+ "step": 470,
+ "text_loss": 0.22956843674182892
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.2160258291752273,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.5,
+ "learning_rate": 0.000942,
+ "loss": 0.0953,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 760468.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05146972835063934,
+ "skip_count": 2.0,
+ "step": 472,
+ "text_loss": 0.4513966739177704
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.225418256530672,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.212890625,
+ "learning_rate": 0.000946,
+ "loss": 0.0592,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 763519.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09022669494152069,
+ "skip_count": 0.0,
+ "step": 474,
+ "text_loss": 0.25758957862854004
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.234810683886117,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1259765625,
+ "learning_rate": 0.00095,
+ "loss": 0.0498,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 767391.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03044828027486801,
+ "skip_count": 1.0,
+ "step": 476,
+ "text_loss": 0.21366681158542633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.2442031112415615,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.291015625,
+ "learning_rate": 0.000954,
+ "loss": 0.0802,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 770338.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.10397060960531235,
+ "skip_count": 1.0,
+ "step": 478,
+ "text_loss": 1.0396177768707275
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.2535955385970063,
+ "f1_execute": 0.8571429252624512,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.267578125,
+ "learning_rate": 0.000958,
+ "loss": 0.1099,
+ "macro_f1": 0.285714328289032,
+ "num_tokens": 773699.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.22604143619537354,
+ "skip_count": 4.0,
+ "step": 480,
+ "text_loss": 0.2570283114910126
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.2629879659524508,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.146484375,
+ "learning_rate": 0.000962,
+ "loss": 0.0667,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 777473.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.048258859664201736,
+ "skip_count": 1.0,
+ "step": 482,
+ "text_loss": 0.2540103495121002
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.2723803933078957,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.197265625,
+ "learning_rate": 0.000966,
+ "loss": 0.0592,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 780833.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023018671199679375,
+ "skip_count": 0.0,
+ "step": 484,
+ "text_loss": 0.38524550199508667
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.28177282066334,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.314453125,
+ "learning_rate": 0.0009699999999999999,
+ "loss": 0.0709,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 783656.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.044845327734947205,
+ "skip_count": 1.0,
+ "step": 486,
+ "text_loss": 0.5859048366546631
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.000974,
+ "loss": 0.0615,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 787173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010898692533373833,
+ "skip_count": 0.0,
+ "step": 488,
+ "text_loss": 0.3456067442893982
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.3005576753742294,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.000978,
+ "loss": 0.0796,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 790395.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06497956812381744,
+ "skip_count": 2.0,
+ "step": 490,
+ "text_loss": 0.3751123249530792
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.3099501027296743,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2158203125,
+ "learning_rate": 0.000982,
+ "loss": 0.0772,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 793137.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07763728499412537,
+ "skip_count": 1.0,
+ "step": 492,
+ "text_loss": 0.43296709656715393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.3193425300851187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1416015625,
+ "learning_rate": 0.0009860000000000001,
+ "loss": 0.0819,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 796497.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02127906307578087,
+ "skip_count": 0.0,
+ "step": 494,
+ "text_loss": 0.4841311275959015
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.3287349574405636,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.00099,
+ "loss": 0.073,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 799361.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09518691152334213,
+ "skip_count": 0.0,
+ "step": 496,
+ "text_loss": 0.5094487071037292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 2.3381273847960085,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.130859375,
+ "learning_rate": 0.000994,
+ "loss": 0.0789,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 802629.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0563947930932045,
+ "skip_count": 2.0,
+ "step": 498,
+ "text_loss": 0.42783617973327637
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.347519812151453,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1865234375,
+ "learning_rate": 0.000998,
+ "loss": 0.0476,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 805881.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.10570426285266876,
+ "skip_count": 0.0,
+ "step": 500,
+ "text_loss": 0.28395503759384155
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 2.3569122395068973,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2275390625,
+ "learning_rate": 0.0009999999760498814,
+ "loss": 0.0849,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 809283.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.031202208250761032,
+ "skip_count": 2.0,
+ "step": 502,
+ "text_loss": 0.32970911264419556
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.366304666862342,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1455078125,
+ "learning_rate": 0.0009999997844489475,
+ "loss": 0.0574,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 812440.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07647835463285446,
+ "skip_count": 1.0,
+ "step": 504,
+ "text_loss": 0.4901447296142578
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.375697094217787,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25,
+ "learning_rate": 0.000999999401247153,
+ "loss": 0.0668,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 815716.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08515176922082901,
+ "skip_count": 2.0,
+ "step": 506,
+ "text_loss": 0.6157599687576294
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.3850895215732315,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25390625,
+ "learning_rate": 0.0009999988264446445,
+ "loss": 0.0686,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 819086.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00946938619017601,
+ "skip_count": 0.0,
+ "step": 508,
+ "text_loss": 0.5053519010543823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.3944819489286764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1640625,
+ "learning_rate": 0.0009999980600416424,
+ "loss": 0.0574,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 822268.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01058756373822689,
+ "skip_count": 0.0,
+ "step": 510,
+ "text_loss": 0.5570021867752075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.403874376284121,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1240234375,
+ "learning_rate": 0.000999997102038441,
+ "loss": 0.0678,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 825728.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008705209009349346,
+ "skip_count": 0.0,
+ "step": 512,
+ "text_loss": 0.6519040465354919
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.4132668036395657,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.220703125,
+ "learning_rate": 0.0009999959524354064,
+ "loss": 0.083,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 829459.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04024193435907364,
+ "skip_count": 1.0,
+ "step": 514,
+ "text_loss": 0.5290043950080872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25390625,
+ "learning_rate": 0.00099999461123298,
+ "loss": 0.0727,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 832291.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015742862597107887,
+ "skip_count": 0.0,
+ "step": 516,
+ "text_loss": 0.7910057902336121
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.432051658350455,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2275390625,
+ "learning_rate": 0.000999993078431675,
+ "loss": 0.0759,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 835399.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.16753782331943512,
+ "skip_count": 3.0,
+ "step": 518,
+ "text_loss": 0.45196083188056946
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.4414440857058994,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.236328125,
+ "learning_rate": 0.0009999913540320792,
+ "loss": 0.0968,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 838993.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09357143193483353,
+ "skip_count": 2.0,
+ "step": 520,
+ "text_loss": 0.5499435663223267
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 2.4508365130613443,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2451171875,
+ "learning_rate": 0.0009999894380348536,
+ "loss": 0.0821,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 842652.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.056803856045007706,
+ "skip_count": 2.0,
+ "step": 522,
+ "text_loss": 0.197520449757576
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 2.4602289404167887,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.2333984375,
+ "learning_rate": 0.000999987330440732,
+ "loss": 0.0725,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 847061.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08962195366621017,
+ "skip_count": 3.0,
+ "step": 524,
+ "text_loss": 0.27509039640426636
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.4696213677722336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.189453125,
+ "learning_rate": 0.000999985031250522,
+ "loss": 0.0561,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 850780.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022930558770895004,
+ "skip_count": 0.0,
+ "step": 526,
+ "text_loss": 0.13291706144809723
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.4790137951276785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.197265625,
+ "learning_rate": 0.0009999825404651053,
+ "loss": 0.0614,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 853886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017097990959882736,
+ "skip_count": 0.0,
+ "step": 528,
+ "text_loss": 0.21706295013427734
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.488406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.212890625,
+ "learning_rate": 0.0009999798580854356,
+ "loss": 0.0724,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 857364.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02831801027059555,
+ "skip_count": 0.0,
+ "step": 530,
+ "text_loss": 0.9035662412643433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.497798649838568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1591796875,
+ "learning_rate": 0.000999976984112541,
+ "loss": 0.0674,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 860661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019671892747282982,
+ "skip_count": 0.0,
+ "step": 532,
+ "text_loss": 0.8354863524436951
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 2.5071910771940122,
+ "f1_execute": 0.9200000166893005,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.2890625,
+ "learning_rate": 0.0009999739185475231,
+ "loss": 0.0963,
+ "macro_f1": 0.47333335876464844,
+ "num_tokens": 864124.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.21383361518383026,
+ "skip_count": 3.0,
+ "step": 534,
+ "text_loss": 0.23422949016094208
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.516583504549457,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.0009999706613915565,
+ "loss": 0.0598,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 866976.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07158871740102768,
+ "skip_count": 1.0,
+ "step": 536,
+ "text_loss": 0.11800774186849594
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.5259759319049016,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.26953125,
+ "learning_rate": 0.0009999672126458894,
+ "loss": 0.0822,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 870549.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08185924589633942,
+ "skip_count": 1.0,
+ "step": 538,
+ "text_loss": 0.19232480227947235
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.5353683592603464,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1396484375,
+ "learning_rate": 0.000999963572311843,
+ "loss": 0.0604,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 873733.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01633382774889469,
+ "skip_count": 0.0,
+ "step": 540,
+ "text_loss": 0.3725031912326813
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.544760786615791,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15234375,
+ "learning_rate": 0.0009999597403908128,
+ "loss": 0.0761,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 877099.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0782657191157341,
+ "skip_count": 1.0,
+ "step": 542,
+ "text_loss": 0.17589199542999268
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 2.5541532139712357,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2177734375,
+ "learning_rate": 0.0009999557168842669,
+ "loss": 0.0716,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 879883.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05275818333029747,
+ "skip_count": 2.0,
+ "step": 544,
+ "text_loss": 0.26448264718055725
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.56354564132668,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.0009999515017937468,
+ "loss": 0.071,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 882223.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09335892647504807,
+ "skip_count": 2.0,
+ "step": 546,
+ "text_loss": 0.208544060587883
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.572938068682125,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.376953125,
+ "learning_rate": 0.0009999470951208684,
+ "loss": 0.0855,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 885241.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.22983254492282867,
+ "skip_count": 0.0,
+ "step": 548,
+ "text_loss": 0.6612338423728943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.58233049603757,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.216796875,
+ "learning_rate": 0.00099994249686732,
+ "loss": 0.0786,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 887897.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.12858282029628754,
+ "skip_count": 0.0,
+ "step": 550,
+ "text_loss": 0.4673548936843872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.5917229233930144,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1591796875,
+ "learning_rate": 0.0009999377070348638,
+ "loss": 0.0944,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 891224.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017421770840883255,
+ "skip_count": 0.0,
+ "step": 552,
+ "text_loss": 0.6419258117675781
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.601115350748459,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15625,
+ "learning_rate": 0.000999932725625335,
+ "loss": 0.0791,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 894578.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07890026271343231,
+ "skip_count": 2.0,
+ "step": 554,
+ "text_loss": 0.5970752239227295
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.6105077781039037,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.216796875,
+ "learning_rate": 0.0009999275526406427,
+ "loss": 0.0796,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 897145.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09836960583925247,
+ "skip_count": 1.0,
+ "step": 556,
+ "text_loss": 0.752425491809845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.6199002054593485,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1875,
+ "learning_rate": 0.0009999221880827693,
+ "loss": 0.0882,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 900565.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017694659531116486,
+ "skip_count": 0.0,
+ "step": 558,
+ "text_loss": 0.195619136095047
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.629292632814793,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2021484375,
+ "learning_rate": 0.0009999166319537703,
+ "loss": 0.0561,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 903506.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019375264644622803,
+ "skip_count": 0.0,
+ "step": 560,
+ "text_loss": 0.4603337347507477
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 2.638685060170238,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.146484375,
+ "learning_rate": 0.0009999108842557748,
+ "loss": 0.0953,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 906380.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.12013207376003265,
+ "skip_count": 3.0,
+ "step": 562,
+ "text_loss": 0.6279402375221252
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.6480774875256823,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.255859375,
+ "learning_rate": 0.0009999049449909854,
+ "loss": 0.0799,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 909116.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06441342830657959,
+ "skip_count": 1.0,
+ "step": 564,
+ "text_loss": 0.23741699755191803
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.657469914881127,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15234375,
+ "learning_rate": 0.0009998988141616781,
+ "loss": 0.064,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 912189.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08309414982795715,
+ "skip_count": 1.0,
+ "step": 566,
+ "text_loss": 0.27780941128730774
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.6668623422365716,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1962890625,
+ "learning_rate": 0.0009998924917702023,
+ "loss": 0.0876,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 916279.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.07197169959545135,
+ "skip_count": 0.0,
+ "step": 568,
+ "text_loss": 0.6371755599975586
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.6762547695920165,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2255859375,
+ "learning_rate": 0.0009998859778189806,
+ "loss": 0.0706,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 919490.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008022273890674114,
+ "skip_count": 0.0,
+ "step": 570,
+ "text_loss": 0.6028938889503479
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.6856471969474613,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1650390625,
+ "learning_rate": 0.000999879272310509,
+ "loss": 0.084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 923694.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01634674146771431,
+ "skip_count": 0.0,
+ "step": 572,
+ "text_loss": 0.7177054286003113
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.695039624302906,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.17578125,
+ "learning_rate": 0.0009998723752473574,
+ "loss": 0.0716,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 926933.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.060559045523405075,
+ "skip_count": 1.0,
+ "step": 574,
+ "text_loss": 0.5203254818916321
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.185546875,
+ "learning_rate": 0.0009998652866321687,
+ "loss": 0.0801,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 929832.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011485611088573933,
+ "skip_count": 0.0,
+ "step": 576,
+ "text_loss": 0.6147452592849731
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.713824479013795,
+ "f1_execute": 0.8799999952316284,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.000999858006467659,
+ "loss": 0.0649,
+ "macro_f1": 0.29333335161209106,
+ "num_tokens": 933266.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.2929030954837799,
+ "skip_count": 4.0,
+ "step": 578,
+ "text_loss": 0.1720666140317917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.72321690636924,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.24609375,
+ "learning_rate": 0.0009998505347566186,
+ "loss": 0.0782,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 937545.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.053780000656843185,
+ "skip_count": 2.0,
+ "step": 580,
+ "text_loss": 0.3258405327796936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.7326093337246844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1416015625,
+ "learning_rate": 0.00099984287150191,
+ "loss": 0.0582,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 941001.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02637636847794056,
+ "skip_count": 0.0,
+ "step": 582,
+ "text_loss": 0.23762771487236023
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.7420017610801293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009998350167064705,
+ "loss": 0.0672,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 943989.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01637580618262291,
+ "skip_count": 0.0,
+ "step": 584,
+ "text_loss": 0.7460582852363586
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.7513941884355737,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1884765625,
+ "learning_rate": 0.0009998269703733096,
+ "loss": 0.0686,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 947245.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.13934117555618286,
+ "skip_count": 0.0,
+ "step": 586,
+ "text_loss": 0.5284690260887146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.7607866157910186,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.13671875,
+ "learning_rate": 0.0009998187325055106,
+ "loss": 0.0667,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 950116.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02138397842645645,
+ "skip_count": 0.0,
+ "step": 588,
+ "text_loss": 0.3920256197452545
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1533203125,
+ "learning_rate": 0.0009998103031062305,
+ "loss": 0.0778,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 953277.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007098200265318155,
+ "skip_count": 0.0,
+ "step": 590,
+ "text_loss": 0.7472905516624451
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.779571470501908,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.318359375,
+ "learning_rate": 0.0009998016821786994,
+ "loss": 0.0872,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 958229.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.07946522533893585,
+ "skip_count": 1.0,
+ "step": 592,
+ "text_loss": 0.5506448745727539
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.7889638978573528,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1357421875,
+ "learning_rate": 0.000999792869726221,
+ "loss": 0.0523,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 961016.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0850791186094284,
+ "skip_count": 1.0,
+ "step": 594,
+ "text_loss": 0.3824431002140045
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1650390625,
+ "learning_rate": 0.0009997838657521717,
+ "loss": 0.0632,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 963847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016370445489883423,
+ "skip_count": 0.0,
+ "step": 596,
+ "text_loss": 0.2139475792646408
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.8077487525682416,
+ "f1_execute": 0.923076868057251,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12890625,
+ "learning_rate": 0.0009997746702600026,
+ "loss": 0.0702,
+ "macro_f1": 0.307692289352417,
+ "num_tokens": 966619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.1310746818780899,
+ "skip_count": 3.0,
+ "step": 598,
+ "text_loss": 0.3651018440723419
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.8171411799236865,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.23828125,
+ "learning_rate": 0.0009997652832532372,
+ "loss": 0.0792,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 970418.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.14303378760814667,
+ "skip_count": 0.0,
+ "step": 600,
+ "text_loss": 0.7094736099243164
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.8265336072791314,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009997557047354722,
+ "loss": 0.0531,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 973491.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03334212675690651,
+ "skip_count": 1.0,
+ "step": 602,
+ "text_loss": 0.4812237024307251
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.835926034634576,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2890625,
+ "learning_rate": 0.0009997459347103783,
+ "loss": 0.0956,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 976672.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02831871062517166,
+ "skip_count": 0.0,
+ "step": 604,
+ "text_loss": 0.21737146377563477
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.8453184619900207,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1298828125,
+ "learning_rate": 0.0009997359731816998,
+ "loss": 0.0646,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 979898.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017968013882637024,
+ "skip_count": 0.0,
+ "step": 606,
+ "text_loss": 0.5458008050918579
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.854710889345465,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.224609375,
+ "learning_rate": 0.0009997258201532536,
+ "loss": 0.0751,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 982811.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016256732866168022,
+ "skip_count": 0.0,
+ "step": 608,
+ "text_loss": 0.8643257021903992
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2275390625,
+ "learning_rate": 0.0009997154756289303,
+ "loss": 0.0561,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 985245.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021214161068201065,
+ "skip_count": 0.0,
+ "step": 610,
+ "text_loss": 0.2204967886209488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.8734957440563544,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.150390625,
+ "learning_rate": 0.000999704939612694,
+ "loss": 0.0636,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 988539.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.23249399662017822,
+ "skip_count": 2.0,
+ "step": 612,
+ "text_loss": 0.32489025592803955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.8828881714117993,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009996942121085824,
+ "loss": 0.0445,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 991660.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010706410743296146,
+ "skip_count": 0.0,
+ "step": 614,
+ "text_loss": 0.4551754891872406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.8922805987672437,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.3671875,
+ "learning_rate": 0.000999683293120706,
+ "loss": 0.1016,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 994828.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006676184479147196,
+ "skip_count": 0.0,
+ "step": 616,
+ "text_loss": 0.6212068200111389
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.9016730261226886,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.408203125,
+ "learning_rate": 0.0009996721826532491,
+ "loss": 0.0976,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 997951.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.2148125320672989,
+ "skip_count": 2.0,
+ "step": 618,
+ "text_loss": 0.26514527201652527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.911065453478133,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1904296875,
+ "learning_rate": 0.000999660880710469,
+ "loss": 0.0909,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1001139.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022332455962896347,
+ "skip_count": 0.0,
+ "step": 620,
+ "text_loss": 0.26131340861320496
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.920457880833578,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.169921875,
+ "learning_rate": 0.0009996493872966971,
+ "loss": 0.0732,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1003678.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08348730951547623,
+ "skip_count": 0.0,
+ "step": 622,
+ "text_loss": 0.19151706993579865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.929850308189023,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.173828125,
+ "learning_rate": 0.0009996377024163374,
+ "loss": 0.0822,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1007082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.028577150776982307,
+ "skip_count": 0.0,
+ "step": 624,
+ "text_loss": 0.305387407541275
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.9392427355444672,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11279296875,
+ "learning_rate": 0.0009996258260738676,
+ "loss": 0.0892,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1010064.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08312026411294937,
+ "skip_count": 0.0,
+ "step": 626,
+ "text_loss": 0.49436143040657043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.9486351628999117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009996137582738388,
+ "loss": 0.0591,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1013462.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013337327167391777,
+ "skip_count": 0.0,
+ "step": 628,
+ "text_loss": 0.6515294313430786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.9580275902553566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.000999601499020875,
+ "loss": 0.0537,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1016246.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.029126765206456184,
+ "skip_count": 0.0,
+ "step": 630,
+ "text_loss": 0.18834827840328217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.9674200176108014,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009995890483196746,
+ "loss": 0.0602,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1019286.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.054844800382852554,
+ "skip_count": 1.0,
+ "step": 632,
+ "text_loss": 0.6988179087638855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.322265625,
+ "learning_rate": 0.0009995764061750086,
+ "loss": 0.0767,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1022207.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010095693171024323,
+ "skip_count": 0.0,
+ "step": 634,
+ "text_loss": 0.558451771736145
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.9862048723216907,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2890625,
+ "learning_rate": 0.000999563572591721,
+ "loss": 0.0521,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1025319.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0698433518409729,
+ "skip_count": 1.0,
+ "step": 636,
+ "text_loss": 0.5961872935295105
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.995597299677135,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11083984375,
+ "learning_rate": 0.0009995505475747302,
+ "loss": 0.0849,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1028362.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.040211405605077744,
+ "skip_count": 1.0,
+ "step": 638,
+ "text_loss": 0.546863317489624
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.004696213677722,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.119140625,
+ "learning_rate": 0.0009995373311290272,
+ "loss": 0.0709,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1032199.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.1457643061876297,
+ "skip_count": 1.0,
+ "step": 640,
+ "text_loss": 0.2137298285961151
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.014088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1279296875,
+ "learning_rate": 0.0009995239232596764,
+ "loss": 0.0545,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1035801.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011394930072128773,
+ "skip_count": 0.0,
+ "step": 642,
+ "text_loss": 0.43054503202438354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.0234810683886115,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1015625,
+ "learning_rate": 0.0009995103239718163,
+ "loss": 0.0665,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1039223.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00997432041913271,
+ "skip_count": 0.0,
+ "step": 644,
+ "text_loss": 0.7749615907669067
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.0328734957440564,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2275390625,
+ "learning_rate": 0.0009994965332706573,
+ "loss": 0.0755,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1042154.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.10589150339365005,
+ "skip_count": 0.0,
+ "step": 646,
+ "text_loss": 0.7812211513519287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.042265923099501,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 0.0009994825511614846,
+ "loss": 0.0383,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1045250.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0748734176158905,
+ "skip_count": 1.0,
+ "step": 648,
+ "text_loss": 0.844803512096405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.0516583504549457,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1220703125,
+ "learning_rate": 0.0009994683776496562,
+ "loss": 0.0433,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1048446.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03742415830492973,
+ "skip_count": 1.0,
+ "step": 650,
+ "text_loss": 0.2098839282989502
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.0610507778103906,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12890625,
+ "learning_rate": 0.0009994540127406034,
+ "loss": 0.0591,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1051840.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06025516986846924,
+ "skip_count": 2.0,
+ "step": 652,
+ "text_loss": 0.27727583050727844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.070443205165835,
+ "f1_execute": 0.8979591727256775,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.181640625,
+ "learning_rate": 0.0009994394564398306,
+ "loss": 0.0519,
+ "macro_f1": 0.521541953086853,
+ "num_tokens": 1055142.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.22807340323925018,
+ "skip_count": 2.0,
+ "step": 654,
+ "text_loss": 0.9672397971153259
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.130859375,
+ "learning_rate": 0.0009994247087529158,
+ "loss": 0.0618,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1057698.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01348950993269682,
+ "skip_count": 0.0,
+ "step": 656,
+ "text_loss": 0.6375506520271301
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.0892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1953125,
+ "learning_rate": 0.0009994097696855106,
+ "loss": 0.0412,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1060624.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009649243205785751,
+ "skip_count": 0.0,
+ "step": 658,
+ "text_loss": 0.5315385460853577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.098620487232169,
+ "f1_execute": 0.923076868057251,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2041015625,
+ "learning_rate": 0.0009993946392433395,
+ "loss": 0.0609,
+ "macro_f1": 0.307692289352417,
+ "num_tokens": 1065076.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.1250980943441391,
+ "skip_count": 3.0,
+ "step": 660,
+ "text_loss": 0.25780341029167175
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.1080129145876136,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1640625,
+ "learning_rate": 0.0009993793174322006,
+ "loss": 0.0471,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1068365.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011544390581548214,
+ "skip_count": 0.0,
+ "step": 662,
+ "text_loss": 0.34876301884651184
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.1174053419430585,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009993638042579654,
+ "loss": 0.0473,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1071693.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03777370601892471,
+ "skip_count": 1.0,
+ "step": 664,
+ "text_loss": 0.21811571717262268
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.126797769298503,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.203125,
+ "learning_rate": 0.0009993480997265783,
+ "loss": 0.0475,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1074733.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.049949806183576584,
+ "skip_count": 2.0,
+ "step": 666,
+ "text_loss": 0.38410288095474243
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.136190196653948,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10302734375,
+ "learning_rate": 0.0009993322038440572,
+ "loss": 0.0605,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1077993.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0247171800583601,
+ "skip_count": 0.0,
+ "step": 668,
+ "text_loss": 0.25576895475387573
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.1455826240093923,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.216796875,
+ "learning_rate": 0.000999316116616494,
+ "loss": 0.0619,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1080491.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008118715137243271,
+ "skip_count": 0.0,
+ "step": 670,
+ "text_loss": 0.6269792914390564
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.154975051364837,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.173828125,
+ "learning_rate": 0.0009992998380500527,
+ "loss": 0.0462,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1083817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03366057574748993,
+ "skip_count": 1.0,
+ "step": 672,
+ "text_loss": 0.26891493797302246
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.1643674787202816,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.0009992833681509716,
+ "loss": 0.0529,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1087368.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.020552074536681175,
+ "skip_count": 0.0,
+ "step": 674,
+ "text_loss": 0.14421936869621277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.1737599060757264,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.18359375,
+ "learning_rate": 0.0009992667069255619,
+ "loss": 0.0696,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 1090452.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06937336176633835,
+ "skip_count": 2.0,
+ "step": 676,
+ "text_loss": 0.24999259412288666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.1831523334311713,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08740234375,
+ "learning_rate": 0.0009992498543802085,
+ "loss": 0.0588,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1093996.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0380021296441555,
+ "skip_count": 0.0,
+ "step": 678,
+ "text_loss": 0.42473849654197693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.25,
+ "avg_layers": 27.0,
+ "epoch": 3.1925447607866158,
+ "f1_execute": 0.9200000166893005,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.2119140625,
+ "learning_rate": 0.0009992328105213688,
+ "loss": 0.0411,
+ "macro_f1": 0.4400000274181366,
+ "num_tokens": 1096837.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.20885063707828522,
+ "skip_count": 4.0,
+ "step": 680,
+ "text_loss": 0.3829527199268341
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.2019371881420606,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.0009992155753555747,
+ "loss": 0.0722,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1100320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018230699002742767,
+ "skip_count": 2.0,
+ "step": 682,
+ "text_loss": 0.6190969944000244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.211329615497505,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.30859375,
+ "learning_rate": 0.0009991981488894303,
+ "loss": 0.0681,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 1103682.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05550144240260124,
+ "skip_count": 1.0,
+ "step": 684,
+ "text_loss": 0.44418027997016907
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.22072204285295,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2158203125,
+ "learning_rate": 0.0009991805311296133,
+ "loss": 0.0507,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1106427.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07990608364343643,
+ "skip_count": 2.0,
+ "step": 686,
+ "text_loss": 0.5577231645584106
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.2301144702083944,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1962890625,
+ "learning_rate": 0.0009991627220828753,
+ "loss": 0.0568,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1109314.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05167485028505325,
+ "skip_count": 2.0,
+ "step": 688,
+ "text_loss": 0.27325430512428284
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.2395068975638392,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10693359375,
+ "learning_rate": 0.0009991447217560408,
+ "loss": 0.0521,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1112748.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04621964320540428,
+ "skip_count": 2.0,
+ "step": 690,
+ "text_loss": 0.5288321375846863
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.2488993249192837,
+ "f1_execute": 0.923076868057251,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1962890625,
+ "learning_rate": 0.000999126530156007,
+ "loss": 0.0499,
+ "macro_f1": 0.307692289352417,
+ "num_tokens": 1116965.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11950276792049408,
+ "skip_count": 2.0,
+ "step": 692,
+ "text_loss": 0.14215624332427979
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.2582917522747286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2353515625,
+ "learning_rate": 0.0009991081472897454,
+ "loss": 0.0722,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1120570.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01905500330030918,
+ "skip_count": 0.0,
+ "step": 694,
+ "text_loss": 0.41862696409225464
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.267684179630173,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1357421875,
+ "learning_rate": 0.0009990895731643002,
+ "loss": 0.0464,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1124009.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06974572688341141,
+ "skip_count": 0.0,
+ "step": 696,
+ "text_loss": 0.41160130500793457
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.277076606985618,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1923828125,
+ "learning_rate": 0.000999070807786789,
+ "loss": 0.0531,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1127370.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.07055293023586273,
+ "skip_count": 0.0,
+ "step": 698,
+ "text_loss": 0.48068273067474365
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.2864690343410627,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.197265625,
+ "learning_rate": 0.000999051851164403,
+ "loss": 0.0619,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1130234.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.12506946921348572,
+ "skip_count": 1.0,
+ "step": 700,
+ "text_loss": 0.47925490140914917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 0.000999032703304406,
+ "loss": 0.0674,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1132874.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00809287466108799,
+ "skip_count": 0.0,
+ "step": 702,
+ "text_loss": 0.47433632612228394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.305253889051952,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1064453125,
+ "learning_rate": 0.0009990133642141358,
+ "loss": 0.0497,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1136011.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0319170281291008,
+ "skip_count": 2.0,
+ "step": 704,
+ "text_loss": 0.6574832201004028
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.3146463164073965,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.33984375,
+ "learning_rate": 0.000998993833901003,
+ "loss": 0.0619,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1139674.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09850362688302994,
+ "skip_count": 2.0,
+ "step": 706,
+ "text_loss": 0.7660127282142639
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.3240387437628414,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12158203125,
+ "learning_rate": 0.0009989741123724919,
+ "loss": 0.0574,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1143558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006673311349004507,
+ "skip_count": 0.0,
+ "step": 708,
+ "text_loss": 0.5976111888885498
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.333431171118286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.0009989541996361594,
+ "loss": 0.045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1146122.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004988791421055794,
+ "skip_count": 0.0,
+ "step": 710,
+ "text_loss": 0.5256119966506958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.3428235984737307,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1044921875,
+ "learning_rate": 0.0009989340956996367,
+ "loss": 0.0528,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1149546.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0067769973538815975,
+ "skip_count": 0.0,
+ "step": 712,
+ "text_loss": 0.5040497779846191
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.352216025829175,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.26953125,
+ "learning_rate": 0.0009989138005706273,
+ "loss": 0.0735,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1153195.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09899546951055527,
+ "skip_count": 2.0,
+ "step": 714,
+ "text_loss": 0.20803412795066833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1396484375,
+ "learning_rate": 0.000998893314256908,
+ "loss": 0.064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1157081.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010492355562746525,
+ "skip_count": 0.0,
+ "step": 716,
+ "text_loss": 0.23077639937400818
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.3710008805400644,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1298828125,
+ "learning_rate": 0.0009988726367663298,
+ "loss": 0.0539,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1160079.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01063773687928915,
+ "skip_count": 0.0,
+ "step": 718,
+ "text_loss": 0.6085864901542664
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.3803933078955093,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1640625,
+ "learning_rate": 0.0009988517681068163,
+ "loss": 0.0421,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1163249.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05981874838471413,
+ "skip_count": 0.0,
+ "step": 720,
+ "text_loss": 0.4047050476074219
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.3897857352509537,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.171875,
+ "learning_rate": 0.0009988307082863638,
+ "loss": 0.0361,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1166259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009750043973326683,
+ "skip_count": 0.0,
+ "step": 722,
+ "text_loss": 0.5306474566459656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.3991781626063986,
+ "f1_execute": 0.9411765336990356,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.240234375,
+ "learning_rate": 0.0009988094573130434,
+ "loss": 0.063,
+ "macro_f1": 0.5359477400779724,
+ "num_tokens": 1168887.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.18601104617118835,
+ "skip_count": 2.0,
+ "step": 724,
+ "text_loss": 0.53528892993927
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.408570589961843,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.142578125,
+ "learning_rate": 0.0009987880151949974,
+ "loss": 0.0496,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1172625.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02845010720193386,
+ "skip_count": 1.0,
+ "step": 726,
+ "text_loss": 0.4760453701019287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.417963017317288,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2177734375,
+ "learning_rate": 0.0009987663819404434,
+ "loss": 0.06,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1176580.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017596980556845665,
+ "skip_count": 2.0,
+ "step": 728,
+ "text_loss": 0.5146099328994751
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.427355444672733,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1318359375,
+ "learning_rate": 0.000998744557557671,
+ "loss": 0.0484,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1179804.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0625474750995636,
+ "skip_count": 1.0,
+ "step": 730,
+ "text_loss": 0.27738022804260254
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.436747872028177,
+ "f1_execute": 0.923076868057251,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.203125,
+ "learning_rate": 0.0009987225420550433,
+ "loss": 0.0796,
+ "macro_f1": 0.307692289352417,
+ "num_tokens": 1182658.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.16188351809978485,
+ "skip_count": 2.0,
+ "step": 732,
+ "text_loss": 0.23231445252895355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.446140299383622,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2001953125,
+ "learning_rate": 0.0009987003354409965,
+ "loss": 0.0626,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1185451.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02391529455780983,
+ "skip_count": 0.0,
+ "step": 734,
+ "text_loss": 0.4496627151966095
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.4555327267390665,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.234375,
+ "learning_rate": 0.0009986779377240405,
+ "loss": 0.0513,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 1188666.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08435963839292526,
+ "skip_count": 1.0,
+ "step": 736,
+ "text_loss": 0.4950787127017975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.4649251540945114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1220703125,
+ "learning_rate": 0.000998655348912758,
+ "loss": 0.0515,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1193035.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01648722216486931,
+ "skip_count": 0.0,
+ "step": 738,
+ "text_loss": 0.24761848151683807
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1513671875,
+ "learning_rate": 0.0009986325690158051,
+ "loss": 0.0435,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1196840.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013143910095095634,
+ "skip_count": 0.0,
+ "step": 740,
+ "text_loss": 0.15662719309329987
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.4837100088054007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009986095980419113,
+ "loss": 0.0757,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1200573.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026706280186772346,
+ "skip_count": 0.0,
+ "step": 742,
+ "text_loss": 0.16725164651870728
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.493102436160845,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1982421875,
+ "learning_rate": 0.0009985864359998787,
+ "loss": 0.0795,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 1203589.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.28607678413391113,
+ "skip_count": 3.0,
+ "step": 744,
+ "text_loss": 0.6350882053375244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.50249486351629,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.0009985630828985835,
+ "loss": 0.0572,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1206422.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05685260891914368,
+ "skip_count": 1.0,
+ "step": 746,
+ "text_loss": 0.33779552578926086
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.5118872908717345,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.09814453125,
+ "learning_rate": 0.0009985395387469742,
+ "loss": 0.0458,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1211588.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0437830351293087,
+ "skip_count": 2.0,
+ "step": 748,
+ "text_loss": 0.28664472699165344
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.5212797182271793,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15625,
+ "learning_rate": 0.0009985158035540735,
+ "loss": 0.0714,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1214580.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.07074898481369019,
+ "skip_count": 0.0,
+ "step": 750,
+ "text_loss": 0.3939313292503357
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.21484375,
+ "learning_rate": 0.0009984918773289762,
+ "loss": 0.0699,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1217388.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009757856838405132,
+ "skip_count": 0.0,
+ "step": 752,
+ "text_loss": 0.37641215324401855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.5400645729380686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.0009984677600808512,
+ "loss": 0.054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1219960.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02515069581568241,
+ "skip_count": 0.0,
+ "step": 754,
+ "text_loss": 0.155938982963562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.5494570002935135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.30078125,
+ "learning_rate": 0.0009984434518189405,
+ "loss": 0.0764,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1223234.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025766927748918533,
+ "skip_count": 0.0,
+ "step": 756,
+ "text_loss": 0.691118061542511
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 3.558849427648958,
+ "f1_execute": 0.9411765336990356,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1416015625,
+ "learning_rate": 0.0009984189525525584,
+ "loss": 0.0451,
+ "macro_f1": 0.5359477400779724,
+ "num_tokens": 1225764.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.1782722771167755,
+ "skip_count": 2.0,
+ "step": 758,
+ "text_loss": 0.3592209219932556
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.568241855004403,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.189453125,
+ "learning_rate": 0.0009983942622910935,
+ "loss": 0.0659,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1230097.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00825568474829197,
+ "skip_count": 0.0,
+ "step": 760,
+ "text_loss": 0.4646475315093994
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.5776342823598473,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1962890625,
+ "learning_rate": 0.0009983693810440074,
+ "loss": 0.0477,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1233140.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04156976938247681,
+ "skip_count": 2.0,
+ "step": 762,
+ "text_loss": 0.298682302236557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.587026709715292,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.3515625,
+ "learning_rate": 0.000998344308820834,
+ "loss": 0.0666,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1236305.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05697929114103317,
+ "skip_count": 1.0,
+ "step": 764,
+ "text_loss": 0.5249121189117432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.5964191370707366,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.18359375,
+ "learning_rate": 0.0009983190456311817,
+ "loss": 0.0592,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1239673.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09547408670186996,
+ "skip_count": 3.0,
+ "step": 766,
+ "text_loss": 0.41277334094047546
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.6058115644261814,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.185546875,
+ "learning_rate": 0.000998293591484731,
+ "loss": 0.0484,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1242292.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030693158507347107,
+ "skip_count": 2.0,
+ "step": 768,
+ "text_loss": 0.1583656519651413
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.615203991781626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15234375,
+ "learning_rate": 0.000998267946391236,
+ "loss": 0.051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1244661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01211300864815712,
+ "skip_count": 0.0,
+ "step": 770,
+ "text_loss": 0.4629349112510681
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.6245964191370708,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.0009982421103605238,
+ "loss": 0.0441,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1248688.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0665968507528305,
+ "skip_count": 2.0,
+ "step": 772,
+ "text_loss": 0.4019293785095215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.6339888464925156,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2890625,
+ "learning_rate": 0.000998216083402495,
+ "loss": 0.0613,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1251395.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07186859846115112,
+ "skip_count": 2.0,
+ "step": 774,
+ "text_loss": 0.4659276604652405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.302734375,
+ "learning_rate": 0.0009981898655271235,
+ "loss": 0.0488,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1254888.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007823926396667957,
+ "skip_count": 0.0,
+ "step": 776,
+ "text_loss": 0.5160359740257263
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 3.6527737012034045,
+ "f1_execute": 0.9130434989929199,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.11962890625,
+ "learning_rate": 0.0009981634567444557,
+ "loss": 0.0775,
+ "macro_f1": 0.590062141418457,
+ "num_tokens": 1258250.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.24624499678611755,
+ "skip_count": 4.0,
+ "step": 778,
+ "text_loss": 0.29319918155670166
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.6621661285588494,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.0009981368570646115,
+ "loss": 0.0885,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1260916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030730176717042923,
+ "skip_count": 1.0,
+ "step": 780,
+ "text_loss": 0.624981164932251
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.6715585559142943,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.142578125,
+ "learning_rate": 0.0009981100664977838,
+ "loss": 0.0699,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1264004.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006829176563769579,
+ "skip_count": 0.0,
+ "step": 782,
+ "text_loss": 0.6137266159057617
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.6809509832697387,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1748046875,
+ "learning_rate": 0.0009980830850542391,
+ "loss": 0.058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1267130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018471000716090202,
+ "skip_count": 0.0,
+ "step": 784,
+ "text_loss": 0.15213175117969513
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.6903434106251836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2353515625,
+ "learning_rate": 0.0009980559127443166,
+ "loss": 0.052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1271129.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007903140969574451,
+ "skip_count": 0.0,
+ "step": 786,
+ "text_loss": 0.5768613219261169
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.699735837980628,
+ "f1_execute": 0.923076868057251,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.130859375,
+ "learning_rate": 0.000998028549578429,
+ "loss": 0.0719,
+ "macro_f1": 0.307692289352417,
+ "num_tokens": 1274232.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06737866252660751,
+ "skip_count": 3.0,
+ "step": 788,
+ "text_loss": 0.2877073585987091
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.709128265336073,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1748046875,
+ "learning_rate": 0.0009980009955670615,
+ "loss": 0.0698,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1277193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.10194934904575348,
+ "skip_count": 3.0,
+ "step": 790,
+ "text_loss": 0.11860492825508118
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.7185206926915173,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.126953125,
+ "learning_rate": 0.000997973250720773,
+ "loss": 0.0552,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1280960.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.10297708213329315,
+ "skip_count": 2.0,
+ "step": 792,
+ "text_loss": 0.13477706909179688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.727913120046962,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009979453150501954,
+ "loss": 0.0663,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1284611.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06122037023305893,
+ "skip_count": 1.0,
+ "step": 794,
+ "text_loss": 0.40569379925727844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.737305547402407,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1279296875,
+ "learning_rate": 0.000997917188566034,
+ "loss": 0.062,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1287834.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.061135001480579376,
+ "skip_count": 2.0,
+ "step": 796,
+ "text_loss": 0.2829287648200989
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.7466979747578515,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.109375,
+ "learning_rate": 0.0009978888712790664,
+ "loss": 0.0654,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1291666.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04841872677206993,
+ "skip_count": 1.0,
+ "step": 798,
+ "text_loss": 1.011757254600525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.20000000298023224,
+ "avg_layers": 27.0,
+ "epoch": 3.756090402113296,
+ "f1_execute": 0.8979591727256775,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.3333333134651184,
+ "grad_norm": 0.14453125,
+ "learning_rate": 0.0009978603632001444,
+ "loss": 0.0636,
+ "macro_f1": 0.4104308485984802,
+ "num_tokens": 1294627.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.15698759257793427,
+ "skip_count": 5.0,
+ "step": 800,
+ "text_loss": 0.4457623362541199
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.765482829468741,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.0009978316643401916,
+ "loss": 0.0688,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1297711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018952010199427605,
+ "skip_count": 0.0,
+ "step": 802,
+ "text_loss": 0.2069481462240219
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.7748752568241857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.14453125,
+ "learning_rate": 0.0009978027747102062,
+ "loss": 0.0479,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1300569.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014538386836647987,
+ "skip_count": 0.0,
+ "step": 804,
+ "text_loss": 0.4983852505683899
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.78426768417963,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2109375,
+ "learning_rate": 0.0009977736943212584,
+ "loss": 0.0721,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1303969.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.11164087057113647,
+ "skip_count": 2.0,
+ "step": 806,
+ "text_loss": 0.2910642921924591
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.793660111535075,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1826171875,
+ "learning_rate": 0.000997744423184492,
+ "loss": 0.0424,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1307263.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06073406711220741,
+ "skip_count": 1.0,
+ "step": 808,
+ "text_loss": 0.18831779062747955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.25,
+ "avg_layers": 27.0,
+ "epoch": 3.8030525388905194,
+ "f1_execute": 0.9200000166893005,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.26171875,
+ "learning_rate": 0.0009977149613111236,
+ "loss": 0.0486,
+ "macro_f1": 0.4400000274181366,
+ "num_tokens": 1309953.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11035524308681488,
+ "skip_count": 4.0,
+ "step": 810,
+ "text_loss": 0.7872759699821472
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.8124449662459643,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1650390625,
+ "learning_rate": 0.0009976853087124433,
+ "loss": 0.0536,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1313243.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021804286167025566,
+ "skip_count": 0.0,
+ "step": 812,
+ "text_loss": 0.22349292039871216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.8218373936014087,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.28125,
+ "learning_rate": 0.0009976554653998138,
+ "loss": 0.0612,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 1316165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.10715524107217789,
+ "skip_count": 2.0,
+ "step": 814,
+ "text_loss": 0.18035532534122467
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.8312298209568536,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1279296875,
+ "learning_rate": 0.000997625431384671,
+ "loss": 0.0564,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1319206.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007173649035394192,
+ "skip_count": 0.0,
+ "step": 816,
+ "text_loss": 0.48928648233413696
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.8406222483122985,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1357421875,
+ "learning_rate": 0.0009975952066785243,
+ "loss": 0.0655,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 1322549.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.22308112680912018,
+ "skip_count": 4.0,
+ "step": 818,
+ "text_loss": 0.5211259722709656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.850014675667743,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1337890625,
+ "learning_rate": 0.0009975647912929557,
+ "loss": 0.0564,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1325213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00998698640614748,
+ "skip_count": 0.0,
+ "step": 820,
+ "text_loss": 0.7117052674293518
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.8594071030231873,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15234375,
+ "learning_rate": 0.0009975341852396205,
+ "loss": 0.0723,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1328383.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07454588264226913,
+ "skip_count": 2.0,
+ "step": 822,
+ "text_loss": 0.34539610147476196
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.8687995303786322,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1630859375,
+ "learning_rate": 0.0009975033885302469,
+ "loss": 0.0604,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1331406.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009157589636743069,
+ "skip_count": 0.0,
+ "step": 824,
+ "text_loss": 0.7484824657440186
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.878191957734077,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1923828125,
+ "learning_rate": 0.0009974724011766363,
+ "loss": 0.0474,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1334410.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.17149391770362854,
+ "skip_count": 0.0,
+ "step": 826,
+ "text_loss": 0.5913820266723633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.8875843850895215,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1884765625,
+ "learning_rate": 0.0009974412231906632,
+ "loss": 0.058,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1337653.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09743282198905945,
+ "skip_count": 1.0,
+ "step": 828,
+ "text_loss": 0.2505693733692169
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.8969768124449664,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1533203125,
+ "learning_rate": 0.0009974098545842748,
+ "loss": 0.0638,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1340860.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.041490405797958374,
+ "skip_count": 1.0,
+ "step": 830,
+ "text_loss": 0.5585370063781738
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.906369239800411,
+ "f1_execute": 0.9019607901573181,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.193359375,
+ "learning_rate": 0.0009973782953694918,
+ "loss": 0.0746,
+ "macro_f1": 0.3006536066532135,
+ "num_tokens": 1344232.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.16080693900585175,
+ "skip_count": 3.0,
+ "step": 832,
+ "text_loss": 0.4782734513282776
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.9157616671558557,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1298828125,
+ "learning_rate": 0.000997346545558408,
+ "loss": 0.0522,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1347667.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01173500344157219,
+ "skip_count": 0.0,
+ "step": 834,
+ "text_loss": 0.25036177039146423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.173828125,
+ "learning_rate": 0.0009973146051631895,
+ "loss": 0.0522,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1350707.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011477196589112282,
+ "skip_count": 0.0,
+ "step": 836,
+ "text_loss": 0.5482863187789917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.934546521866745,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1650390625,
+ "learning_rate": 0.0009972824741960764,
+ "loss": 0.0536,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1353704.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010528896935284138,
+ "skip_count": 0.0,
+ "step": 838,
+ "text_loss": 0.6732596158981323
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.9439389492221895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1181640625,
+ "learning_rate": 0.000997250152669381,
+ "loss": 0.0573,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1356608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010678744874894619,
+ "skip_count": 0.0,
+ "step": 840,
+ "text_loss": 0.5479338765144348
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.9533313765776343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.181640625,
+ "learning_rate": 0.000997217640595489,
+ "loss": 0.0631,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1359809.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00835978239774704,
+ "skip_count": 0.0,
+ "step": 842,
+ "text_loss": 0.42543259263038635
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.9627238039330788,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1923828125,
+ "learning_rate": 0.0009971849379868593,
+ "loss": 0.0653,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1362201.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009930923581123352,
+ "skip_count": 0.0,
+ "step": 844,
+ "text_loss": 0.720462441444397
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.9721162312885236,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1123046875,
+ "learning_rate": 0.0009971520448560235,
+ "loss": 0.0615,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1365790.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06344373524188995,
+ "skip_count": 1.0,
+ "step": 846,
+ "text_loss": 0.8423607349395752
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 26.0,
+ "epoch": 3.9815086586439685,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.16796875,
+ "learning_rate": 0.000997118961215586,
+ "loss": 0.0674,
+ "macro_f1": 0.4533333480358124,
+ "num_tokens": 1368387.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.14688406884670258,
+ "skip_count": 3.0,
+ "step": 848,
+ "text_loss": 0.3933577537536621
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.990901085999413,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.000997085687078225,
+ "loss": 0.0518,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1371189.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009953443892300129,
+ "skip_count": 0.0,
+ "step": 850,
+ "text_loss": 0.41469162702560425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 4.0,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15625,
+ "learning_rate": 0.0009970522224566909,
+ "loss": 0.0555,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 1374008.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.048870690166950226,
+ "skip_count": 1.0,
+ "step": 852,
+ "text_loss": 0.613615870475769
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.009392427355444,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.0009970185673638075,
+ "loss": 0.0629,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1376662.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06865929812192917,
+ "skip_count": 1.0,
+ "step": 854,
+ "text_loss": 0.4392736256122589
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 4.01878485471089,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.162109375,
+ "learning_rate": 0.0009969847218124716,
+ "loss": 0.0506,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1380049.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02382219396531582,
+ "skip_count": 1.0,
+ "step": 856,
+ "text_loss": 0.19115346670150757
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.028177282066334,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1884765625,
+ "learning_rate": 0.0009969506858156527,
+ "loss": 0.0344,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1383008.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03907281160354614,
+ "skip_count": 1.0,
+ "step": 858,
+ "text_loss": 0.34842637181282043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.037569709421779,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12060546875,
+ "learning_rate": 0.0009969164593863935,
+ "loss": 0.0365,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1387051.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007645803038030863,
+ "skip_count": 0.0,
+ "step": 860,
+ "text_loss": 0.3810436725616455
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.046962136777223,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1484375,
+ "learning_rate": 0.0009968820425378098,
+ "loss": 0.0463,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1390244.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04435238987207413,
+ "skip_count": 0.0,
+ "step": 862,
+ "text_loss": 0.34853485226631165
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.056354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.28515625,
+ "learning_rate": 0.00099684743528309,
+ "loss": 0.0424,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1392976.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006071661598980427,
+ "skip_count": 0.0,
+ "step": 864,
+ "text_loss": 0.6395178437232971
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.065746991488113,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0810546875,
+ "learning_rate": 0.0009968126376354958,
+ "loss": 0.0477,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1396061.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05011235550045967,
+ "skip_count": 2.0,
+ "step": 866,
+ "text_loss": 0.09103966504335403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.075139418843557,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.0009967776496083616,
+ "loss": 0.0509,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1398993.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03979124873876572,
+ "skip_count": 0.0,
+ "step": 868,
+ "text_loss": 0.27257058024406433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.084531846199002,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.14453125,
+ "learning_rate": 0.000996742471215095,
+ "loss": 0.0516,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1402080.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030823837965726852,
+ "skip_count": 2.0,
+ "step": 870,
+ "text_loss": 0.7047103047370911
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.093924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009967071024691763,
+ "loss": 0.0461,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1404890.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009721715934574604,
+ "skip_count": 0.0,
+ "step": 872,
+ "text_loss": 0.959106981754303
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.103316700909891,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1142578125,
+ "learning_rate": 0.000996671543384159,
+ "loss": 0.05,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1407853.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006025883834809065,
+ "skip_count": 0.0,
+ "step": 874,
+ "text_loss": 0.47571972012519836
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 4.112709128265336,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09765625,
+ "learning_rate": 0.0009966357939736692,
+ "loss": 0.0416,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1410723.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025964925065636635,
+ "skip_count": 0.0,
+ "step": 876,
+ "text_loss": 0.4964611530303955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.122101555620781,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09423828125,
+ "learning_rate": 0.0009965998542514065,
+ "loss": 0.0415,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1414008.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09509637206792831,
+ "skip_count": 2.0,
+ "step": 878,
+ "text_loss": 0.621494710445404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 4.131493982976226,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.11083984375,
+ "learning_rate": 0.0009965637242311427,
+ "loss": 0.0472,
+ "macro_f1": 0.542222261428833,
+ "num_tokens": 1417447.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02520318515598774,
+ "skip_count": 4.0,
+ "step": 880,
+ "text_loss": 0.40209758281707764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 25.0,
+ "epoch": 4.14088641033167,
+ "f1_execute": 0.936170220375061,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.000996527403926723,
+ "loss": 0.0495,
+ "macro_f1": 0.5342789888381958,
+ "num_tokens": 1419905.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.13183781504631042,
+ "skip_count": 6.0,
+ "step": 882,
+ "text_loss": 0.642185389995575
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.1502788376871145,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1201171875,
+ "learning_rate": 0.0009964908933520655,
+ "loss": 0.0375,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1423436.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009429510682821274,
+ "skip_count": 0.0,
+ "step": 884,
+ "text_loss": 0.48232755064964294
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.15967126504256,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1669921875,
+ "learning_rate": 0.0009964541925211613,
+ "loss": 0.0349,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1426842.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07629609107971191,
+ "skip_count": 2.0,
+ "step": 886,
+ "text_loss": 0.16620934009552002
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.169063692398004,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.0009964173014480738,
+ "loss": 0.0348,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1430430.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.036814019083976746,
+ "skip_count": 2.0,
+ "step": 888,
+ "text_loss": 0.4866008758544922
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.178456119753449,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1123046875,
+ "learning_rate": 0.0009963802201469398,
+ "loss": 0.0476,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1433821.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041250260546803474,
+ "skip_count": 0.0,
+ "step": 890,
+ "text_loss": 0.578216552734375
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.187848547108893,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2373046875,
+ "learning_rate": 0.0009963429486319693,
+ "loss": 0.0463,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1436976.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06213559955358505,
+ "skip_count": 2.0,
+ "step": 892,
+ "text_loss": 0.221701517701149
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 4.197240974464338,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.361328125,
+ "learning_rate": 0.0009963054869174446,
+ "loss": 0.0313,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 1440397.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07532428950071335,
+ "skip_count": 2.0,
+ "step": 894,
+ "text_loss": 0.6922838091850281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.206633401819783,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1572265625,
+ "learning_rate": 0.0009962678350177209,
+ "loss": 0.0472,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1443604.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0419243648648262,
+ "skip_count": 1.0,
+ "step": 896,
+ "text_loss": 0.22092342376708984
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.216025829175227,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1015625,
+ "learning_rate": 0.0009962299929472268,
+ "loss": 0.034,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1446257.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.10849297791719437,
+ "skip_count": 0.0,
+ "step": 898,
+ "text_loss": 0.26394811272621155
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.225418256530672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10205078125,
+ "learning_rate": 0.000996191960720463,
+ "loss": 0.0394,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1449669.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0092767970636487,
+ "skip_count": 0.0,
+ "step": 900,
+ "text_loss": 0.5338577628135681
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.234810683886117,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.0009961537383520042,
+ "loss": 0.0354,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1452450.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02985367365181446,
+ "skip_count": 0.0,
+ "step": 902,
+ "text_loss": 0.5875228047370911
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.2442031112415615,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10205078125,
+ "learning_rate": 0.0009961153258564966,
+ "loss": 0.0378,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1456909.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06794842332601547,
+ "skip_count": 3.0,
+ "step": 904,
+ "text_loss": 0.40959444642066956
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.253595538597006,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009960767232486604,
+ "loss": 0.0476,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1461712.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023562447167932987,
+ "skip_count": 0.0,
+ "step": 906,
+ "text_loss": 0.3932875096797943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.262987965952451,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.000996037930543288,
+ "loss": 0.0505,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1464817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03880339860916138,
+ "skip_count": 1.0,
+ "step": 908,
+ "text_loss": 0.17482402920722961
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.272380393307896,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2119140625,
+ "learning_rate": 0.000995998947755245,
+ "loss": 0.0479,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1467810.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01736828312277794,
+ "skip_count": 1.0,
+ "step": 910,
+ "text_loss": 0.4140470325946808
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.169921875,
+ "learning_rate": 0.0009959597748994695,
+ "loss": 0.0752,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1470802.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011824851855635643,
+ "skip_count": 0.0,
+ "step": 912,
+ "text_loss": 0.7153383493423462
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 4.2911652480187845,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1455078125,
+ "learning_rate": 0.0009959204119909726,
+ "loss": 0.0421,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1474539.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025456594303250313,
+ "skip_count": 0.0,
+ "step": 914,
+ "text_loss": 0.42812058329582214
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.142578125,
+ "learning_rate": 0.0009958808590448385,
+ "loss": 0.0489,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1477552.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006795851048082113,
+ "skip_count": 0.0,
+ "step": 916,
+ "text_loss": 0.5402814149856567
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.309950102729674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1083984375,
+ "learning_rate": 0.0009958411160762234,
+ "loss": 0.039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1482547.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015615932643413544,
+ "skip_count": 0.0,
+ "step": 918,
+ "text_loss": 0.3836168050765991
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.319342530085119,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08984375,
+ "learning_rate": 0.0009958011831003577,
+ "loss": 0.0448,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1485807.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.043541423976421356,
+ "skip_count": 1.0,
+ "step": 920,
+ "text_loss": 0.4333936274051666
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 4.328734957440563,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1337890625,
+ "learning_rate": 0.000995761060132543,
+ "loss": 0.0418,
+ "macro_f1": 0.6538461446762085,
+ "num_tokens": 1488941.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05866432189941406,
+ "skip_count": 2.0,
+ "step": 922,
+ "text_loss": 0.4106994867324829
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.3381273847960085,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1630859375,
+ "learning_rate": 0.0009957207471881552,
+ "loss": 0.0531,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1492026.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02714901603758335,
+ "skip_count": 2.0,
+ "step": 924,
+ "text_loss": 0.542091429233551
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.347519812151453,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1796875,
+ "learning_rate": 0.0009956802442826415,
+ "loss": 0.0386,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1494543.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0563737191259861,
+ "skip_count": 0.0,
+ "step": 926,
+ "text_loss": 0.47209203243255615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.356912239506897,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1259765625,
+ "learning_rate": 0.0009956395514315235,
+ "loss": 0.0496,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1497831.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03285066783428192,
+ "skip_count": 0.0,
+ "step": 928,
+ "text_loss": 0.6628931164741516
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.366304666862343,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.0009955986686503943,
+ "loss": 0.0466,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1501375.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.024297121912240982,
+ "skip_count": 1.0,
+ "step": 930,
+ "text_loss": 0.495676189661026
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.25,
+ "avg_layers": 28.0,
+ "epoch": 4.375697094217787,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.11181640625,
+ "learning_rate": 0.0009955575959549202,
+ "loss": 0.0424,
+ "macro_f1": 0.7795917987823486,
+ "num_tokens": 1504363.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.12196464836597443,
+ "skip_count": 4.0,
+ "step": 932,
+ "text_loss": 0.26123273372650146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.3850895215732315,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1708984375,
+ "learning_rate": 0.0009955163333608408,
+ "loss": 0.0538,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1507178.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012947078794240952,
+ "skip_count": 0.0,
+ "step": 934,
+ "text_loss": 0.32552677392959595
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.394481948928676,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.0009954748808839674,
+ "loss": 0.0379,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1509910.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008946365676820278,
+ "skip_count": 0.0,
+ "step": 936,
+ "text_loss": 0.533141016960144
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.403874376284121,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.000995433238540185,
+ "loss": 0.0466,
+ "macro_f1": 0.6538461446762085,
+ "num_tokens": 1512826.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.029975678771734238,
+ "skip_count": 1.0,
+ "step": 938,
+ "text_loss": 0.2953577935695648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.413266803639566,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10888671875,
+ "learning_rate": 0.0009953914063454512,
+ "loss": 0.0497,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1517230.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0889134630560875,
+ "skip_count": 2.0,
+ "step": 940,
+ "text_loss": 0.5368834733963013
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.193359375,
+ "learning_rate": 0.000995349384315796,
+ "loss": 0.0413,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1519876.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013458753935992718,
+ "skip_count": 0.0,
+ "step": 942,
+ "text_loss": 0.2005518227815628
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 4.432051658350455,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1357421875,
+ "learning_rate": 0.000995307172467322,
+ "loss": 0.0444,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 1522998.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08850377053022385,
+ "skip_count": 1.0,
+ "step": 944,
+ "text_loss": 0.227926567196846
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.4414440857059,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1435546875,
+ "learning_rate": 0.0009952647708162054,
+ "loss": 0.0503,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1527100.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03199794515967369,
+ "skip_count": 1.0,
+ "step": 946,
+ "text_loss": 0.4859686493873596
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.450836513061344,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1748046875,
+ "learning_rate": 0.0009952221793786942,
+ "loss": 0.0354,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1530028.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006507779937237501,
+ "skip_count": 0.0,
+ "step": 948,
+ "text_loss": 0.6855354905128479
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.460228940416789,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10986328125,
+ "learning_rate": 0.0009951793981711097,
+ "loss": 0.0584,
+ "macro_f1": 0.6538461446762085,
+ "num_tokens": 1533254.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06175103038549423,
+ "skip_count": 1.0,
+ "step": 950,
+ "text_loss": 0.7590400576591492
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.469621367772234,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1025390625,
+ "learning_rate": 0.0009951364272098458,
+ "loss": 0.0295,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1536239.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03773383051156998,
+ "skip_count": 2.0,
+ "step": 952,
+ "text_loss": 0.669784665107727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.4790137951276785,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1748046875,
+ "learning_rate": 0.0009950932665113688,
+ "loss": 0.0507,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1539682.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07280613481998444,
+ "skip_count": 2.0,
+ "step": 954,
+ "text_loss": 0.3365570902824402
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.488406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12255859375,
+ "learning_rate": 0.0009950499160922184,
+ "loss": 0.0541,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1542875.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01770266517996788,
+ "skip_count": 0.0,
+ "step": 956,
+ "text_loss": 0.0921545997262001
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.497798649838567,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09375,
+ "learning_rate": 0.000995006375969006,
+ "loss": 0.0473,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1547135.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.07672002166509628,
+ "skip_count": 0.0,
+ "step": 958,
+ "text_loss": 0.5887606739997864
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.507191077194013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1376953125,
+ "learning_rate": 0.0009949626461584165,
+ "loss": 0.043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1550100.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006247182376682758,
+ "skip_count": 0.0,
+ "step": 960,
+ "text_loss": 0.5777931213378906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.516583504549457,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.119140625,
+ "learning_rate": 0.0009949187266772076,
+ "loss": 0.0366,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1553192.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030319908633828163,
+ "skip_count": 2.0,
+ "step": 962,
+ "text_loss": 0.2370252162218094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.5259759319049016,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.0009948746175422088,
+ "loss": 0.0511,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1556318.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006004320923238993,
+ "skip_count": 0.0,
+ "step": 964,
+ "text_loss": 0.6271032094955444
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15234375,
+ "learning_rate": 0.000994830318770323,
+ "loss": 0.0514,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1559195.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011544366367161274,
+ "skip_count": 0.0,
+ "step": 966,
+ "text_loss": 0.47256720066070557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 4.544760786615791,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.171875,
+ "learning_rate": 0.0009947858303785255,
+ "loss": 0.0374,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 1561813.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05258861929178238,
+ "skip_count": 1.0,
+ "step": 968,
+ "text_loss": 0.7703132629394531
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.554153213971236,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1142578125,
+ "learning_rate": 0.0009947411523838648,
+ "loss": 0.0453,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1564634.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011216280050575733,
+ "skip_count": 0.0,
+ "step": 970,
+ "text_loss": 0.4666804075241089
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1533203125,
+ "learning_rate": 0.0009946962848034608,
+ "loss": 0.0696,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1567959.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009387624450027943,
+ "skip_count": 0.0,
+ "step": 972,
+ "text_loss": 0.4067264199256897
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.5729380686821255,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.203125,
+ "learning_rate": 0.0009946512276545075,
+ "loss": 0.0397,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1571221.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.041713520884513855,
+ "skip_count": 0.0,
+ "step": 974,
+ "text_loss": 0.5242366194725037
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 4.58233049603757,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.0009946059809542705,
+ "loss": 0.0487,
+ "macro_f1": 0.7644445300102234,
+ "num_tokens": 1575033.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.05748331546783447,
+ "skip_count": 2.0,
+ "step": 976,
+ "text_loss": 0.5704690217971802
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 4.591722923393014,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1396484375,
+ "learning_rate": 0.0009945605447200887,
+ "loss": 0.0445,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1579050.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016765203326940536,
+ "skip_count": 0.0,
+ "step": 978,
+ "text_loss": 0.4804173707962036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.601115350748459,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1337890625,
+ "learning_rate": 0.0009945149189693732,
+ "loss": 0.0406,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1582967.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021518222987651825,
+ "skip_count": 2.0,
+ "step": 980,
+ "text_loss": 0.4138598144054413
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.610507778103904,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11474609375,
+ "learning_rate": 0.0009944691037196078,
+ "loss": 0.0456,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1586282.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012246460653841496,
+ "skip_count": 0.0,
+ "step": 982,
+ "text_loss": 0.22561736404895782
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 24.0,
+ "epoch": 4.6199002054593485,
+ "f1_execute": 0.930232584476471,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.8000000715255737,
+ "grad_norm": 0.1455078125,
+ "learning_rate": 0.0009944230989883491,
+ "loss": 0.0456,
+ "macro_f1": 0.7989664077758789,
+ "num_tokens": 1589279.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.09344895929098129,
+ "skip_count": 5.0,
+ "step": 984,
+ "text_loss": 0.4416656494140625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.629292632814793,
+ "f1_execute": 0.9411765336990356,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.111328125,
+ "learning_rate": 0.0009943769047932264,
+ "loss": 0.0404,
+ "macro_f1": 0.5359477400779724,
+ "num_tokens": 1592398.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.08916857838630676,
+ "skip_count": 2.0,
+ "step": 986,
+ "text_loss": 0.5536438822746277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.638685060170237,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15234375,
+ "learning_rate": 0.000994330521151941,
+ "loss": 0.039,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1596213.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06114347651600838,
+ "skip_count": 1.0,
+ "step": 988,
+ "text_loss": 0.5835405588150024
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1953125,
+ "learning_rate": 0.000994283948082267,
+ "loss": 0.0573,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1598827.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017335431184619665,
+ "skip_count": 0.0,
+ "step": 990,
+ "text_loss": 0.5857380032539368
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.657469914881127,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10693359375,
+ "learning_rate": 0.0009942371856020522,
+ "loss": 0.0341,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1602915.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014606470242142677,
+ "skip_count": 0.0,
+ "step": 992,
+ "text_loss": 0.6939892768859863
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 4.666862342236572,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.0009941902337292155,
+ "loss": 0.06,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 1605776.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.06297315657138824,
+ "skip_count": 1.0,
+ "step": 994,
+ "text_loss": 0.37616831064224243
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.676254769592017,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1083984375,
+ "learning_rate": 0.0009941430924817487,
+ "loss": 0.0572,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1609856.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03297794610261917,
+ "skip_count": 2.0,
+ "step": 996,
+ "text_loss": 0.2098303586244583
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.685647196947461,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10107421875,
+ "learning_rate": 0.000994095761877717,
+ "loss": 0.0499,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1612904.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012901155278086662,
+ "skip_count": 0.0,
+ "step": 998,
+ "text_loss": 0.20103533565998077
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 4.695039624302906,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.259765625,
+ "learning_rate": 0.000994048241935257,
+ "loss": 0.0535,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1615540.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.020434845238924026,
+ "skip_count": 0.0,
+ "step": 1000,
+ "text_loss": 0.32709044218063354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.70443205165835,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1669921875,
+ "learning_rate": 0.0009940005326725789,
+ "loss": 0.0453,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1618786.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07831378281116486,
+ "skip_count": 2.0,
+ "step": 1002,
+ "text_loss": 0.5789632797241211
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.713824479013795,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.21875,
+ "learning_rate": 0.0009939526341079647,
+ "loss": 0.0511,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1621736.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04863874986767769,
+ "skip_count": 0.0,
+ "step": 1004,
+ "text_loss": 0.6128849387168884
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1435546875,
+ "learning_rate": 0.0009939045462597693,
+ "loss": 0.0538,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1624649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00677989237010479,
+ "skip_count": 0.0,
+ "step": 1006,
+ "text_loss": 0.6168264150619507
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.732609333724684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009938562691464202,
+ "loss": 0.0524,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1627700.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019490402191877365,
+ "skip_count": 0.0,
+ "step": 1008,
+ "text_loss": 0.17463822662830353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.742001761080129,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1298828125,
+ "learning_rate": 0.000993807802786417,
+ "loss": 0.0475,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1630714.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019022391643375158,
+ "skip_count": 0.0,
+ "step": 1010,
+ "text_loss": 0.5675593018531799
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 29.0,
+ "epoch": 4.751394188435574,
+ "f1_execute": 0.9599999785423279,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1640625,
+ "learning_rate": 0.0009937591471983322,
+ "loss": 0.0501,
+ "macro_f1": 0.7644444704055786,
+ "num_tokens": 1633770.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.042485643178224564,
+ "skip_count": 2.0,
+ "step": 1012,
+ "text_loss": 0.42387229204177856
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.760786615791019,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1396484375,
+ "learning_rate": 0.0009937103024008109,
+ "loss": 0.0545,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1637120.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09427817165851593,
+ "skip_count": 1.0,
+ "step": 1014,
+ "text_loss": 0.49511051177978516
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12890625,
+ "learning_rate": 0.0009936612684125702,
+ "loss": 0.0503,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1640165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005106127820909023,
+ "skip_count": 0.0,
+ "step": 1016,
+ "text_loss": 0.5398799180984497
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.7795714705019074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2734375,
+ "learning_rate": 0.0009936120452524004,
+ "loss": 0.0506,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1643251.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016914300620555878,
+ "skip_count": 0.0,
+ "step": 1018,
+ "text_loss": 0.20882178843021393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.788963897857353,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1962890625,
+ "learning_rate": 0.0009935626329391637,
+ "loss": 0.0537,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1646560.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.13481520116329193,
+ "skip_count": 2.0,
+ "step": 1020,
+ "text_loss": 0.5719883441925049
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.798356325212797,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009935130314917948,
+ "loss": 0.0602,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1649538.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07700438797473907,
+ "skip_count": 2.0,
+ "step": 1022,
+ "text_loss": 0.1303367167711258
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.807748752568242,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1494140625,
+ "learning_rate": 0.0009934632409293015,
+ "loss": 0.0611,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1652397.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11416907608509064,
+ "skip_count": 1.0,
+ "step": 1024,
+ "text_loss": 0.24076920747756958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 4.817141179923686,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.306640625,
+ "learning_rate": 0.0009934132612707631,
+ "loss": 0.0507,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 1654938.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09484589844942093,
+ "skip_count": 2.0,
+ "step": 1026,
+ "text_loss": 0.1652517318725586
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.826533607279131,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1435546875,
+ "learning_rate": 0.0009933630925353324,
+ "loss": 0.0395,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1658536.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00741987070068717,
+ "skip_count": 0.0,
+ "step": 1028,
+ "text_loss": 0.49296700954437256
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.835926034634576,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1845703125,
+ "learning_rate": 0.0009933127347422337,
+ "loss": 0.0602,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1661446.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08399344235658646,
+ "skip_count": 2.0,
+ "step": 1030,
+ "text_loss": 0.22363591194152832
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.158203125,
+ "learning_rate": 0.0009932621879107648,
+ "loss": 0.0475,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1664612.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031781597062945366,
+ "skip_count": 0.0,
+ "step": 1032,
+ "text_loss": 0.36083245277404785
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.854710889345466,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2275390625,
+ "learning_rate": 0.000993211452060295,
+ "loss": 0.042,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1667467.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03595469892024994,
+ "skip_count": 1.0,
+ "step": 1034,
+ "text_loss": 0.16372856497764587
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.86410331670091,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.189453125,
+ "learning_rate": 0.000993160527210266,
+ "loss": 0.061,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1670675.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.1597205102443695,
+ "skip_count": 0.0,
+ "step": 1036,
+ "text_loss": 0.6049913763999939
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.873495744056354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2197265625,
+ "learning_rate": 0.000993109413380193,
+ "loss": 0.0562,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1673477.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009756010957062244,
+ "skip_count": 0.0,
+ "step": 1038,
+ "text_loss": 0.7034620642662048
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 4.882888171411799,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
+ "learning_rate": 0.0009930581105896624,
+ "loss": 0.0559,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1676809.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.020718922838568687,
+ "skip_count": 0.0,
+ "step": 1040,
+ "text_loss": 0.2814720571041107
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.892280598767244,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1923828125,
+ "learning_rate": 0.0009930066188583338,
+ "loss": 0.0445,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1679398.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04755603149533272,
+ "skip_count": 1.0,
+ "step": 1042,
+ "text_loss": 0.5445759296417236
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.901673026122689,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.126953125,
+ "learning_rate": 0.0009929549382059388,
+ "loss": 0.0509,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1682269.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01040949858725071,
+ "skip_count": 0.0,
+ "step": 1044,
+ "text_loss": 0.2876914143562317
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.911065453478133,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1259765625,
+ "learning_rate": 0.0009929030686522816,
+ "loss": 0.0363,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1685428.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008158888667821884,
+ "skip_count": 0.0,
+ "step": 1046,
+ "text_loss": 0.49053525924682617
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.9204578808335775,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1630859375,
+ "learning_rate": 0.0009928510102172386,
+ "loss": 0.0498,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1688252.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005102572031319141,
+ "skip_count": 0.0,
+ "step": 1048,
+ "text_loss": 0.5274341106414795
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.929850308189023,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1591796875,
+ "learning_rate": 0.0009927987629207587,
+ "loss": 0.0564,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1691289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016768503934144974,
+ "skip_count": 0.0,
+ "step": 1050,
+ "text_loss": 0.9935035109519958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.939242735544467,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1455078125,
+ "learning_rate": 0.0009927463267828634,
+ "loss": 0.0488,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1694148.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010905829258263111,
+ "skip_count": 0.0,
+ "step": 1052,
+ "text_loss": 0.20895758271217346
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.948635162899912,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1455078125,
+ "learning_rate": 0.000992693701823646,
+ "loss": 0.0624,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1698543.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.10533971339464188,
+ "skip_count": 0.0,
+ "step": 1054,
+ "text_loss": 0.5776236653327942
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.958027590255357,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.255859375,
+ "learning_rate": 0.0009926408880632726,
+ "loss": 0.0556,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1702460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026313411071896553,
+ "skip_count": 1.0,
+ "step": 1056,
+ "text_loss": 0.34990596771240234
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.967420017610801,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.099609375,
+ "learning_rate": 0.0009925878855219818,
+ "loss": 0.0391,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1705686.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007763393223285675,
+ "skip_count": 0.0,
+ "step": 1058,
+ "text_loss": 0.4980163276195526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.976812444966246,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.177734375,
+ "learning_rate": 0.000992534694220084,
+ "loss": 0.0613,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1708739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03998444974422455,
+ "skip_count": 1.0,
+ "step": 1060,
+ "text_loss": 0.29092350602149963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.98620487232169,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1572265625,
+ "learning_rate": 0.000992481314177962,
+ "loss": 0.0312,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1711903.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06966045498847961,
+ "skip_count": 1.0,
+ "step": 1062,
+ "text_loss": 0.6267179250717163
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.995597299677136,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.244140625,
+ "learning_rate": 0.0009924277454160717,
+ "loss": 0.0548,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1715974.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05536063387989998,
+ "skip_count": 1.0,
+ "step": 1064,
+ "text_loss": 0.5813798904418945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.004696213677723,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.134765625,
+ "learning_rate": 0.0009923739879549402,
+ "loss": 0.0423,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1718828.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.020993782207369804,
+ "skip_count": 0.0,
+ "step": 1066,
+ "text_loss": 0.22665327787399292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.014088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0888671875,
+ "learning_rate": 0.0009923200418151677,
+ "loss": 0.0301,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1722419.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007351701147854328,
+ "skip_count": 0.0,
+ "step": 1068,
+ "text_loss": 0.5796169638633728
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.0234810683886115,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.142578125,
+ "learning_rate": 0.0009922659070174264,
+ "loss": 0.0452,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1725663.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.026033315807580948,
+ "skip_count": 0.0,
+ "step": 1070,
+ "text_loss": 0.25742828845977783
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0009922115835824612,
+ "loss": 0.041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1729239.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0118600158020854,
+ "skip_count": 0.0,
+ "step": 1072,
+ "text_loss": 0.21630282700061798
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 5.042265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12158203125,
+ "learning_rate": 0.0009921570715310884,
+ "loss": 0.0364,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 1732507.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016118815168738365,
+ "skip_count": 0.0,
+ "step": 1074,
+ "text_loss": 0.5639925003051758
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.051658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0791015625,
+ "learning_rate": 0.0009921023708841974,
+ "loss": 0.0407,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1736182.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004275390412658453,
+ "skip_count": 0.0,
+ "step": 1076,
+ "text_loss": 0.5758615136146545
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1103515625,
+ "learning_rate": 0.0009920474816627496,
+ "loss": 0.037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1739559.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01299292128533125,
+ "skip_count": 0.0,
+ "step": 1078,
+ "text_loss": 0.18221625685691833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.0704432051658355,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1630859375,
+ "learning_rate": 0.0009919924038877788,
+ "loss": 0.0343,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1742890.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.038295745849609375,
+ "skip_count": 2.0,
+ "step": 1080,
+ "text_loss": 0.17354349792003632
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 29.0,
+ "epoch": 5.07983563252128,
+ "f1_execute": 0.9583333134651184,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.1884765625,
+ "learning_rate": 0.0009919371375803905,
+ "loss": 0.0455,
+ "macro_f1": 0.8194444179534912,
+ "num_tokens": 1746433.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04052971675992012,
+ "skip_count": 3.0,
+ "step": 1082,
+ "text_loss": 0.2250112146139145
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0009918816827617632,
+ "loss": 0.0353,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1750802.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009114136919379234,
+ "skip_count": 0.0,
+ "step": 1084,
+ "text_loss": 0.2526719272136688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.098620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1279296875,
+ "learning_rate": 0.000991826039453147,
+ "loss": 0.0392,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1754272.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004904678091406822,
+ "skip_count": 0.0,
+ "step": 1086,
+ "text_loss": 0.7308789491653442
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 5.108012914587614,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.138671875,
+ "learning_rate": 0.000991770207675865,
+ "loss": 0.0327,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 1757231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02129189297556877,
+ "skip_count": 2.0,
+ "step": 1088,
+ "text_loss": 0.21764220297336578
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.1174053419430585,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009917141874513113,
+ "loss": 0.0315,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1760003.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01310618408024311,
+ "skip_count": 0.0,
+ "step": 1090,
+ "text_loss": 0.33892181515693665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.126797769298503,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.171875,
+ "learning_rate": 0.0009916579788009537,
+ "loss": 0.0457,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1763052.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02059309557080269,
+ "skip_count": 2.0,
+ "step": 1092,
+ "text_loss": 0.6551769375801086
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.136190196653947,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10546875,
+ "learning_rate": 0.0009916015817463312,
+ "loss": 0.0385,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1766655.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0274797435849905,
+ "skip_count": 2.0,
+ "step": 1094,
+ "text_loss": 0.3984372019767761
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11181640625,
+ "learning_rate": 0.000991544996309055,
+ "loss": 0.0271,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1769997.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01437368243932724,
+ "skip_count": 0.0,
+ "step": 1096,
+ "text_loss": 0.4203338921070099
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.154975051364837,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1103515625,
+ "learning_rate": 0.000991488222510809,
+ "loss": 0.0292,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1773130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001382062560878694,
+ "skip_count": 0.0,
+ "step": 1098,
+ "text_loss": 0.43132516741752625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.164367478720282,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.123046875,
+ "learning_rate": 0.000991431260373349,
+ "loss": 0.0329,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1775682.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.1115434318780899,
+ "skip_count": 2.0,
+ "step": 1100,
+ "text_loss": 0.3218227028846741
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.111328125,
+ "learning_rate": 0.000991374109918503,
+ "loss": 0.0185,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1778407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009529678151011467,
+ "skip_count": 0.0,
+ "step": 1102,
+ "text_loss": 0.17183731496334076
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.183152333431171,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1142578125,
+ "learning_rate": 0.000991316771168171,
+ "loss": 0.044,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1781518.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018668074160814285,
+ "skip_count": 2.0,
+ "step": 1104,
+ "text_loss": 1.1324785947799683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.192544760786616,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.125,
+ "learning_rate": 0.0009912592441443258,
+ "loss": 0.0411,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1784878.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04145100712776184,
+ "skip_count": 1.0,
+ "step": 1106,
+ "text_loss": 0.6082063317298889
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.20193718814206,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08984375,
+ "learning_rate": 0.0009912015288690112,
+ "loss": 0.0421,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1788978.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021450644358992577,
+ "skip_count": 1.0,
+ "step": 1108,
+ "text_loss": 0.5597621202468872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.2113296154975055,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.083984375,
+ "learning_rate": 0.0009911436253643444,
+ "loss": 0.0238,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1792321.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017405325546860695,
+ "skip_count": 0.0,
+ "step": 1110,
+ "text_loss": 0.2560598850250244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.0009910855336525137,
+ "loss": 0.0383,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1795182.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007162237539887428,
+ "skip_count": 0.0,
+ "step": 1112,
+ "text_loss": 0.3438240587711334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 5.230114470208394,
+ "f1_execute": 0.9411765336990356,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.115234375,
+ "learning_rate": 0.00099102725375578,
+ "loss": 0.0326,
+ "macro_f1": 0.480392187833786,
+ "num_tokens": 1798987.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11149197816848755,
+ "skip_count": 3.0,
+ "step": 1114,
+ "text_loss": 0.20455503463745117
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.239506897563839,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10791015625,
+ "learning_rate": 0.0009909687856964767,
+ "loss": 0.035,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 1802064.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.12679415941238403,
+ "skip_count": 3.0,
+ "step": 1116,
+ "text_loss": 0.11996729671955109
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.248899324919284,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.12451171875,
+ "learning_rate": 0.0009909101294970082,
+ "loss": 0.0365,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1805412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05108053982257843,
+ "skip_count": 2.0,
+ "step": 1118,
+ "text_loss": 0.13224145770072937
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 5.258291752274729,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.123046875,
+ "learning_rate": 0.0009908512851798522,
+ "loss": 0.0455,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 1808196.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02131766639649868,
+ "skip_count": 1.0,
+ "step": 1120,
+ "text_loss": 0.7824069261550903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.138671875,
+ "learning_rate": 0.0009907922527675576,
+ "loss": 0.0405,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1811622.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006226244382560253,
+ "skip_count": 0.0,
+ "step": 1122,
+ "text_loss": 0.5419743061065674
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.277076606985618,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.12890625,
+ "learning_rate": 0.000990733032282746,
+ "loss": 0.0535,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1814628.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03088250942528248,
+ "skip_count": 2.0,
+ "step": 1124,
+ "text_loss": 0.37100958824157715
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 5.286469034341063,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0810546875,
+ "learning_rate": 0.000990673623748111,
+ "loss": 0.0348,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 1817205.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05495348572731018,
+ "skip_count": 1.0,
+ "step": 1126,
+ "text_loss": 0.20241330564022064
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 5.295861461696507,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.0009906140271864173,
+ "loss": 0.0433,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 1820141.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.037809282541275024,
+ "skip_count": 2.0,
+ "step": 1128,
+ "text_loss": 0.32965806126594543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 5.305253889051952,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0908203125,
+ "learning_rate": 0.0009905542426205032,
+ "loss": 0.0348,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 1824011.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03320181369781494,
+ "skip_count": 1.0,
+ "step": 1130,
+ "text_loss": 0.36329755187034607
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.314646316407397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0009904942700732777,
+ "loss": 0.0335,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1826873.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004102326463907957,
+ "skip_count": 0.0,
+ "step": 1132,
+ "text_loss": 0.6692602038383484
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.324038743762841,
+ "f1_execute": 0.8799999952316284,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08544921875,
+ "learning_rate": 0.0009904341095677226,
+ "loss": 0.03,
+ "macro_f1": 0.29333335161209106,
+ "num_tokens": 1830103.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.2376193106174469,
+ "skip_count": 4.0,
+ "step": 1134,
+ "text_loss": 0.19212862849235535
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.333431171118286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.119140625,
+ "learning_rate": 0.0009903737611268919,
+ "loss": 0.0445,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1833201.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005253395065665245,
+ "skip_count": 0.0,
+ "step": 1136,
+ "text_loss": 0.6773360371589661
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.34282359847373,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09814453125,
+ "learning_rate": 0.0009903132247739107,
+ "loss": 0.0305,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 1836045.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.14382585883140564,
+ "skip_count": 3.0,
+ "step": 1138,
+ "text_loss": 0.2882297933101654
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.3522160258291755,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.150390625,
+ "learning_rate": 0.0009902525005319766,
+ "loss": 0.04,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 1839721.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04033960774540901,
+ "skip_count": 2.0,
+ "step": 1140,
+ "text_loss": 0.7172559499740601
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 5.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12109375,
+ "learning_rate": 0.0009901915884243597,
+ "loss": 0.0351,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 1842614.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005162308923900127,
+ "skip_count": 0.0,
+ "step": 1142,
+ "text_loss": 0.42892804741859436
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.371000880540064,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1240234375,
+ "learning_rate": 0.0009901304884744014,
+ "loss": 0.0386,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1845444.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.10117656737565994,
+ "skip_count": 2.0,
+ "step": 1144,
+ "text_loss": 0.20806430280208588
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.380393307895509,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.130859375,
+ "learning_rate": 0.0009900692007055152,
+ "loss": 0.0357,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1848558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014107038266956806,
+ "skip_count": 0.0,
+ "step": 1146,
+ "text_loss": 0.5355974435806274
+ },
+ {
+ "acc_repeat": 0.25,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 5.389785735250954,
+ "f1_execute": 0.9166666865348816,
+ "f1_repeat": 0.4000000059604645,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.16015625,
+ "learning_rate": 0.000990007725141187,
+ "loss": 0.0449,
+ "macro_f1": 0.6611111164093018,
+ "num_tokens": 1852723.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.15537866950035095,
+ "skip_count": 2.0,
+ "step": 1148,
+ "text_loss": 0.6388513445854187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.399178162606399,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1181640625,
+ "learning_rate": 0.0009899460618049741,
+ "loss": 0.0397,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1856181.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011800912208855152,
+ "skip_count": 0.0,
+ "step": 1150,
+ "text_loss": 0.6113069653511047
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 5.408570589961843,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1005859375,
+ "learning_rate": 0.000989884210720506,
+ "loss": 0.0331,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 1859685.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.022900646552443504,
+ "skip_count": 0.0,
+ "step": 1152,
+ "text_loss": 0.25718021392822266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.4179630173172875,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0009898221719114844,
+ "loss": 0.0354,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1862505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026814989745616913,
+ "skip_count": 1.0,
+ "step": 1154,
+ "text_loss": 0.5426549911499023
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.427355444672733,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1015625,
+ "learning_rate": 0.0009897599454016823,
+ "loss": 0.0401,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1866266.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032623792067170143,
+ "skip_count": 0.0,
+ "step": 1156,
+ "text_loss": 0.37752896547317505
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.436747872028177,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07080078125,
+ "learning_rate": 0.0009896975312149454,
+ "loss": 0.0369,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1870216.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015617577359080315,
+ "skip_count": 0.0,
+ "step": 1158,
+ "text_loss": 0.18207129836082458
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.446140299383622,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11669921875,
+ "learning_rate": 0.0009896349293751906,
+ "loss": 0.0423,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1873338.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02250153198838234,
+ "skip_count": 1.0,
+ "step": 1160,
+ "text_loss": 0.548884391784668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.455532726739067,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1484375,
+ "learning_rate": 0.0009895721399064072,
+ "loss": 0.0388,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1876470.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.055204521864652634,
+ "skip_count": 1.0,
+ "step": 1162,
+ "text_loss": 0.48052409291267395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.464925154094511,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.0009895091628326564,
+ "loss": 0.0293,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1879354.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009093789383769035,
+ "skip_count": 0.0,
+ "step": 1164,
+ "text_loss": 0.3908069431781769
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.474317581449956,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.000989445998178071,
+ "loss": 0.0323,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1881941.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015086972154676914,
+ "skip_count": 1.0,
+ "step": 1166,
+ "text_loss": 0.4884725511074066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.4837100088054,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.134765625,
+ "learning_rate": 0.0009893826459668558,
+ "loss": 0.0386,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1885374.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06587666273117065,
+ "skip_count": 3.0,
+ "step": 1168,
+ "text_loss": 0.12760137021541595
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1591796875,
+ "learning_rate": 0.0009893191062232873,
+ "loss": 0.0322,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1888612.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006088624242693186,
+ "skip_count": 0.0,
+ "step": 1170,
+ "text_loss": 0.4821319580078125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1279296875,
+ "learning_rate": 0.0009892553789717143,
+ "loss": 0.0389,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1891463.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010113578289747238,
+ "skip_count": 0.0,
+ "step": 1172,
+ "text_loss": 0.3613642454147339
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.5118872908717345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1025390625,
+ "learning_rate": 0.0009891914642365573,
+ "loss": 0.0404,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1894230.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004947459790855646,
+ "skip_count": 0.0,
+ "step": 1174,
+ "text_loss": 0.5037549138069153
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.521279718227179,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1572265625,
+ "learning_rate": 0.0009891273620423083,
+ "loss": 0.0428,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1897294.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.026075217872858047,
+ "skip_count": 0.0,
+ "step": 1176,
+ "text_loss": 0.32558977603912354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.530672145582624,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12158203125,
+ "learning_rate": 0.0009890630724135314,
+ "loss": 0.0351,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1901553.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06650999188423157,
+ "skip_count": 1.0,
+ "step": 1178,
+ "text_loss": 0.23473620414733887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 5.540064572938069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.0009889985953748625,
+ "loss": 0.0268,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 1904556.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010361116379499435,
+ "skip_count": 1.0,
+ "step": 1180,
+ "text_loss": 0.6927042007446289
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.549457000293513,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.103515625,
+ "learning_rate": 0.0009889339309510094,
+ "loss": 0.0351,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1908053.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013286533765494823,
+ "skip_count": 0.0,
+ "step": 1182,
+ "text_loss": 0.19977325201034546
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 28.0,
+ "epoch": 5.558849427648958,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.5,
+ "grad_norm": 0.058837890625,
+ "learning_rate": 0.0009888690791667518,
+ "loss": 0.0204,
+ "macro_f1": 0.7018141150474548,
+ "num_tokens": 1911754.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.11920545995235443,
+ "skip_count": 3.0,
+ "step": 1184,
+ "text_loss": 0.4072858691215515
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.568241855004403,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11083984375,
+ "learning_rate": 0.0009888040400469408,
+ "loss": 0.0391,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1914862.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03652849420905113,
+ "skip_count": 1.0,
+ "step": 1186,
+ "text_loss": 0.2654043138027191
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.577634282359847,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1689453125,
+ "learning_rate": 0.0009887388136164996,
+ "loss": 0.0336,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1918542.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03991910070180893,
+ "skip_count": 2.0,
+ "step": 1188,
+ "text_loss": 0.21130657196044922
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 5.587026709715292,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09521484375,
+ "learning_rate": 0.000988673399900423,
+ "loss": 0.0429,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1921589.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014900135807693005,
+ "skip_count": 0.0,
+ "step": 1190,
+ "text_loss": 0.5519335865974426
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.596419137070737,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1884765625,
+ "learning_rate": 0.0009886077989237777,
+ "loss": 0.0405,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1924320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06271552294492722,
+ "skip_count": 1.0,
+ "step": 1192,
+ "text_loss": 0.213813915848732
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 24.0,
+ "epoch": 5.6058115644261814,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.1875,
+ "learning_rate": 0.000988542010711702,
+ "loss": 0.0342,
+ "macro_f1": 0.6225374937057495,
+ "num_tokens": 1927178.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03081391751766205,
+ "skip_count": 5.0,
+ "step": 1194,
+ "text_loss": 0.7524349093437195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.615203991781626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.255859375,
+ "learning_rate": 0.0009884760352894064,
+ "loss": 0.0518,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1930216.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008556773886084557,
+ "skip_count": 0.0,
+ "step": 1196,
+ "text_loss": 0.28230375051498413
+ },
+ {
+ "acc_repeat": 0.3333333432674408,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 5.62459641913707,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.5,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1064453125,
+ "learning_rate": 0.0009884098726821726,
+ "loss": 0.0472,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 1933312.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.05344727262854576,
+ "skip_count": 0.0,
+ "step": 1198,
+ "text_loss": 0.5509607195854187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 5.633988846492516,
+ "f1_execute": 0.9411765336990356,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.1298828125,
+ "learning_rate": 0.000988343522915354,
+ "loss": 0.0441,
+ "macro_f1": 0.480392187833786,
+ "num_tokens": 1936160.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.07324771583080292,
+ "skip_count": 3.0,
+ "step": 1200,
+ "text_loss": 0.30565372109413147
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 25.0,
+ "epoch": 5.64338127384796,
+ "f1_execute": 0.8936169743537903,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.444444477558136,
+ "grad_norm": 0.2470703125,
+ "learning_rate": 0.0009882769860143764,
+ "loss": 0.0317,
+ "macro_f1": 0.4460204839706421,
+ "num_tokens": 1939266.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.18620699644088745,
+ "skip_count": 6.0,
+ "step": 1202,
+ "text_loss": 0.976121723651886
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 5.6527737012034045,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1318359375,
+ "learning_rate": 0.000988210262004737,
+ "loss": 0.0474,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 1942173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007703613489866257,
+ "skip_count": 1.0,
+ "step": 1204,
+ "text_loss": 0.5647401809692383
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.66216612855885,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1484375,
+ "learning_rate": 0.0009881433509120036,
+ "loss": 0.0376,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1945071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02162683941423893,
+ "skip_count": 2.0,
+ "step": 1206,
+ "text_loss": 0.24229218065738678
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.671558555914294,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0966796875,
+ "learning_rate": 0.0009880762527618176,
+ "loss": 0.0383,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1949060.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017667081207036972,
+ "skip_count": 0.0,
+ "step": 1208,
+ "text_loss": 0.4035970866680145
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.680950983269739,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.0009880089675798908,
+ "loss": 0.0367,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1951698.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006405784282833338,
+ "skip_count": 0.0,
+ "step": 1210,
+ "text_loss": 0.5319879055023193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.690343410625183,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09814453125,
+ "learning_rate": 0.0009879414953920071,
+ "loss": 0.0294,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1955266.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009859707206487656,
+ "skip_count": 0.0,
+ "step": 1212,
+ "text_loss": 0.6687407493591309
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.699735837980628,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.130859375,
+ "learning_rate": 0.0009878738362240219,
+ "loss": 0.045,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1958538.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030890554189682007,
+ "skip_count": 2.0,
+ "step": 1214,
+ "text_loss": 0.20820017158985138
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 5.709128265336073,
+ "f1_execute": 0.9200000166893005,
+ "f1_repeat": 0.5,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
+ "learning_rate": 0.000987805990101862,
+ "loss": 0.0317,
+ "macro_f1": 0.47333335876464844,
+ "num_tokens": 1961419.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.10383198410272598,
+ "skip_count": 2.0,
+ "step": 1216,
+ "text_loss": 0.8664976358413696
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.718520692691517,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1435546875,
+ "learning_rate": 0.0009877379570515268,
+ "loss": 0.0366,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1964836.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013376163318753242,
+ "skip_count": 0.0,
+ "step": 1218,
+ "text_loss": 0.4223395884037018
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.727913120046962,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0859375,
+ "learning_rate": 0.0009876697370990865,
+ "loss": 0.0343,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1967620.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008577900938689709,
+ "skip_count": 0.0,
+ "step": 1220,
+ "text_loss": 0.4789901375770569
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 0.0009876013302706828,
+ "loss": 0.049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1971100.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004730266984552145,
+ "skip_count": 0.0,
+ "step": 1222,
+ "text_loss": 0.6799837946891785
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.7466979747578515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08349609375,
+ "learning_rate": 0.0009875327365925295,
+ "loss": 0.0341,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1974408.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010849526152014732,
+ "skip_count": 0.0,
+ "step": 1224,
+ "text_loss": 0.18967926502227783
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 5.756090402113296,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.169921875,
+ "learning_rate": 0.0009874639560909118,
+ "loss": 0.0498,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 1977046.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04841252416372299,
+ "skip_count": 1.0,
+ "step": 1226,
+ "text_loss": 0.6133310198783875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.765482829468741,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1318359375,
+ "learning_rate": 0.0009873949887921867,
+ "loss": 0.0402,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1980330.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.029638588428497314,
+ "skip_count": 1.0,
+ "step": 1228,
+ "text_loss": 0.15649555623531342
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 5.774875256824186,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1103515625,
+ "learning_rate": 0.0009873258347227823,
+ "loss": 0.0331,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1983173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009955910965800285,
+ "skip_count": 0.0,
+ "step": 1230,
+ "text_loss": 0.4741005599498749
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0849609375,
+ "learning_rate": 0.0009872564939091989,
+ "loss": 0.0342,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1986825.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010205300524830818,
+ "skip_count": 0.0,
+ "step": 1232,
+ "text_loss": 0.5315462350845337
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5714285969734192,
+ "avg_layers": 25.0,
+ "epoch": 5.7936601115350745,
+ "f1_execute": 0.9302325248718262,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.7272727489471436,
+ "grad_norm": 0.11865234375,
+ "learning_rate": 0.0009871869663780077,
+ "loss": 0.0336,
+ "macro_f1": 0.8858351111412048,
+ "num_tokens": 1990448.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09120134264230728,
+ "skip_count": 7.0,
+ "step": 1234,
+ "text_loss": 0.6187508702278137
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 5.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.125,
+ "learning_rate": 0.0009871172521558522,
+ "loss": 0.0475,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 1993474.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016188839450478554,
+ "skip_count": 1.0,
+ "step": 1236,
+ "text_loss": 0.20783066749572754
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 5.812444966245964,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.216796875,
+ "learning_rate": 0.0009870473512694465,
+ "loss": 0.0373,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 1996536.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05046704784035683,
+ "skip_count": 3.0,
+ "step": 1238,
+ "text_loss": 0.247748002409935
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 5.821837393601409,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.09033203125,
+ "learning_rate": 0.0009869772637455772,
+ "loss": 0.0251,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 1999530.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.044926248490810394,
+ "skip_count": 2.0,
+ "step": 1240,
+ "text_loss": 0.26001980900764465
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 5.831229820956853,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1513671875,
+ "learning_rate": 0.000986906989611102,
+ "loss": 0.0446,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2002782.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025911526754498482,
+ "skip_count": 0.0,
+ "step": 1242,
+ "text_loss": 0.9009982943534851
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.8406222483122985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.115234375,
+ "learning_rate": 0.0009868365288929492,
+ "loss": 0.0371,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2005331.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0043760035187006,
+ "skip_count": 0.0,
+ "step": 1244,
+ "text_loss": 0.5547386407852173
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.850014675667743,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1005859375,
+ "learning_rate": 0.0009867658816181206,
+ "loss": 0.0374,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2008115.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009227181784808636,
+ "skip_count": 0.0,
+ "step": 1246,
+ "text_loss": 1.0067731142044067
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.859407103023187,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.126953125,
+ "learning_rate": 0.000986695047813688,
+ "loss": 0.0261,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2011137.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023822437971830368,
+ "skip_count": 0.0,
+ "step": 1248,
+ "text_loss": 0.30058956146240234
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 5.868799530378633,
+ "f1_execute": 0.9200000166893005,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.1044921875,
+ "learning_rate": 0.0009866240275067948,
+ "loss": 0.044,
+ "macro_f1": 0.47333335876464844,
+ "num_tokens": 2014159.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.21523773670196533,
+ "skip_count": 3.0,
+ "step": 1250,
+ "text_loss": 0.39072203636169434
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.878191957734077,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1201171875,
+ "learning_rate": 0.0009865528207246563,
+ "loss": 0.0351,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2017731.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06184682995080948,
+ "skip_count": 2.0,
+ "step": 1252,
+ "text_loss": 0.35751575231552124
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.8875843850895215,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.166015625,
+ "learning_rate": 0.000986481427494559,
+ "loss": 0.0336,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2020485.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007573372684419155,
+ "skip_count": 0.0,
+ "step": 1254,
+ "text_loss": 0.4061077833175659
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.896976812444966,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1708984375,
+ "learning_rate": 0.000986409847843861,
+ "loss": 0.0382,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2024149.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.07447971403598785,
+ "skip_count": 0.0,
+ "step": 1256,
+ "text_loss": 0.41876497864723206
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.906369239800411,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.17578125,
+ "learning_rate": 0.000986338081799992,
+ "loss": 0.0351,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2026545.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006609147880226374,
+ "skip_count": 0.0,
+ "step": 1258,
+ "text_loss": 0.4673794209957123
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.915761667155856,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1123046875,
+ "learning_rate": 0.0009862661293904523,
+ "loss": 0.0498,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2029581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.10624702274799347,
+ "skip_count": 2.0,
+ "step": 1260,
+ "text_loss": 0.3483233153820038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1201171875,
+ "learning_rate": 0.0009861939906428145,
+ "loss": 0.0525,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2033936.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007944886572659016,
+ "skip_count": 0.0,
+ "step": 1262,
+ "text_loss": 0.16362667083740234
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 5.934546521866745,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11669921875,
+ "learning_rate": 0.0009861216655847225,
+ "loss": 0.0376,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2037876.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007004092447459698,
+ "skip_count": 0.0,
+ "step": 1264,
+ "text_loss": 0.43228110671043396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.94393894922219,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1005859375,
+ "learning_rate": 0.0009860491542438912,
+ "loss": 0.047,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2040842.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026916226372122765,
+ "skip_count": 1.0,
+ "step": 1266,
+ "text_loss": 0.5901188850402832
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.953331376577634,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0986328125,
+ "learning_rate": 0.000985976456648107,
+ "loss": 0.0353,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2043890.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007325216196477413,
+ "skip_count": 0.0,
+ "step": 1268,
+ "text_loss": 0.8780109882354736
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 5.962723803933079,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.10205078125,
+ "learning_rate": 0.000985903572825228,
+ "loss": 0.0306,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 2048848.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05007527023553848,
+ "skip_count": 2.0,
+ "step": 1270,
+ "text_loss": 0.5863722562789917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 5.972116231288524,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.173828125,
+ "learning_rate": 0.000985830502803183,
+ "loss": 0.0396,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2051561.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023995524272322655,
+ "skip_count": 0.0,
+ "step": 1272,
+ "text_loss": 0.7460709810256958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.9815086586439685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10205078125,
+ "learning_rate": 0.0009857572466099732,
+ "loss": 0.0431,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2054752.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006928362417966127,
+ "skip_count": 0.0,
+ "step": 1274,
+ "text_loss": 0.5130293369293213
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.990901085999413,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.162109375,
+ "learning_rate": 0.0009856838042736698,
+ "loss": 0.0501,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2058151.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006969396956264973,
+ "skip_count": 0.0,
+ "step": 1276,
+ "text_loss": 0.5911393761634827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1357421875,
+ "learning_rate": 0.0009856101758224166,
+ "loss": 0.0441,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2061012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003499418031424284,
+ "skip_count": 0.0,
+ "step": 1278,
+ "text_loss": 0.25347545742988586
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.000985536361284428,
+ "loss": 0.0229,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2064597.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007856054231524467,
+ "skip_count": 0.0,
+ "step": 1280,
+ "text_loss": 0.7476963400840759
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.01878485471089,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0888671875,
+ "learning_rate": 0.0009854623606879898,
+ "loss": 0.0245,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2067972.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02617792971432209,
+ "skip_count": 1.0,
+ "step": 1282,
+ "text_loss": 0.5775872468948364
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 6.028177282066334,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09033203125,
+ "learning_rate": 0.000985388174061459,
+ "loss": 0.0356,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 2071812.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.035979997366666794,
+ "skip_count": 1.0,
+ "step": 1284,
+ "text_loss": 0.2933400869369507
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.037569709421779,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08447265625,
+ "learning_rate": 0.0009853138014332646,
+ "loss": 0.0273,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2074868.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005142854526638985,
+ "skip_count": 0.0,
+ "step": 1286,
+ "text_loss": 0.29085102677345276
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.046962136777223,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09033203125,
+ "learning_rate": 0.0009852392428319058,
+ "loss": 0.0306,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2078225.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032799106556922197,
+ "skip_count": 0.0,
+ "step": 1288,
+ "text_loss": 0.7293626070022583
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 6.056354564132668,
+ "f1_execute": 0.9411765336990356,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.08935546875,
+ "learning_rate": 0.0009851644982859537,
+ "loss": 0.0273,
+ "macro_f1": 0.480392187833786,
+ "num_tokens": 2081495.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.12224318832159042,
+ "skip_count": 3.0,
+ "step": 1290,
+ "text_loss": 0.26125892996788025
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.065746991488113,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1435546875,
+ "learning_rate": 0.0009850895678240508,
+ "loss": 0.0283,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2084390.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010662888176739216,
+ "skip_count": 0.0,
+ "step": 1292,
+ "text_loss": 0.3510764539241791
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 6.075139418843557,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1689453125,
+ "learning_rate": 0.0009850144514749104,
+ "loss": 0.0332,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2087210.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01979079470038414,
+ "skip_count": 2.0,
+ "step": 1294,
+ "text_loss": 0.40202176570892334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 6.084531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11669921875,
+ "learning_rate": 0.000984939149267317,
+ "loss": 0.0253,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2090777.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005172552540898323,
+ "skip_count": 1.0,
+ "step": 1296,
+ "text_loss": 0.5275651216506958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.093924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009848636612301272,
+ "loss": 0.0299,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2094248.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029599082190543413,
+ "skip_count": 0.0,
+ "step": 1298,
+ "text_loss": 0.4517653286457062
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.103316700909891,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.23046875,
+ "learning_rate": 0.0009847879873922675,
+ "loss": 0.0357,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2097139.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011455860920250416,
+ "skip_count": 0.0,
+ "step": 1300,
+ "text_loss": 0.16888445615768433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.112709128265336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09619140625,
+ "learning_rate": 0.0009847121277827366,
+ "loss": 0.0301,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2100415.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008091195486485958,
+ "skip_count": 0.0,
+ "step": 1302,
+ "text_loss": 0.40061676502227783
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.122101555620781,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1123046875,
+ "learning_rate": 0.000984636082430604,
+ "loss": 0.0285,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2103285.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009593960829079151,
+ "skip_count": 0.0,
+ "step": 1304,
+ "text_loss": 0.7211073637008667
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.131493982976226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.107421875,
+ "learning_rate": 0.0009845598513650103,
+ "loss": 0.0231,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2106255.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023068038281053305,
+ "skip_count": 0.0,
+ "step": 1306,
+ "text_loss": 0.7077119946479797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.171875,
+ "learning_rate": 0.0009844834346151674,
+ "loss": 0.043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2109305.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007703019306063652,
+ "skip_count": 0.0,
+ "step": 1308,
+ "text_loss": 0.3534316122531891
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.1502788376871145,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1025390625,
+ "learning_rate": 0.0009844068322103585,
+ "loss": 0.0287,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2112216.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023549847304821014,
+ "skip_count": 1.0,
+ "step": 1310,
+ "text_loss": 0.6792599558830261
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.150390625,
+ "learning_rate": 0.0009843300441799378,
+ "loss": 0.0211,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2114925.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007605871185660362,
+ "skip_count": 0.0,
+ "step": 1312,
+ "text_loss": 0.1571389138698578
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.169063692398004,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.134765625,
+ "learning_rate": 0.0009842530705533304,
+ "loss": 0.0253,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2117744.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014964760281145573,
+ "skip_count": 0.0,
+ "step": 1314,
+ "text_loss": 0.7840361595153809
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.178456119753449,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.000984175911360033,
+ "loss": 0.0238,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2120848.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004663798492401838,
+ "skip_count": 0.0,
+ "step": 1316,
+ "text_loss": 0.536246120929718
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 6.187848547108893,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1201171875,
+ "learning_rate": 0.000984098566629613,
+ "loss": 0.0288,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2123651.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022852955386042595,
+ "skip_count": 2.0,
+ "step": 1318,
+ "text_loss": 0.43372172117233276
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.197240974464338,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07958984375,
+ "learning_rate": 0.0009840210363917087,
+ "loss": 0.0216,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2128011.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012578422203660011,
+ "skip_count": 0.0,
+ "step": 1320,
+ "text_loss": 0.28190380334854126
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10986328125,
+ "learning_rate": 0.0009839433206760306,
+ "loss": 0.0204,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2131035.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006863643880933523,
+ "skip_count": 0.0,
+ "step": 1322,
+ "text_loss": 0.6340444087982178
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.216025829175227,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1796875,
+ "learning_rate": 0.0009838654195123589,
+ "loss": 0.0243,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2133856.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00468854233622551,
+ "skip_count": 0.0,
+ "step": 1324,
+ "text_loss": 0.5138425827026367
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.225418256530672,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.115234375,
+ "learning_rate": 0.0009837873329305458,
+ "loss": 0.0396,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2136451.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005731126759201288,
+ "skip_count": 0.0,
+ "step": 1326,
+ "text_loss": 0.742124617099762
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.17578125,
+ "learning_rate": 0.000983709060960514,
+ "loss": 0.0416,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2139496.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0056343949399888515,
+ "skip_count": 0.0,
+ "step": 1328,
+ "text_loss": 0.7317464351654053
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.2442031112415615,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10791015625,
+ "learning_rate": 0.0009836306036322576,
+ "loss": 0.0312,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2143120.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005127966403961182,
+ "skip_count": 0.0,
+ "step": 1330,
+ "text_loss": 0.538652241230011
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 6.253595538597006,
+ "f1_execute": 0.9130434989929199,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.11083984375,
+ "learning_rate": 0.0009835519609758415,
+ "loss": 0.0301,
+ "macro_f1": 0.590062141418457,
+ "num_tokens": 2145807.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.1673707216978073,
+ "skip_count": 4.0,
+ "step": 1332,
+ "text_loss": 0.3498198091983795
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.262987965952451,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009834731330214017,
+ "loss": 0.0293,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2148397.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04026653990149498,
+ "skip_count": 0.0,
+ "step": 1334,
+ "text_loss": 0.8153424859046936
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 27.0,
+ "epoch": 6.272380393307896,
+ "f1_execute": 0.8999999761581421,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.8000000715255737,
+ "grad_norm": 0.16015625,
+ "learning_rate": 0.0009833941197991455,
+ "loss": 0.0329,
+ "macro_f1": 0.7888889312744141,
+ "num_tokens": 2152226.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.05481519177556038,
+ "skip_count": 5.0,
+ "step": 1336,
+ "text_loss": 0.7802760004997253
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 6.28177282066334,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.0009833149213393506,
+ "loss": 0.0304,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2156023.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01760484278202057,
+ "skip_count": 0.0,
+ "step": 1338,
+ "text_loss": 0.19721226394176483
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.2911652480187845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11474609375,
+ "learning_rate": 0.000983235537672366,
+ "loss": 0.0256,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2160037.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013206037692725658,
+ "skip_count": 0.0,
+ "step": 1340,
+ "text_loss": 0.5003817081451416
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.000983155968828612,
+ "loss": 0.0315,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2163910.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01256406120955944,
+ "skip_count": 0.0,
+ "step": 1342,
+ "text_loss": 0.5996923446655273
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.309950102729674,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11962890625,
+ "learning_rate": 0.0009830762148385793,
+ "loss": 0.0313,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2166921.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015086234547197819,
+ "skip_count": 1.0,
+ "step": 1344,
+ "text_loss": 0.45356282591819763
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.319342530085119,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08447265625,
+ "learning_rate": 0.0009829962757328297,
+ "loss": 0.0223,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2170135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07909081131219864,
+ "skip_count": 2.0,
+ "step": 1346,
+ "text_loss": 0.2874644994735718
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 6.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0009829161515419959,
+ "loss": 0.0246,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2173029.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013569854199886322,
+ "skip_count": 2.0,
+ "step": 1348,
+ "text_loss": 0.25533875823020935
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.3381273847960085,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0009828358422967823,
+ "loss": 0.0226,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2176605.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08111091703176498,
+ "skip_count": 1.0,
+ "step": 1350,
+ "text_loss": 0.32827726006507874
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 6.347519812151453,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.091796875,
+ "learning_rate": 0.0009827553480279627,
+ "loss": 0.03,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 2179406.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026550088077783585,
+ "skip_count": 2.0,
+ "step": 1352,
+ "text_loss": 0.2966301143169403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0791015625,
+ "learning_rate": 0.0009826746687663832,
+ "loss": 0.0301,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2182353.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003914554137736559,
+ "skip_count": 0.0,
+ "step": 1354,
+ "text_loss": 0.7596251964569092
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 6.366304666862343,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0859375,
+ "learning_rate": 0.0009825938045429602,
+ "loss": 0.0324,
+ "macro_f1": 0.5866667032241821,
+ "num_tokens": 2185786.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.059612665325403214,
+ "skip_count": 3.0,
+ "step": 1356,
+ "text_loss": 0.12325898557901382
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.375697094217787,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10009765625,
+ "learning_rate": 0.0009825127553886807,
+ "loss": 0.0375,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2190157.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0071132429875433445,
+ "skip_count": 0.0,
+ "step": 1358,
+ "text_loss": 0.9287898540496826
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.3850895215732315,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0986328125,
+ "learning_rate": 0.0009824315213346033,
+ "loss": 0.0348,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2193077.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009611099027097225,
+ "skip_count": 0.0,
+ "step": 1360,
+ "text_loss": 0.20427259802818298
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.394481948928676,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10888671875,
+ "learning_rate": 0.0009823501024118569,
+ "loss": 0.0285,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2196494.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006913455203175545,
+ "skip_count": 0.0,
+ "step": 1362,
+ "text_loss": 0.574759840965271
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.403874376284121,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0009822684986516411,
+ "loss": 0.0245,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2199839.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009208920411765575,
+ "skip_count": 0.0,
+ "step": 1364,
+ "text_loss": 0.42422571778297424
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.413266803639566,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.000982186710085227,
+ "loss": 0.0208,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2203212.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.059975091367959976,
+ "skip_count": 1.0,
+ "step": 1366,
+ "text_loss": 0.29213017225265503
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.25,
+ "avg_layers": 27.0,
+ "epoch": 6.42265923099501,
+ "f1_execute": 0.9411765336990356,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.181640625,
+ "learning_rate": 0.0009821047367439561,
+ "loss": 0.0358,
+ "macro_f1": 0.44705885648727417,
+ "num_tokens": 2206240.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.048244867473840714,
+ "skip_count": 4.0,
+ "step": 1368,
+ "text_loss": 0.3072395324707031
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.432051658350455,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11181640625,
+ "learning_rate": 0.0009820225786592405,
+ "loss": 0.0375,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2209903.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.026068156585097313,
+ "skip_count": 0.0,
+ "step": 1370,
+ "text_loss": 0.5961400270462036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.4414440857059,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.109375,
+ "learning_rate": 0.0009819402358625634,
+ "loss": 0.0366,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2213439.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022615568712353706,
+ "skip_count": 1.0,
+ "step": 1372,
+ "text_loss": 0.19375644624233246
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.450836513061344,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1240234375,
+ "learning_rate": 0.000981857708385479,
+ "loss": 0.0346,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2216457.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005855285096913576,
+ "skip_count": 0.0,
+ "step": 1374,
+ "text_loss": 0.5123368501663208
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.460228940416789,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09423828125,
+ "learning_rate": 0.0009817749962596114,
+ "loss": 0.0249,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2219975.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0651634931564331,
+ "skip_count": 0.0,
+ "step": 1376,
+ "text_loss": 0.5999220609664917
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09912109375,
+ "learning_rate": 0.0009816920995166568,
+ "loss": 0.0371,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2222833.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011408994905650616,
+ "skip_count": 0.0,
+ "step": 1378,
+ "text_loss": 0.5323230624198914
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.4790137951276785,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.205078125,
+ "learning_rate": 0.0009816090181883807,
+ "loss": 0.0313,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2225842.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.039720915257930756,
+ "skip_count": 2.0,
+ "step": 1380,
+ "text_loss": 0.23363439738750458
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.488406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12255859375,
+ "learning_rate": 0.0009815257523066204,
+ "loss": 0.0249,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2229430.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002765297656878829,
+ "skip_count": 0.0,
+ "step": 1382,
+ "text_loss": 0.718977689743042
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.497798649838567,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.130859375,
+ "learning_rate": 0.0009814423019032835,
+ "loss": 0.0396,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2232594.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.05362323671579361,
+ "skip_count": 0.0,
+ "step": 1384,
+ "text_loss": 0.6392166614532471
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.507191077194013,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.150390625,
+ "learning_rate": 0.0009813586670103483,
+ "loss": 0.0426,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 2236327.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.031728316098451614,
+ "skip_count": 1.0,
+ "step": 1386,
+ "text_loss": 0.5951619148254395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 6.516583504549457,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.126953125,
+ "learning_rate": 0.0009812748476598638,
+ "loss": 0.031,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2239746.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03981253132224083,
+ "skip_count": 2.0,
+ "step": 1388,
+ "text_loss": 0.22756551206111908
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 6.5259759319049016,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.12451171875,
+ "learning_rate": 0.0009811908438839498,
+ "loss": 0.0331,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2242786.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04617162421345711,
+ "skip_count": 2.0,
+ "step": 1390,
+ "text_loss": 0.3233799934387207
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.535368359260346,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.000981106655714797,
+ "loss": 0.0358,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2245696.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.046828847378492355,
+ "skip_count": 1.0,
+ "step": 1392,
+ "text_loss": 0.24273279309272766
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 6.544760786615791,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.0009810222831846656,
+ "loss": 0.0307,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2249326.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010921589098870754,
+ "skip_count": 2.0,
+ "step": 1394,
+ "text_loss": 0.3921460807323456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 6.554153213971236,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09423828125,
+ "learning_rate": 0.0009809377263258882,
+ "loss": 0.0315,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 2253393.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04564022272825241,
+ "skip_count": 1.0,
+ "step": 1396,
+ "text_loss": 0.582602858543396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 6.56354564132668,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.103515625,
+ "learning_rate": 0.000980852985170867,
+ "loss": 0.0328,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2256626.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013289985246956348,
+ "skip_count": 0.0,
+ "step": 1398,
+ "text_loss": 0.41031694412231445
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.5729380686821255,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 0.0009807680597520745,
+ "loss": 0.0264,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2259326.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0065213534981012344,
+ "skip_count": 0.0,
+ "step": 1400,
+ "text_loss": 0.2888098657131195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.58233049603757,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.23046875,
+ "learning_rate": 0.0009806829501020546,
+ "loss": 0.0358,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2262344.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04199840500950813,
+ "skip_count": 1.0,
+ "step": 1402,
+ "text_loss": 0.31973034143447876
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.591722923393014,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08935546875,
+ "learning_rate": 0.0009805976562534215,
+ "loss": 0.0317,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 2266354.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015434930101037025,
+ "skip_count": 1.0,
+ "step": 1404,
+ "text_loss": 0.508630633354187
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 6.601115350748459,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.0009805121782388599,
+ "loss": 0.0339,
+ "macro_f1": 0.6533333659172058,
+ "num_tokens": 2269660.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0720924660563469,
+ "skip_count": 2.0,
+ "step": 1406,
+ "text_loss": 0.40927737951278687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 6.610507778103904,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0009804265160911253,
+ "loss": 0.0266,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2273335.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02400495670735836,
+ "skip_count": 2.0,
+ "step": 1408,
+ "text_loss": 0.1777762621641159
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.6199002054593485,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2314453125,
+ "learning_rate": 0.0009803406698430433,
+ "loss": 0.0371,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2277107.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02560107782483101,
+ "skip_count": 1.0,
+ "step": 1410,
+ "text_loss": 0.17955881357192993
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.629292632814793,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.0009802546395275104,
+ "loss": 0.0349,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2281638.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006655813194811344,
+ "skip_count": 0.0,
+ "step": 1412,
+ "text_loss": 0.20882295072078705
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 28.0,
+ "epoch": 6.638685060170237,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.08740234375,
+ "learning_rate": 0.000980168425177494,
+ "loss": 0.0342,
+ "macro_f1": 0.8200000524520874,
+ "num_tokens": 2284876.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06325097382068634,
+ "skip_count": 3.0,
+ "step": 1414,
+ "text_loss": 0.26035264134407043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.648077487525683,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.138671875,
+ "learning_rate": 0.000980082026826031,
+ "loss": 0.0315,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2288938.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013436575420200825,
+ "skip_count": 0.0,
+ "step": 1416,
+ "text_loss": 0.5502325892448425
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.657469914881127,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07177734375,
+ "learning_rate": 0.0009799954445062296,
+ "loss": 0.0193,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 2292317.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011264479719102383,
+ "skip_count": 1.0,
+ "step": 1418,
+ "text_loss": 0.48075684905052185
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 6.666862342236572,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009799086782512686,
+ "loss": 0.0292,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2295935.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02833271212875843,
+ "skip_count": 2.0,
+ "step": 1420,
+ "text_loss": 0.18221206963062286
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 6.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.09375,
+ "learning_rate": 0.0009798217280943967,
+ "loss": 0.0356,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2298927.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009208574891090393,
+ "skip_count": 1.0,
+ "step": 1422,
+ "text_loss": 0.48686322569847107
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 6.685647196947461,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09423828125,
+ "learning_rate": 0.0009797345940689335,
+ "loss": 0.0267,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2301541.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015011847950518131,
+ "skip_count": 0.0,
+ "step": 1424,
+ "text_loss": 0.49446266889572144
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.4000000059604645,
+ "avg_layers": 26.0,
+ "epoch": 6.695039624302906,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5714285969734192,
+ "grad_norm": 0.1337890625,
+ "learning_rate": 0.0009796472762082687,
+ "loss": 0.0338,
+ "macro_f1": 0.5034013986587524,
+ "num_tokens": 2304589.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05912091210484505,
+ "skip_count": 5.0,
+ "step": 1426,
+ "text_loss": 0.23945684731006622
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.70443205165835,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09765625,
+ "learning_rate": 0.000979559774545863,
+ "loss": 0.0405,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2307860.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021242303773760796,
+ "skip_count": 1.0,
+ "step": 1428,
+ "text_loss": 0.531273365020752
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.713824479013795,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.000979472089115247,
+ "loss": 0.0276,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2311581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02768544852733612,
+ "skip_count": 2.0,
+ "step": 1430,
+ "text_loss": 0.2497459501028061
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12255859375,
+ "learning_rate": 0.000979384219950022,
+ "loss": 0.0346,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2314639.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008678150363266468,
+ "skip_count": 0.0,
+ "step": 1432,
+ "text_loss": 0.6579355001449585
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.732609333724684,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08056640625,
+ "learning_rate": 0.0009792961670838595,
+ "loss": 0.0362,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2317927.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03325597569346428,
+ "skip_count": 0.0,
+ "step": 1434,
+ "text_loss": 0.5209436416625977
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.742001761080129,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1494140625,
+ "learning_rate": 0.0009792079305505016,
+ "loss": 0.0306,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2321065.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.019228918477892876,
+ "skip_count": 0.0,
+ "step": 1436,
+ "text_loss": 0.41087067127227783
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.751394188435574,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10986328125,
+ "learning_rate": 0.000979119510383761,
+ "loss": 0.0371,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2323714.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017071325331926346,
+ "skip_count": 0.0,
+ "step": 1438,
+ "text_loss": 0.21490029990673065
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.760786615791019,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2060546875,
+ "learning_rate": 0.00097903090661752,
+ "loss": 0.0309,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2326454.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00991755723953247,
+ "skip_count": 0.0,
+ "step": 1440,
+ "text_loss": 0.23847346007823944
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.770179043146463,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.232421875,
+ "learning_rate": 0.000978942119285732,
+ "loss": 0.0404,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2329462.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04908733069896698,
+ "skip_count": 1.0,
+ "step": 1442,
+ "text_loss": 0.23343028128147125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.7795714705019074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1044921875,
+ "learning_rate": 0.0009788531484224204,
+ "loss": 0.0264,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2332146.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032628148328512907,
+ "skip_count": 0.0,
+ "step": 1444,
+ "text_loss": 0.47423800826072693
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 29.0,
+ "epoch": 6.788963897857353,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.5,
+ "grad_norm": 0.10693359375,
+ "learning_rate": 0.0009787639940616788,
+ "loss": 0.0405,
+ "macro_f1": 0.7018141150474548,
+ "num_tokens": 2335738.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.14336998760700226,
+ "skip_count": 3.0,
+ "step": 1446,
+ "text_loss": 0.21837592124938965
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 6.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.189453125,
+ "learning_rate": 0.0009786746562376717,
+ "loss": 0.0241,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2338488.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010542908683419228,
+ "skip_count": 1.0,
+ "step": 1448,
+ "text_loss": 1.0614757537841797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.807748752568242,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 0.0009785851349846334,
+ "loss": 0.0268,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2342074.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005998016335070133,
+ "skip_count": 0.0,
+ "step": 1450,
+ "text_loss": 0.4269719421863556
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 26.0,
+ "epoch": 6.817141179923686,
+ "f1_execute": 0.9411764740943909,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.1083984375,
+ "learning_rate": 0.0009784954303368686,
+ "loss": 0.0384,
+ "macro_f1": 0.44705885648727417,
+ "num_tokens": 2345838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0959126204252243,
+ "skip_count": 3.0,
+ "step": 1452,
+ "text_loss": 0.3315916955471039
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.826533607279131,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1005859375,
+ "learning_rate": 0.0009784055423287521,
+ "loss": 0.0218,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2348939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025467623490840197,
+ "skip_count": 0.0,
+ "step": 1454,
+ "text_loss": 0.6162732839584351
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.835926034634576,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.115234375,
+ "learning_rate": 0.0009783154709947293,
+ "loss": 0.0256,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2352232.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01860538125038147,
+ "skip_count": 1.0,
+ "step": 1456,
+ "text_loss": 0.23928768932819366
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.84531846199002,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09912109375,
+ "learning_rate": 0.0009782252163693158,
+ "loss": 0.0201,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2355159.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04412713274359703,
+ "skip_count": 1.0,
+ "step": 1458,
+ "text_loss": 0.3371323347091675
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.21484375,
+ "learning_rate": 0.0009781347784870973,
+ "loss": 0.0379,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2358175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006809141952544451,
+ "skip_count": 0.0,
+ "step": 1460,
+ "text_loss": 0.547267735004425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.86410331670091,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009780441573827296,
+ "loss": 0.03,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 2360991.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08924390375614166,
+ "skip_count": 4.0,
+ "step": 1462,
+ "text_loss": 0.7026563882827759
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.873495744056354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1865234375,
+ "learning_rate": 0.000977953353090939,
+ "loss": 0.0272,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2363894.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021858472377061844,
+ "skip_count": 0.0,
+ "step": 1464,
+ "text_loss": 0.2718065083026886
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.882888171411799,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11474609375,
+ "learning_rate": 0.0009778623656465219,
+ "loss": 0.0338,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2367265.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.044781096279621124,
+ "skip_count": 0.0,
+ "step": 1466,
+ "text_loss": 0.5008095502853394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.892280598767244,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009777711950843448,
+ "loss": 0.0212,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2370186.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0040459707379341125,
+ "skip_count": 0.0,
+ "step": 1468,
+ "text_loss": 0.5242461562156677
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 6.901673026122689,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.134765625,
+ "learning_rate": 0.0009776798414393446,
+ "loss": 0.0279,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 2373314.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0708528608083725,
+ "skip_count": 3.0,
+ "step": 1470,
+ "text_loss": 0.2821732461452484
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.911065453478133,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1328125,
+ "learning_rate": 0.0009775883047465279,
+ "loss": 0.0414,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 2376435.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0290578193962574,
+ "skip_count": 1.0,
+ "step": 1472,
+ "text_loss": 0.8438440561294556
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.9204578808335775,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10546875,
+ "learning_rate": 0.000977496585040972,
+ "loss": 0.0373,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2380244.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010360375046730042,
+ "skip_count": 0.0,
+ "step": 1474,
+ "text_loss": 0.4356135427951813
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 6.929850308189023,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09912109375,
+ "learning_rate": 0.000977404682357824,
+ "loss": 0.0294,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2383498.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023518972098827362,
+ "skip_count": 0.0,
+ "step": 1476,
+ "text_loss": 0.25195425748825073
+ },
+ {
+ "acc_repeat": 0.800000011920929,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 6.939242735544467,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.888888955116272,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11181640625,
+ "learning_rate": 0.000977312596732301,
+ "loss": 0.0375,
+ "macro_f1": 0.9544159770011902,
+ "num_tokens": 2386414.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.08190606534481049,
+ "skip_count": 4.0,
+ "step": 1478,
+ "text_loss": 0.6586798429489136
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 6.948635162899912,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10546875,
+ "learning_rate": 0.0009772203281996905,
+ "loss": 0.0336,
+ "macro_f1": 1.0,
+ "num_tokens": 2389399.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016441475600004196,
+ "skip_count": 2.0,
+ "step": 1480,
+ "text_loss": 0.3671986758708954
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.958027590255357,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09814453125,
+ "learning_rate": 0.0009771278767953502,
+ "loss": 0.0357,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2392400.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019211363047361374,
+ "skip_count": 0.0,
+ "step": 1482,
+ "text_loss": 0.27418580651283264
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.967420017610801,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0947265625,
+ "learning_rate": 0.0009770352425547072,
+ "loss": 0.0292,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2395123.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015800386667251587,
+ "skip_count": 0.0,
+ "step": 1484,
+ "text_loss": 0.19896622002124786
+ },
+ {
+ "acc_repeat": 0.3333333432674408,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.976812444966246,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.5,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12890625,
+ "learning_rate": 0.0009769424255132596,
+ "loss": 0.0256,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 2397359.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.06670158356428146,
+ "skip_count": 0.0,
+ "step": 1486,
+ "text_loss": 0.4229799509048462
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.98620487232169,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1162109375,
+ "learning_rate": 0.0009768494257065747,
+ "loss": 0.0218,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2400387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011144762858748436,
+ "skip_count": 1.0,
+ "step": 1488,
+ "text_loss": 0.4264226257801056
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.995597299677136,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12353515625,
+ "learning_rate": 0.0009767562431702904,
+ "loss": 0.0387,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 2403241.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.12339717149734497,
+ "skip_count": 3.0,
+ "step": 1490,
+ "text_loss": 0.2850193977355957
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.004696213677723,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07177734375,
+ "learning_rate": 0.0009766628779401142,
+ "loss": 0.0215,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2406087.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008174685761332512,
+ "skip_count": 1.0,
+ "step": 1492,
+ "text_loss": 0.6756544709205627
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.014088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 0.000976569330051824,
+ "loss": 0.0186,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2409312.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021256296895444393,
+ "skip_count": 0.0,
+ "step": 1494,
+ "text_loss": 0.4789894223213196
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.0234810683886115,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0009764755995412677,
+ "loss": 0.0193,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2412758.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003944927826523781,
+ "skip_count": 0.0,
+ "step": 1496,
+ "text_loss": 0.5157490968704224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.032873495744056,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09228515625,
+ "learning_rate": 0.0009763816864443627,
+ "loss": 0.0239,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2416079.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03893325850367546,
+ "skip_count": 0.0,
+ "step": 1498,
+ "text_loss": 0.28045418858528137
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.042265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1279296875,
+ "learning_rate": 0.0009762875907970968,
+ "loss": 0.0199,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2420340.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017725443467497826,
+ "skip_count": 0.0,
+ "step": 1500,
+ "text_loss": 0.35550856590270996
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.051658350454946,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0009761933126355277,
+ "loss": 0.0245,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2424735.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01393749937415123,
+ "skip_count": 1.0,
+ "step": 1502,
+ "text_loss": 0.38840189576148987
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 7.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1630859375,
+ "learning_rate": 0.0009760988519957828,
+ "loss": 0.0249,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2428132.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01687910407781601,
+ "skip_count": 2.0,
+ "step": 1504,
+ "text_loss": 0.3031681478023529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.0704432051658355,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0009760042089140598,
+ "loss": 0.0193,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 2431592.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04704280197620392,
+ "skip_count": 2.0,
+ "step": 1506,
+ "text_loss": 0.16355200111865997
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0986328125,
+ "learning_rate": 0.0009759093834266259,
+ "loss": 0.0206,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2434236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016075772000476718,
+ "skip_count": 0.0,
+ "step": 1508,
+ "text_loss": 0.6080073118209839
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1025390625,
+ "learning_rate": 0.0009758143755698186,
+ "loss": 0.015,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2437170.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008451299741864204,
+ "skip_count": 0.0,
+ "step": 1510,
+ "text_loss": 0.22100484371185303
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 7.098620487232169,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009757191853800449,
+ "loss": 0.0227,
+ "macro_f1": 0.5866667032241821,
+ "num_tokens": 2441187.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.046565692871809006,
+ "skip_count": 3.0,
+ "step": 1512,
+ "text_loss": 0.25098952651023865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.108012914587614,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11279296875,
+ "learning_rate": 0.000975623812893782,
+ "loss": 0.0276,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2444664.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02872578240931034,
+ "skip_count": 1.0,
+ "step": 1514,
+ "text_loss": 0.4952253997325897
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.1174053419430585,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1142578125,
+ "learning_rate": 0.0009755282581475768,
+ "loss": 0.0233,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2447748.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002055214950814843,
+ "skip_count": 0.0,
+ "step": 1516,
+ "text_loss": 0.7465500831604004
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.126797769298503,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10302734375,
+ "learning_rate": 0.000975432521178046,
+ "loss": 0.0216,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2450834.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04498551785945892,
+ "skip_count": 0.0,
+ "step": 1518,
+ "text_loss": 0.28144413232803345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.136190196653947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09912109375,
+ "learning_rate": 0.0009753366020218763,
+ "loss": 0.0234,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2454233.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003669742727652192,
+ "skip_count": 0.0,
+ "step": 1520,
+ "text_loss": 0.5667551755905151
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0830078125,
+ "learning_rate": 0.0009752405007158238,
+ "loss": 0.0238,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2457331.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010455607436597347,
+ "skip_count": 0.0,
+ "step": 1522,
+ "text_loss": 0.19575810432434082
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 7.154975051364837,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0009751442172967151,
+ "loss": 0.0193,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 2459935.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.025189083069562912,
+ "skip_count": 1.0,
+ "step": 1524,
+ "text_loss": 0.45453405380249023
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 7.164367478720282,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.000975047751801446,
+ "loss": 0.0187,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2463008.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012297490611672401,
+ "skip_count": 0.0,
+ "step": 1526,
+ "text_loss": 0.31437572836875916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1044921875,
+ "learning_rate": 0.0009749511042669823,
+ "loss": 0.0233,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2466475.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011026266030967236,
+ "skip_count": 0.0,
+ "step": 1528,
+ "text_loss": 0.46604859828948975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.183152333431171,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1376953125,
+ "learning_rate": 0.0009748542747303595,
+ "loss": 0.0182,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2469320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011934996582567692,
+ "skip_count": 1.0,
+ "step": 1530,
+ "text_loss": 0.7764923572540283
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.192544760786616,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0966796875,
+ "learning_rate": 0.0009747572632286827,
+ "loss": 0.0203,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2472468.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005786920432001352,
+ "skip_count": 0.0,
+ "step": 1532,
+ "text_loss": 0.3555782437324524
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 7.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0849609375,
+ "learning_rate": 0.0009746600697991271,
+ "loss": 0.02,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2475736.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0026990731712430716,
+ "skip_count": 0.0,
+ "step": 1534,
+ "text_loss": 0.49561792612075806
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 7.2113296154975055,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0556640625,
+ "learning_rate": 0.0009745626944789375,
+ "loss": 0.0204,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 2478887.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.020221207290887833,
+ "skip_count": 2.0,
+ "step": 1536,
+ "text_loss": 0.5375416278839111
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.22072204285295,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12158203125,
+ "learning_rate": 0.0009744651373054279,
+ "loss": 0.0286,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2481293.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03131086751818657,
+ "skip_count": 1.0,
+ "step": 1538,
+ "text_loss": 0.5241039395332336
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 7.230114470208394,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.08984375,
+ "learning_rate": 0.0009743673983159828,
+ "loss": 0.0241,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 2484403.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04448170214891434,
+ "skip_count": 4.0,
+ "step": 1540,
+ "text_loss": 0.7465724349021912
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.239506897563839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08935546875,
+ "learning_rate": 0.0009742694775480557,
+ "loss": 0.0265,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2487952.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007171491626650095,
+ "skip_count": 1.0,
+ "step": 1542,
+ "text_loss": 0.2877117097377777
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 7.248899324919284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.0009741713750391703,
+ "loss": 0.0171,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2490815.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004559285007417202,
+ "skip_count": 0.0,
+ "step": 1544,
+ "text_loss": 0.6097800135612488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.258291752274729,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06787109375,
+ "learning_rate": 0.0009740730908269193,
+ "loss": 0.0174,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2494727.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005271553061902523,
+ "skip_count": 0.0,
+ "step": 1546,
+ "text_loss": 0.5431114435195923
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0009739746249489658,
+ "loss": 0.0239,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2499266.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015409323386847973,
+ "skip_count": 0.0,
+ "step": 1548,
+ "text_loss": 0.4702678322792053
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.277076606985618,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1171875,
+ "learning_rate": 0.0009738759774430417,
+ "loss": 0.0216,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2502273.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.030183158814907074,
+ "skip_count": 1.0,
+ "step": 1550,
+ "text_loss": 0.3239189088344574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.286469034341063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.0009737771483469493,
+ "loss": 0.0196,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2507624.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005410848651081324,
+ "skip_count": 0.0,
+ "step": 1552,
+ "text_loss": 0.4014642834663391
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07763671875,
+ "learning_rate": 0.0009736781376985598,
+ "loss": 0.0168,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2510366.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0066976165398955345,
+ "skip_count": 1.0,
+ "step": 1554,
+ "text_loss": 0.5924848914146423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.305253889051952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.13671875,
+ "learning_rate": 0.0009735789455358144,
+ "loss": 0.022,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2513317.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002763477386906743,
+ "skip_count": 0.0,
+ "step": 1556,
+ "text_loss": 0.3222943842411041
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.314646316407397,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11767578125,
+ "learning_rate": 0.0009734795718967237,
+ "loss": 0.0283,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2516628.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.061566028743982315,
+ "skip_count": 2.0,
+ "step": 1558,
+ "text_loss": 0.3249334692955017
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 7.324038743762841,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009733800168193679,
+ "loss": 0.0228,
+ "macro_f1": 1.0,
+ "num_tokens": 2519424.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.017976421862840652,
+ "skip_count": 4.0,
+ "step": 1560,
+ "text_loss": 0.3341919481754303
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.333431171118286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1826171875,
+ "learning_rate": 0.0009732802803418966,
+ "loss": 0.023,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2522922.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002525332849472761,
+ "skip_count": 0.0,
+ "step": 1562,
+ "text_loss": 0.3176332712173462
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.34282359847373,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07861328125,
+ "learning_rate": 0.0009731803625025292,
+ "loss": 0.0196,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2525811.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015524424612522125,
+ "skip_count": 1.0,
+ "step": 1564,
+ "text_loss": 0.532774031162262
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.3522160258291755,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10205078125,
+ "learning_rate": 0.0009730802633395541,
+ "loss": 0.0257,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 2529157.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08138631284236908,
+ "skip_count": 1.0,
+ "step": 1566,
+ "text_loss": 0.529487133026123
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0009729799828913298,
+ "loss": 0.0223,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2532249.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035867292899638414,
+ "skip_count": 0.0,
+ "step": 1568,
+ "text_loss": 0.503160297870636
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 7.371000880540064,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.0009728795211962838,
+ "loss": 0.0259,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2535904.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02987455204129219,
+ "skip_count": 2.0,
+ "step": 1570,
+ "text_loss": 0.9170270562171936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.380393307895509,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11865234375,
+ "learning_rate": 0.0009727788782929131,
+ "loss": 0.0273,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2538943.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04676021635532379,
+ "skip_count": 0.0,
+ "step": 1572,
+ "text_loss": 0.29146310687065125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.389785735250954,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0009726780542197844,
+ "loss": 0.0169,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2541805.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002127803163602948,
+ "skip_count": 0.0,
+ "step": 1574,
+ "text_loss": 1.0126502513885498
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.399178162606399,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.142578125,
+ "learning_rate": 0.0009725770490155338,
+ "loss": 0.0262,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2546213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007609677035361528,
+ "skip_count": 0.0,
+ "step": 1576,
+ "text_loss": 0.190168559551239
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.408570589961843,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.083984375,
+ "learning_rate": 0.0009724758627188665,
+ "loss": 0.0356,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2549554.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.033554721623659134,
+ "skip_count": 1.0,
+ "step": 1578,
+ "text_loss": 0.2977406084537506
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.4179630173172875,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.0009723744953685572,
+ "loss": 0.028,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2552785.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.027864238247275352,
+ "skip_count": 0.0,
+ "step": 1580,
+ "text_loss": 0.2700682580471039
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.427355444672733,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.19921875,
+ "learning_rate": 0.0009722729470034503,
+ "loss": 0.0224,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2556550.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004798175301402807,
+ "skip_count": 0.0,
+ "step": 1582,
+ "text_loss": 0.6559903025627136
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.436747872028177,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.078125,
+ "learning_rate": 0.0009721712176624591,
+ "loss": 0.0242,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2559862.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013764148578047752,
+ "skip_count": 0.0,
+ "step": 1584,
+ "text_loss": 0.2257535308599472
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 7.446140299383622,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10986328125,
+ "learning_rate": 0.0009720693073845667,
+ "loss": 0.032,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2562766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01937069371342659,
+ "skip_count": 2.0,
+ "step": 1586,
+ "text_loss": 0.178413525223732
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 7.455532726739067,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.150390625,
+ "learning_rate": 0.0009719672162088252,
+ "loss": 0.0306,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 2566583.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06224144622683525,
+ "skip_count": 0.0,
+ "step": 1588,
+ "text_loss": 0.3992367684841156
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 27.0,
+ "epoch": 7.464925154094511,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.185546875,
+ "learning_rate": 0.0009718649441743559,
+ "loss": 0.0239,
+ "macro_f1": 0.9449735879898071,
+ "num_tokens": 2569516.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.06937911361455917,
+ "skip_count": 4.0,
+ "step": 1590,
+ "text_loss": 0.1945122629404068
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.00097176249132035,
+ "loss": 0.0229,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2572418.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034326619934290648,
+ "skip_count": 0.0,
+ "step": 1592,
+ "text_loss": 0.6259906888008118
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 7.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08642578125,
+ "learning_rate": 0.0009716598576860676,
+ "loss": 0.0278,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2575235.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004557516425848007,
+ "skip_count": 0.0,
+ "step": 1594,
+ "text_loss": 0.6638736724853516
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 7.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.193359375,
+ "learning_rate": 0.0009715570433108378,
+ "loss": 0.0198,
+ "macro_f1": 1.0,
+ "num_tokens": 2578157.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015363055281341076,
+ "skip_count": 1.0,
+ "step": 1596,
+ "text_loss": 0.6530464887619019
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 7.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1484375,
+ "learning_rate": 0.0009714540482340595,
+ "loss": 0.0268,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2581801.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01257144846022129,
+ "skip_count": 0.0,
+ "step": 1598,
+ "text_loss": 0.5916110277175903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.5118872908717345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.058837890625,
+ "learning_rate": 0.0009713508724952006,
+ "loss": 0.0177,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2585204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003175645601004362,
+ "skip_count": 0.0,
+ "step": 1600,
+ "text_loss": 0.27901601791381836
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.521279718227179,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12353515625,
+ "learning_rate": 0.0009712475161337981,
+ "loss": 0.0261,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2588286.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004122321493923664,
+ "skip_count": 0.0,
+ "step": 1602,
+ "text_loss": 0.42420244216918945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.0009711439791894585,
+ "loss": 0.0341,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2591476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011215819045901299,
+ "skip_count": 1.0,
+ "step": 1604,
+ "text_loss": 0.5549933910369873
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 7.540064572938069,
+ "f1_execute": 0.9599999785423279,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0009710402617018574,
+ "loss": 0.0172,
+ "macro_f1": 0.8200000524520874,
+ "num_tokens": 2594336.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02916567400097847,
+ "skip_count": 2.0,
+ "step": 1606,
+ "text_loss": 0.3263779282569885
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.549457000293513,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0009709363637107393,
+ "loss": 0.0209,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2597462.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015897957608103752,
+ "skip_count": 1.0,
+ "step": 1608,
+ "text_loss": 0.20917139947414398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.558849427648958,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009708322852559184,
+ "loss": 0.0229,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2601543.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002211357234045863,
+ "skip_count": 0.0,
+ "step": 1610,
+ "text_loss": 0.450550377368927
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 7.568241855004403,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 0.0009707280263772776,
+ "loss": 0.0277,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2604462.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01615734025835991,
+ "skip_count": 2.0,
+ "step": 1612,
+ "text_loss": 0.6908381581306458
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 7.577634282359847,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0888671875,
+ "learning_rate": 0.0009706235871147688,
+ "loss": 0.0241,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2607484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022048067301511765,
+ "skip_count": 2.0,
+ "step": 1614,
+ "text_loss": 0.36691340804100037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.587026709715292,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10546875,
+ "learning_rate": 0.0009705189675084138,
+ "loss": 0.0176,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2610204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008503952994942665,
+ "skip_count": 1.0,
+ "step": 1616,
+ "text_loss": 0.5226598381996155
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.596419137070737,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09228515625,
+ "learning_rate": 0.0009704141675983029,
+ "loss": 0.0248,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2613128.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019020626787096262,
+ "skip_count": 0.0,
+ "step": 1618,
+ "text_loss": 0.6465088725090027
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5714285969734192,
+ "avg_layers": 24.0,
+ "epoch": 7.6058115644261814,
+ "f1_execute": 0.9333333373069763,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.7272727489471436,
+ "grad_norm": 0.107421875,
+ "learning_rate": 0.0009703091874245956,
+ "loss": 0.032,
+ "macro_f1": 0.5535354018211365,
+ "num_tokens": 2616360.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.11837691068649292,
+ "skip_count": 7.0,
+ "step": 1620,
+ "text_loss": 0.2987039089202881
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.615203991781626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009702040270275204,
+ "loss": 0.0181,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2619606.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0065958453342318535,
+ "skip_count": 0.0,
+ "step": 1622,
+ "text_loss": 0.6262096166610718
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.103515625,
+ "learning_rate": 0.000970098686447375,
+ "loss": 0.0257,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2622499.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013632026500999928,
+ "skip_count": 1.0,
+ "step": 1624,
+ "text_loss": 0.2392602562904358
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 7.633988846492516,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.125,
+ "learning_rate": 0.0009699931657245264,
+ "loss": 0.0245,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2626002.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012147823348641396,
+ "skip_count": 2.0,
+ "step": 1626,
+ "text_loss": 0.4742976129055023
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 7.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0849609375,
+ "learning_rate": 0.0009698874648994098,
+ "loss": 0.0285,
+ "macro_f1": 1.0,
+ "num_tokens": 2629847.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010692884214222431,
+ "skip_count": 3.0,
+ "step": 1628,
+ "text_loss": 0.5090685486793518
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.6527737012034045,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1240234375,
+ "learning_rate": 0.0009697815840125304,
+ "loss": 0.0265,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2633529.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011442207731306553,
+ "skip_count": 0.0,
+ "step": 1630,
+ "text_loss": 0.1874329298734665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2119140625,
+ "learning_rate": 0.0009696755231044618,
+ "loss": 0.0207,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2636321.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026681360322982073,
+ "skip_count": 0.0,
+ "step": 1632,
+ "text_loss": 0.7650400400161743
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.671558555914294,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10498046875,
+ "learning_rate": 0.0009695692822158466,
+ "loss": 0.0242,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2638840.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.033965807408094406,
+ "skip_count": 0.0,
+ "step": 1634,
+ "text_loss": 0.6175784468650818
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.680950983269739,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0009694628613873968,
+ "loss": 0.018,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2641886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007568214554339647,
+ "skip_count": 0.0,
+ "step": 1636,
+ "text_loss": 0.43139931559562683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.690343410625183,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.193359375,
+ "learning_rate": 0.0009693562606598929,
+ "loss": 0.025,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2645028.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004973865579813719,
+ "skip_count": 0.0,
+ "step": 1638,
+ "text_loss": 0.6430339217185974
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.699735837980628,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06982421875,
+ "learning_rate": 0.0009692494800741844,
+ "loss": 0.0313,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2648209.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.049863800406455994,
+ "skip_count": 0.0,
+ "step": 1640,
+ "text_loss": 0.28138160705566406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 7.709128265336073,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08544921875,
+ "learning_rate": 0.0009691425196711901,
+ "loss": 0.0398,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2651171.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02112230286002159,
+ "skip_count": 0.0,
+ "step": 1642,
+ "text_loss": 0.3745322525501251
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.718520692691517,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0009690353794918971,
+ "loss": 0.0275,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2654093.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024304776452481747,
+ "skip_count": 0.0,
+ "step": 1644,
+ "text_loss": 0.4275154173374176
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.727913120046962,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0771484375,
+ "learning_rate": 0.000968928059577362,
+ "loss": 0.0244,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2657079.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009320619516074657,
+ "skip_count": 1.0,
+ "step": 1646,
+ "text_loss": 0.46650025248527527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 7.737305547402407,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09814453125,
+ "learning_rate": 0.0009688205599687099,
+ "loss": 0.0209,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2660951.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011913162656128407,
+ "skip_count": 0.0,
+ "step": 1648,
+ "text_loss": 0.46644100546836853
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.7466979747578515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1083984375,
+ "learning_rate": 0.0009687128807071347,
+ "loss": 0.0284,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2663823.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013754756189882755,
+ "skip_count": 0.0,
+ "step": 1650,
+ "text_loss": 0.40808847546577454
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.756090402113296,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.103515625,
+ "learning_rate": 0.0009686050218338996,
+ "loss": 0.0286,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2667079.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009099726565182209,
+ "skip_count": 0.0,
+ "step": 1652,
+ "text_loss": 0.2389989197254181
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.765482829468741,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08837890625,
+ "learning_rate": 0.0009684969833903359,
+ "loss": 0.0283,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2670162.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034928603563457727,
+ "skip_count": 1.0,
+ "step": 1654,
+ "text_loss": 0.6930749416351318
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.774875256824186,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10888671875,
+ "learning_rate": 0.0009683887654178445,
+ "loss": 0.0261,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2673031.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008340462110936642,
+ "skip_count": 1.0,
+ "step": 1656,
+ "text_loss": 0.277752548456192
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.0009682803679578947,
+ "loss": 0.0259,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2676092.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004337446764111519,
+ "skip_count": 0.0,
+ "step": 1658,
+ "text_loss": 0.5176776051521301
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.7936601115350745,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.169921875,
+ "learning_rate": 0.0009681717910520244,
+ "loss": 0.0242,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2679479.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.034611742943525314,
+ "skip_count": 2.0,
+ "step": 1660,
+ "text_loss": 0.21485982835292816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 7.80305253889052,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.07958984375,
+ "learning_rate": 0.0009680630347418406,
+ "loss": 0.022,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2683289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03297121450304985,
+ "skip_count": 2.0,
+ "step": 1662,
+ "text_loss": 0.33801013231277466
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.812444966245964,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 0.000967954099069019,
+ "loss": 0.0411,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2685879.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04551183059811592,
+ "skip_count": 1.0,
+ "step": 1664,
+ "text_loss": 0.41123488545417786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.821837393601409,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1240234375,
+ "learning_rate": 0.0009678449840753038,
+ "loss": 0.0324,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2688910.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05866450071334839,
+ "skip_count": 2.0,
+ "step": 1666,
+ "text_loss": 0.1740892380475998
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09228515625,
+ "learning_rate": 0.0009677356898025082,
+ "loss": 0.023,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2691680.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009243223816156387,
+ "skip_count": 0.0,
+ "step": 1668,
+ "text_loss": 0.2512350380420685
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.8406222483122985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09619140625,
+ "learning_rate": 0.000967626216292514,
+ "loss": 0.0195,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2694895.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005576452240347862,
+ "skip_count": 0.0,
+ "step": 1670,
+ "text_loss": 0.43294376134872437
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 26.0,
+ "epoch": 7.850014675667743,
+ "f1_execute": 0.9411764740943909,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.09130859375,
+ "learning_rate": 0.0009675165635872715,
+ "loss": 0.0306,
+ "macro_f1": 0.44705885648727417,
+ "num_tokens": 2697806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05372785031795502,
+ "skip_count": 3.0,
+ "step": 1672,
+ "text_loss": 0.1614082306623459
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 7.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11669921875,
+ "learning_rate": 0.0009674067317288,
+ "loss": 0.0296,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2700529.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.018131591379642487,
+ "skip_count": 0.0,
+ "step": 1674,
+ "text_loss": 0.2093173861503601
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.868799530378633,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.0009672967207591869,
+ "loss": 0.0257,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2703650.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0673515796661377,
+ "skip_count": 1.0,
+ "step": 1676,
+ "text_loss": 0.3029400110244751
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 7.878191957734077,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11669921875,
+ "learning_rate": 0.0009671865307205892,
+ "loss": 0.021,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 2707615.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03821169584989548,
+ "skip_count": 1.0,
+ "step": 1678,
+ "text_loss": 0.2262786477804184
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 25.0,
+ "epoch": 7.8875843850895215,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.1396484375,
+ "learning_rate": 0.0009670761616552315,
+ "loss": 0.0465,
+ "macro_f1": 0.9615669250488281,
+ "num_tokens": 2710894.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.042625464498996735,
+ "skip_count": 6.0,
+ "step": 1680,
+ "text_loss": 0.29623574018478394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.896976812444966,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.169921875,
+ "learning_rate": 0.0009669656136054074,
+ "loss": 0.0289,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2714330.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037571541033685207,
+ "skip_count": 0.0,
+ "step": 1682,
+ "text_loss": 0.7510389089584351
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.906369239800411,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07421875,
+ "learning_rate": 0.0009668548866134795,
+ "loss": 0.0256,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2717176.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004142968449741602,
+ "skip_count": 0.0,
+ "step": 1684,
+ "text_loss": 0.3273485600948334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 7.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.0009667439807218783,
+ "loss": 0.0233,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2720628.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008753842674195766,
+ "skip_count": 2.0,
+ "step": 1686,
+ "text_loss": 0.4314708709716797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.9251540945113,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0732421875,
+ "learning_rate": 0.0009666328959731033,
+ "loss": 0.0211,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 2723739.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.022674910724163055,
+ "skip_count": 1.0,
+ "step": 1688,
+ "text_loss": 0.25734150409698486
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 7.934546521866745,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009665216324097222,
+ "loss": 0.0324,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 2726644.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03932750225067139,
+ "skip_count": 3.0,
+ "step": 1690,
+ "text_loss": 0.24511034786701202
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.94393894922219,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09765625,
+ "learning_rate": 0.0009664101900743714,
+ "loss": 0.0255,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2729662.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012672754004597664,
+ "skip_count": 1.0,
+ "step": 1692,
+ "text_loss": 0.39431414008140564
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 7.953331376577634,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.076171875,
+ "learning_rate": 0.000966298569009756,
+ "loss": 0.0231,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2732578.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01548632513731718,
+ "skip_count": 2.0,
+ "step": 1694,
+ "text_loss": 0.12439999729394913
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.962723803933079,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0849609375,
+ "learning_rate": 0.0009661867692586494,
+ "loss": 0.0153,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2735887.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05622401833534241,
+ "skip_count": 2.0,
+ "step": 1696,
+ "text_loss": 0.29024389386177063
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.972116231288524,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.087890625,
+ "learning_rate": 0.0009660747908638933,
+ "loss": 0.0205,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2739293.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.041060201823711395,
+ "skip_count": 1.0,
+ "step": 1698,
+ "text_loss": 0.39461007714271545
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.9815086586439685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1767578125,
+ "learning_rate": 0.0009659626338683981,
+ "loss": 0.0369,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2742468.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007251353468745947,
+ "skip_count": 0.0,
+ "step": 1700,
+ "text_loss": 0.2751767635345459
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.990901085999413,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07763671875,
+ "learning_rate": 0.0009658502983151427,
+ "loss": 0.0186,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2745123.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012847424484789371,
+ "skip_count": 1.0,
+ "step": 1702,
+ "text_loss": 0.4756404757499695
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11767578125,
+ "learning_rate": 0.0009657377842471742,
+ "loss": 0.0313,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2748016.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007060411386191845,
+ "skip_count": 1.0,
+ "step": 1704,
+ "text_loss": 0.9571210145950317
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 8.009392427355445,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10009765625,
+ "learning_rate": 0.0009656250917076081,
+ "loss": 0.0188,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2750717.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016748681664466858,
+ "skip_count": 2.0,
+ "step": 1706,
+ "text_loss": 0.14542843401432037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.018784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060302734375,
+ "learning_rate": 0.0009655122207396285,
+ "loss": 0.017,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2753635.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013607042841613293,
+ "skip_count": 0.0,
+ "step": 1708,
+ "text_loss": 0.21836471557617188
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0732421875,
+ "learning_rate": 0.0009653991713864878,
+ "loss": 0.0205,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2756643.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012097888393327594,
+ "skip_count": 0.0,
+ "step": 1710,
+ "text_loss": 0.635187029838562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1171875,
+ "learning_rate": 0.0009652859436915066,
+ "loss": 0.0231,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2759432.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006196760106831789,
+ "skip_count": 0.0,
+ "step": 1712,
+ "text_loss": 0.5629420876502991
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.046962136777223,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0009651725376980743,
+ "loss": 0.0177,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2762538.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0042513771913945675,
+ "skip_count": 0.0,
+ "step": 1714,
+ "text_loss": 0.39522525668144226
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 29.0,
+ "epoch": 8.056354564132668,
+ "f1_execute": 0.9583333134651184,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.1494140625,
+ "learning_rate": 0.0009650589534496479,
+ "loss": 0.0194,
+ "macro_f1": 0.8194444179534912,
+ "num_tokens": 2765571.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03596706688404083,
+ "skip_count": 3.0,
+ "step": 1716,
+ "text_loss": 0.6252416968345642
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04833984375,
+ "learning_rate": 0.0009649451909897532,
+ "loss": 0.0178,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2769206.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025788163766264915,
+ "skip_count": 0.0,
+ "step": 1718,
+ "text_loss": 0.8851634860038757
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.075139418843557,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10791015625,
+ "learning_rate": 0.0009648312503619843,
+ "loss": 0.0265,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2772488.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004443451762199402,
+ "skip_count": 0.0,
+ "step": 1720,
+ "text_loss": 0.8568580746650696
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 8.084531846199003,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009647171316100034,
+ "loss": 0.0265,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 2776482.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.022948263213038445,
+ "skip_count": 3.0,
+ "step": 1722,
+ "text_loss": 0.13431036472320557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1572265625,
+ "learning_rate": 0.0009646028347775409,
+ "loss": 0.0204,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2778966.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011328035034239292,
+ "skip_count": 1.0,
+ "step": 1724,
+ "text_loss": 0.2085491120815277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.103316700909891,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08984375,
+ "learning_rate": 0.0009644883599083958,
+ "loss": 0.0238,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2781968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002208018908277154,
+ "skip_count": 0.0,
+ "step": 1726,
+ "text_loss": 0.4948323965072632
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.112709128265337,
+ "f1_execute": 0.9411764740943909,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0009643737070464349,
+ "loss": 0.0158,
+ "macro_f1": 0.6470588445663452,
+ "num_tokens": 2784666.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04391832649707794,
+ "skip_count": 2.0,
+ "step": 1728,
+ "text_loss": 0.39060094952583313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 0.0009642588762355935,
+ "loss": 0.0212,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2787558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004497280344367027,
+ "skip_count": 1.0,
+ "step": 1730,
+ "text_loss": 0.34908708930015564
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.131493982976226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.0009641438675198748,
+ "loss": 0.0175,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2790474.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00583475548774004,
+ "skip_count": 0.0,
+ "step": 1732,
+ "text_loss": 0.5720033049583435
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.140886410331671,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08154296875,
+ "learning_rate": 0.0009640286809433508,
+ "loss": 0.0235,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2793272.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007826375775039196,
+ "skip_count": 0.0,
+ "step": 1734,
+ "text_loss": 0.32181721925735474
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0009639133165501606,
+ "loss": 0.0192,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2797726.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019055595621466637,
+ "skip_count": 0.0,
+ "step": 1736,
+ "text_loss": 0.620936393737793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.134765625,
+ "learning_rate": 0.0009637977743845124,
+ "loss": 0.0229,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2800706.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028302327264100313,
+ "skip_count": 0.0,
+ "step": 1738,
+ "text_loss": 0.6473138332366943
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.169063692398003,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0009636820544906823,
+ "loss": 0.0146,
+ "macro_f1": 1.0,
+ "num_tokens": 2803847.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01105099730193615,
+ "skip_count": 2.0,
+ "step": 1740,
+ "text_loss": 0.4401201903820038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 8.178456119753449,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.1455078125,
+ "learning_rate": 0.0009635661569130141,
+ "loss": 0.0195,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 2807235.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02619045600295067,
+ "skip_count": 3.0,
+ "step": 1742,
+ "text_loss": 0.459264874458313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.187848547108894,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0009634500816959202,
+ "loss": 0.0162,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2810396.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007915694266557693,
+ "skip_count": 2.0,
+ "step": 1744,
+ "text_loss": 0.5084020495414734
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 8.197240974464338,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1748046875,
+ "learning_rate": 0.0009633338288838805,
+ "loss": 0.0271,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2813215.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.08364596217870712,
+ "skip_count": 0.0,
+ "step": 1746,
+ "text_loss": 0.27681824564933777
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 30.0,
+ "epoch": 8.206633401819783,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.0009632173985214438,
+ "loss": 0.0156,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 2816452.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.028805451467633247,
+ "skip_count": 2.0,
+ "step": 1748,
+ "text_loss": 0.4678419530391693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.216025829175228,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0625,
+ "learning_rate": 0.000963100790653226,
+ "loss": 0.0188,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2819364.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03056817688047886,
+ "skip_count": 1.0,
+ "step": 1750,
+ "text_loss": 0.3078109920024872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.225418256530672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009629840053239116,
+ "loss": 0.0205,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2823469.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019477814203128219,
+ "skip_count": 0.0,
+ "step": 1752,
+ "text_loss": 0.45501336455345154
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.057373046875,
+ "learning_rate": 0.000962867042578253,
+ "loss": 0.0173,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2826716.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032963966950774193,
+ "skip_count": 0.0,
+ "step": 1754,
+ "text_loss": 0.49234694242477417
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.244203111241562,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0947265625,
+ "learning_rate": 0.0009627499024610707,
+ "loss": 0.0239,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2829733.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010289114899933338,
+ "skip_count": 1.0,
+ "step": 1756,
+ "text_loss": 0.22335539758205414
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.253595538597006,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0888671875,
+ "learning_rate": 0.0009626325850172527,
+ "loss": 0.0174,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2833350.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03249066323041916,
+ "skip_count": 1.0,
+ "step": 1758,
+ "text_loss": 0.6581931114196777
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.262987965952451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0009625150902917555,
+ "loss": 0.0185,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2836558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00870000571012497,
+ "skip_count": 0.0,
+ "step": 1760,
+ "text_loss": 0.22938725352287292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1259765625,
+ "learning_rate": 0.0009623974183296031,
+ "loss": 0.0192,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2840560.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007767196744680405,
+ "skip_count": 0.0,
+ "step": 1762,
+ "text_loss": 0.24473799765110016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09228515625,
+ "learning_rate": 0.0009622795691758876,
+ "loss": 0.0244,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2843548.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021693643648177385,
+ "skip_count": 0.0,
+ "step": 1764,
+ "text_loss": 0.3084608018398285
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.0009621615428757693,
+ "loss": 0.0149,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2847076.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024727333802729845,
+ "skip_count": 0.0,
+ "step": 1766,
+ "text_loss": 0.5251734852790833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.300557675374229,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 0.000962043339474476,
+ "loss": 0.0194,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2849751.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005174890160560608,
+ "skip_count": 0.0,
+ "step": 1768,
+ "text_loss": 0.4410129189491272
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.309950102729674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06103515625,
+ "learning_rate": 0.0009619249590173032,
+ "loss": 0.016,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2853916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006785830482840538,
+ "skip_count": 2.0,
+ "step": 1770,
+ "text_loss": 0.550076425075531
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 8.31934253008512,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.06591796875,
+ "learning_rate": 0.0009618064015496149,
+ "loss": 0.0192,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 2857372.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021370256319642067,
+ "skip_count": 3.0,
+ "step": 1772,
+ "text_loss": 0.1988629847764969
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.072265625,
+ "learning_rate": 0.0009616876671168423,
+ "loss": 0.0162,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2861028.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004313841462135315,
+ "skip_count": 1.0,
+ "step": 1774,
+ "text_loss": 0.42581331729888916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.338127384796008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1103515625,
+ "learning_rate": 0.0009615687557644847,
+ "loss": 0.0268,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2864847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025742491707205772,
+ "skip_count": 0.0,
+ "step": 1776,
+ "text_loss": 0.46510905027389526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1494140625,
+ "learning_rate": 0.0009614496675381093,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2867392.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016813480760902166,
+ "skip_count": 0.0,
+ "step": 1778,
+ "text_loss": 0.5922174453735352
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0810546875,
+ "learning_rate": 0.0009613304024833507,
+ "loss": 0.0166,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2871273.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004948933608829975,
+ "skip_count": 0.0,
+ "step": 1780,
+ "text_loss": 0.6776977777481079
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.366304666862343,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.0009612109606459117,
+ "loss": 0.0186,
+ "macro_f1": 1.0,
+ "num_tokens": 2874172.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016950147226452827,
+ "skip_count": 2.0,
+ "step": 1782,
+ "text_loss": 0.48758944869041443
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.375697094217786,
+ "f1_execute": 0.9599999785423279,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.08251953125,
+ "learning_rate": 0.0009610913420715623,
+ "loss": 0.0237,
+ "macro_f1": 0.7644444704055786,
+ "num_tokens": 2877528.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04880943149328232,
+ "skip_count": 1.0,
+ "step": 1784,
+ "text_loss": 0.4404778480529785
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.385089521573232,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 0.0009609715468061411,
+ "loss": 0.0205,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2880627.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004678630735725164,
+ "skip_count": 0.0,
+ "step": 1786,
+ "text_loss": 0.7295402884483337
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.394481948928677,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07958984375,
+ "learning_rate": 0.0009608515748955535,
+ "loss": 0.0205,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2883333.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026695074047893286,
+ "skip_count": 0.0,
+ "step": 1788,
+ "text_loss": 0.9697831273078918
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 8.40387437628412,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.107421875,
+ "learning_rate": 0.000960731426385773,
+ "loss": 0.0157,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 2887444.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.029743613675236702,
+ "skip_count": 2.0,
+ "step": 1790,
+ "text_loss": 0.4737568199634552
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10107421875,
+ "learning_rate": 0.0009606111013228407,
+ "loss": 0.0207,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2890221.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016153788892552257,
+ "skip_count": 0.0,
+ "step": 1792,
+ "text_loss": 0.6693558096885681
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.422659230995011,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08349609375,
+ "learning_rate": 0.0009604905997528655,
+ "loss": 0.02,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2893262.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01965433731675148,
+ "skip_count": 1.0,
+ "step": 1794,
+ "text_loss": 0.45227760076522827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.432051658350455,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08642578125,
+ "learning_rate": 0.0009603699217220239,
+ "loss": 0.0117,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 2896823.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.024017298594117165,
+ "skip_count": 2.0,
+ "step": 1796,
+ "text_loss": 0.48865509033203125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08837890625,
+ "learning_rate": 0.0009602490672765597,
+ "loss": 0.0182,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2899707.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012420224957168102,
+ "skip_count": 0.0,
+ "step": 1798,
+ "text_loss": 0.43292415142059326
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07861328125,
+ "learning_rate": 0.0009601280364627848,
+ "loss": 0.0196,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2902795.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020389219280332327,
+ "skip_count": 0.0,
+ "step": 1800,
+ "text_loss": 0.41021591424942017
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.460228940416789,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009600068293270783,
+ "loss": 0.0142,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2905769.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002006303984671831,
+ "skip_count": 0.0,
+ "step": 1802,
+ "text_loss": 0.46892106533050537
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08740234375,
+ "learning_rate": 0.000959885445915887,
+ "loss": 0.017,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2909475.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003734810510650277,
+ "skip_count": 0.0,
+ "step": 1804,
+ "text_loss": 0.45364710688591003
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 8.479013795127678,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.11669921875,
+ "learning_rate": 0.0009597638862757254,
+ "loss": 0.0182,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 2914348.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.038971323519945145,
+ "skip_count": 2.0,
+ "step": 1806,
+ "text_loss": 0.42913779616355896
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.488406222483123,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.080078125,
+ "learning_rate": 0.0009596421504531751,
+ "loss": 0.0249,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2917467.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04800829663872719,
+ "skip_count": 0.0,
+ "step": 1808,
+ "text_loss": 0.17332297563552856
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 8.497798649838568,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1083984375,
+ "learning_rate": 0.0009595202384948858,
+ "loss": 0.0227,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2920223.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009164143353700638,
+ "skip_count": 0.0,
+ "step": 1810,
+ "text_loss": 0.33740702271461487
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0947265625,
+ "learning_rate": 0.0009593981504475742,
+ "loss": 0.0275,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2923780.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011236993595957756,
+ "skip_count": 2.0,
+ "step": 1812,
+ "text_loss": 0.1609916388988495
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 8.516583504549457,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0009592758863580248,
+ "loss": 0.0259,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2926259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019026532769203186,
+ "skip_count": 2.0,
+ "step": 1814,
+ "text_loss": 0.6460903882980347
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 8.525975931904902,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09814453125,
+ "learning_rate": 0.0009591534462730894,
+ "loss": 0.0206,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2929173.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0608333982527256,
+ "skip_count": 0.0,
+ "step": 1816,
+ "text_loss": 0.476126492023468
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.000959030830239687,
+ "loss": 0.0175,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2932703.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0093300249427557,
+ "skip_count": 0.0,
+ "step": 1818,
+ "text_loss": 0.5471875667572021
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.544760786615791,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2001953125,
+ "learning_rate": 0.0009589080383048048,
+ "loss": 0.0235,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2936195.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010434109717607498,
+ "skip_count": 0.0,
+ "step": 1820,
+ "text_loss": 0.5068115592002869
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0986328125,
+ "learning_rate": 0.0009587850705154964,
+ "loss": 0.0291,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2939412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004347751382738352,
+ "skip_count": 0.0,
+ "step": 1822,
+ "text_loss": 0.4241984784603119
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 8.56354564132668,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0859375,
+ "learning_rate": 0.0009586619269188836,
+ "loss": 0.0224,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 2942318.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.034238871186971664,
+ "skip_count": 1.0,
+ "step": 1824,
+ "text_loss": 0.2328975349664688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.572938068682125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11181640625,
+ "learning_rate": 0.0009585386075621553,
+ "loss": 0.027,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2945731.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006097695790231228,
+ "skip_count": 0.0,
+ "step": 1826,
+ "text_loss": 0.22816994786262512
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.582330496037569,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0908203125,
+ "learning_rate": 0.0009584151124925676,
+ "loss": 0.0208,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2948944.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007790776435285807,
+ "skip_count": 1.0,
+ "step": 1828,
+ "text_loss": 0.5009413361549377
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.0009582914417574438,
+ "loss": 0.0145,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2951723.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009144559502601624,
+ "skip_count": 2.0,
+ "step": 1830,
+ "text_loss": 0.1402502954006195
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 8.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 0.0009581675954041751,
+ "loss": 0.0166,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2954726.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006593191530555487,
+ "skip_count": 0.0,
+ "step": 1832,
+ "text_loss": 0.4871736466884613
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.610507778103903,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0869140625,
+ "learning_rate": 0.0009580435734802196,
+ "loss": 0.0206,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2957853.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01241068821400404,
+ "skip_count": 0.0,
+ "step": 1834,
+ "text_loss": 0.30100154876708984
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.619900205459349,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1298828125,
+ "learning_rate": 0.0009579193760331027,
+ "loss": 0.022,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2960783.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002219218760728836,
+ "skip_count": 0.0,
+ "step": 1836,
+ "text_loss": 0.4961516559123993
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.629292632814794,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.12255859375,
+ "learning_rate": 0.0009577950031104169,
+ "loss": 0.0166,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 2963328.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.029363535344600677,
+ "skip_count": 2.0,
+ "step": 1838,
+ "text_loss": 0.42814353108406067
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.25,
+ "avg_layers": 28.0,
+ "epoch": 8.638685060170237,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.1044921875,
+ "learning_rate": 0.0009576704547598226,
+ "loss": 0.0257,
+ "macro_f1": 0.7795917987823486,
+ "num_tokens": 2966108.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0579402856528759,
+ "skip_count": 4.0,
+ "step": 1840,
+ "text_loss": 0.20523512363433838
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 8.648077487525683,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0625,
+ "learning_rate": 0.0009575457310290463,
+ "loss": 0.0121,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2969137.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008810589089989662,
+ "skip_count": 0.0,
+ "step": 1842,
+ "text_loss": 0.6199528574943542
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0009574208319658831,
+ "loss": 0.0208,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2972407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012295129708945751,
+ "skip_count": 1.0,
+ "step": 1844,
+ "text_loss": 0.66938316822052
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 8.666862342236572,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.000957295757618194,
+ "loss": 0.0152,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 2976045.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06162935495376587,
+ "skip_count": 2.0,
+ "step": 1846,
+ "text_loss": 0.5381782650947571
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0830078125,
+ "learning_rate": 0.0009571705080339079,
+ "loss": 0.0144,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2979025.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003950524143874645,
+ "skip_count": 0.0,
+ "step": 1848,
+ "text_loss": 0.5831671357154846
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11376953125,
+ "learning_rate": 0.0009570450832610208,
+ "loss": 0.0209,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2982276.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010354886762797832,
+ "skip_count": 0.0,
+ "step": 1850,
+ "text_loss": 0.27448201179504395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 8.695039624302906,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.061279296875,
+ "learning_rate": 0.0009569194833475956,
+ "loss": 0.0199,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2985691.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010167439468204975,
+ "skip_count": 0.0,
+ "step": 1852,
+ "text_loss": 0.5264663696289062
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.704432051658351,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1328125,
+ "learning_rate": 0.0009567937083417624,
+ "loss": 0.0194,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2989126.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0371871180832386,
+ "skip_count": 1.0,
+ "step": 1854,
+ "text_loss": 0.2008018046617508
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 0.0009566677582917185,
+ "loss": 0.0184,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2992814.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010190588422119617,
+ "skip_count": 0.0,
+ "step": 1856,
+ "text_loss": 0.749717116355896
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.72321690636924,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.080078125,
+ "learning_rate": 0.0009565416332457282,
+ "loss": 0.0132,
+ "macro_f1": 0.6538461446762085,
+ "num_tokens": 2995729.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.022285036742687225,
+ "skip_count": 1.0,
+ "step": 1858,
+ "text_loss": 0.5870219469070435
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.732609333724685,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0009564153332521228,
+ "loss": 0.0224,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2998812.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011050296947360039,
+ "skip_count": 1.0,
+ "step": 1860,
+ "text_loss": 0.8444408774375916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.742001761080129,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06005859375,
+ "learning_rate": 0.0009562888583593005,
+ "loss": 0.0163,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3001799.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007125461008399725,
+ "skip_count": 0.0,
+ "step": 1862,
+ "text_loss": 0.41510361433029175
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.751394188435574,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.0009561622086157272,
+ "loss": 0.0236,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3005088.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0049054501578211784,
+ "skip_count": 0.0,
+ "step": 1864,
+ "text_loss": 0.3801248073577881
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 8.760786615791018,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.054443359375,
+ "learning_rate": 0.000956035384069935,
+ "loss": 0.0238,
+ "macro_f1": 1.0,
+ "num_tokens": 3008178.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005162427201867104,
+ "skip_count": 1.0,
+ "step": 1866,
+ "text_loss": 0.2687684893608093
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.770179043146463,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10400390625,
+ "learning_rate": 0.0009559083847705233,
+ "loss": 0.0214,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3010923.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.028984658420085907,
+ "skip_count": 1.0,
+ "step": 1868,
+ "text_loss": 0.6277349591255188
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 8.779571470501908,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08349609375,
+ "learning_rate": 0.0009557812107661584,
+ "loss": 0.0208,
+ "macro_f1": 1.0,
+ "num_tokens": 3015030.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012200530618429184,
+ "skip_count": 1.0,
+ "step": 1870,
+ "text_loss": 0.6293368339538574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.788963897857352,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11962890625,
+ "learning_rate": 0.0009556538621055739,
+ "loss": 0.0268,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3019067.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06365182995796204,
+ "skip_count": 1.0,
+ "step": 1872,
+ "text_loss": 0.39046618342399597
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.115234375,
+ "learning_rate": 0.0009555263388375699,
+ "loss": 0.014,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3022166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041703456081449986,
+ "skip_count": 1.0,
+ "step": 1874,
+ "text_loss": 0.42232340574264526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.807748752568243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11572265625,
+ "learning_rate": 0.0009553986410110134,
+ "loss": 0.016,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3025865.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005841755773872137,
+ "skip_count": 0.0,
+ "step": 1876,
+ "text_loss": 0.37600573897361755
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.817141179923686,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09228515625,
+ "learning_rate": 0.0009552707686748388,
+ "loss": 0.0219,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3029950.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05165952071547508,
+ "skip_count": 1.0,
+ "step": 1878,
+ "text_loss": 0.33717799186706543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.826533607279131,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0849609375,
+ "learning_rate": 0.0009551427218780467,
+ "loss": 0.0219,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3033649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.020680008456110954,
+ "skip_count": 2.0,
+ "step": 1880,
+ "text_loss": 0.5011783838272095
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.835926034634575,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15625,
+ "learning_rate": 0.0009550145006697048,
+ "loss": 0.0217,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3036847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07626450061798096,
+ "skip_count": 2.0,
+ "step": 1882,
+ "text_loss": 0.3066408336162567
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 8.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 0.0009548861050989482,
+ "loss": 0.0136,
+ "macro_f1": 1.0,
+ "num_tokens": 3040353.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010884666815400124,
+ "skip_count": 1.0,
+ "step": 1884,
+ "text_loss": 0.49779415130615234
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0908203125,
+ "learning_rate": 0.0009547575352149778,
+ "loss": 0.0213,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3043504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006704333238303661,
+ "skip_count": 2.0,
+ "step": 1886,
+ "text_loss": 0.12284614145755768
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 8.86410331670091,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.11474609375,
+ "learning_rate": 0.0009546287910670621,
+ "loss": 0.0211,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 3046422.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04799000173807144,
+ "skip_count": 2.0,
+ "step": 1888,
+ "text_loss": 0.1824081838130951
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.873495744056354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1484375,
+ "learning_rate": 0.0009544998727045361,
+ "loss": 0.0306,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3049819.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008139612153172493,
+ "skip_count": 0.0,
+ "step": 1890,
+ "text_loss": 0.18929053843021393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 8.8828881714118,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.09375,
+ "learning_rate": 0.0009543707801768015,
+ "loss": 0.0175,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 3052766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02966771461069584,
+ "skip_count": 3.0,
+ "step": 1892,
+ "text_loss": 0.247748002409935
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 25.0,
+ "epoch": 8.892280598767243,
+ "f1_execute": 0.9411764740943909,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009542415135333267,
+ "loss": 0.0193,
+ "macro_f1": 0.44705885648727417,
+ "num_tokens": 3056427.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03637036308646202,
+ "skip_count": 2.0,
+ "step": 1894,
+ "text_loss": 0.2583999037742615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.901673026122689,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0595703125,
+ "learning_rate": 0.0009541120728236472,
+ "loss": 0.0136,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3059497.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007026574574410915,
+ "skip_count": 0.0,
+ "step": 1896,
+ "text_loss": 0.5222375988960266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.076171875,
+ "learning_rate": 0.0009539824580973646,
+ "loss": 0.0219,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3062187.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003449335927143693,
+ "skip_count": 0.0,
+ "step": 1898,
+ "text_loss": 0.5736427307128906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0009538526694041477,
+ "loss": 0.0163,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3066100.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035463871899992228,
+ "skip_count": 0.0,
+ "step": 1900,
+ "text_loss": 0.5471583604812622
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 8.929850308189023,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.080078125,
+ "learning_rate": 0.0009537227067937318,
+ "loss": 0.0233,
+ "macro_f1": 1.0,
+ "num_tokens": 3068737.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.00597514258697629,
+ "skip_count": 3.0,
+ "step": 1902,
+ "text_loss": 0.36644190549850464
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.939242735544468,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.166015625,
+ "learning_rate": 0.0009535925703159186,
+ "loss": 0.0301,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3071686.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025420479476451874,
+ "skip_count": 2.0,
+ "step": 1904,
+ "text_loss": 0.535789966583252
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.948635162899912,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07568359375,
+ "learning_rate": 0.0009534622600205769,
+ "loss": 0.0145,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3074954.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014377486892044544,
+ "skip_count": 0.0,
+ "step": 1906,
+ "text_loss": 0.19009549915790558
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.958027590255357,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11083984375,
+ "learning_rate": 0.0009533317759576416,
+ "loss": 0.0197,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3077540.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004848944488912821,
+ "skip_count": 0.0,
+ "step": 1908,
+ "text_loss": 0.5022001266479492
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.0009532011181771148,
+ "loss": 0.0217,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3080445.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009480170905590057,
+ "skip_count": 2.0,
+ "step": 1910,
+ "text_loss": 0.35135936737060547
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10400390625,
+ "learning_rate": 0.0009530702867290644,
+ "loss": 0.0185,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3083657.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019353039097040892,
+ "skip_count": 0.0,
+ "step": 1912,
+ "text_loss": 0.5123994946479797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.986204872321691,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1455078125,
+ "learning_rate": 0.0009529392816636256,
+ "loss": 0.0249,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3086837.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010921972570940852,
+ "skip_count": 0.0,
+ "step": 1914,
+ "text_loss": 0.44477662444114685
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.995597299677135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.19140625,
+ "learning_rate": 0.0009528081030309995,
+ "loss": 0.0351,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3089892.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018027103506028652,
+ "skip_count": 0.0,
+ "step": 1916,
+ "text_loss": 0.7356183528900146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.004696213677722,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07568359375,
+ "learning_rate": 0.0009526767508814542,
+ "loss": 0.0236,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3093058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003243023296818137,
+ "skip_count": 0.0,
+ "step": 1918,
+ "text_loss": 0.48823556303977966
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.014088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.080078125,
+ "learning_rate": 0.0009525452252653239,
+ "loss": 0.0175,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3096404.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009360014460980892,
+ "skip_count": 0.0,
+ "step": 1920,
+ "text_loss": 0.21498437225818634
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 9.023481068388612,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.0009524135262330098,
+ "loss": 0.0224,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 3099520.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017444295808672905,
+ "skip_count": 3.0,
+ "step": 1922,
+ "text_loss": 0.27608850598335266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 9.032873495744056,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.0009522816538349789,
+ "loss": 0.0162,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3102956.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06424452364444733,
+ "skip_count": 2.0,
+ "step": 1924,
+ "text_loss": 0.21558666229248047
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 9.042265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0009521496081217651,
+ "loss": 0.0112,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3106565.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002270506462082267,
+ "skip_count": 0.0,
+ "step": 1926,
+ "text_loss": 0.5641813278198242
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.051658350454945,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009520173891439684,
+ "loss": 0.0216,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3109314.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011512448079884052,
+ "skip_count": 1.0,
+ "step": 1928,
+ "text_loss": 0.6351624727249146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0830078125,
+ "learning_rate": 0.0009518849969522556,
+ "loss": 0.0198,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3112956.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003883908037096262,
+ "skip_count": 0.0,
+ "step": 1930,
+ "text_loss": 0.35160085558891296
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.070443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10888671875,
+ "learning_rate": 0.0009517524315973595,
+ "loss": 0.019,
+ "macro_f1": 1.0,
+ "num_tokens": 3115593.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009479222819209099,
+ "skip_count": 3.0,
+ "step": 1932,
+ "text_loss": 0.2900560200214386
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.079835632521279,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0771484375,
+ "learning_rate": 0.0009516196931300794,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3118516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017834696918725967,
+ "skip_count": 2.0,
+ "step": 1934,
+ "text_loss": 0.20094378292560577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12890625,
+ "learning_rate": 0.0009514867816012809,
+ "loss": 0.02,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3122242.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017964740982279181,
+ "skip_count": 0.0,
+ "step": 1936,
+ "text_loss": 0.6498590707778931
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0009513536970618961,
+ "loss": 0.013,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3125645.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007437168620526791,
+ "skip_count": 2.0,
+ "step": 1938,
+ "text_loss": 0.25863033533096313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.108012914587613,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0625,
+ "learning_rate": 0.0009512204395629232,
+ "loss": 0.0184,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3128740.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008759932243265212,
+ "skip_count": 1.0,
+ "step": 1940,
+ "text_loss": 0.5638351440429688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.117405341943059,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.0009510870091554264,
+ "loss": 0.0153,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3131742.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.019906625151634216,
+ "skip_count": 0.0,
+ "step": 1942,
+ "text_loss": 0.8410717844963074
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.126797769298504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12255859375,
+ "learning_rate": 0.0009509534058905369,
+ "loss": 0.016,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3134407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009229081333614886,
+ "skip_count": 0.0,
+ "step": 1944,
+ "text_loss": 0.47506049275398254
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.136190196653947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0009508196298194517,
+ "loss": 0.0123,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3137053.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003630586201325059,
+ "skip_count": 0.0,
+ "step": 1946,
+ "text_loss": 0.32225799560546875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08349609375,
+ "learning_rate": 0.0009506856809934338,
+ "loss": 0.0119,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3140943.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007580445148050785,
+ "skip_count": 0.0,
+ "step": 1948,
+ "text_loss": 0.3120577931404114
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0009505515594638127,
+ "loss": 0.0126,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3144298.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004471861757338047,
+ "skip_count": 0.0,
+ "step": 1950,
+ "text_loss": 0.22052447497844696
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 9.164367478720282,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.09130859375,
+ "learning_rate": 0.0009504172652819843,
+ "loss": 0.023,
+ "macro_f1": 1.0,
+ "num_tokens": 3147069.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009606664068996906,
+ "skip_count": 1.0,
+ "step": 1952,
+ "text_loss": 0.34773921966552734
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0625,
+ "learning_rate": 0.0009502827984994099,
+ "loss": 0.0148,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3149992.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006443799939006567,
+ "skip_count": 1.0,
+ "step": 1954,
+ "text_loss": 0.6442171335220337
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 0.0009501481591676177,
+ "loss": 0.0188,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3153167.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003219039412215352,
+ "skip_count": 0.0,
+ "step": 1956,
+ "text_loss": 0.43369221687316895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.192544760786616,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.000950013347338202,
+ "loss": 0.0152,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3156590.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025551019236445427,
+ "skip_count": 1.0,
+ "step": 1958,
+ "text_loss": 0.294479101896286
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.201937188142061,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1630859375,
+ "learning_rate": 0.0009498783630628225,
+ "loss": 0.0158,
+ "macro_f1": 1.0,
+ "num_tokens": 3159451.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013802438974380493,
+ "skip_count": 2.0,
+ "step": 1960,
+ "text_loss": 0.20888492465019226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.211329615497505,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0009497432063932057,
+ "loss": 0.0137,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 3162889.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02852988988161087,
+ "skip_count": 2.0,
+ "step": 1962,
+ "text_loss": 0.5027125477790833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 0.0009496078773811437,
+ "loss": 0.0136,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3165979.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01784522272646427,
+ "skip_count": 2.0,
+ "step": 1964,
+ "text_loss": 0.1696339100599289
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060302734375,
+ "learning_rate": 0.000949472376078495,
+ "loss": 0.016,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3168683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017019887454807758,
+ "skip_count": 0.0,
+ "step": 1966,
+ "text_loss": 0.48905447125434875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.239506897563839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.000949336702537184,
+ "loss": 0.0108,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3171968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004817947279661894,
+ "skip_count": 2.0,
+ "step": 1968,
+ "text_loss": 0.20984773337841034
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.248899324919284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0009492008568092007,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3175947.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012963006738573313,
+ "skip_count": 0.0,
+ "step": 1970,
+ "text_loss": 0.5215106010437012
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 9.258291752274728,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.044921875,
+ "learning_rate": 0.0009490648389466019,
+ "loss": 0.0135,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 3179348.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03950481489300728,
+ "skip_count": 2.0,
+ "step": 1972,
+ "text_loss": 0.24640929698944092
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.09326171875,
+ "learning_rate": 0.0009489286490015097,
+ "loss": 0.0183,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3182640.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0043345349840819836,
+ "skip_count": 2.0,
+ "step": 1974,
+ "text_loss": 0.6362852454185486
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.277076606985618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07958984375,
+ "learning_rate": 0.0009487922870261122,
+ "loss": 0.0155,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3185657.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015687479171901941,
+ "skip_count": 0.0,
+ "step": 1976,
+ "text_loss": 0.8977144360542297
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.286469034341062,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.061279296875,
+ "learning_rate": 0.0009486557530726638,
+ "loss": 0.0139,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3188772.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010977238416671753,
+ "skip_count": 0.0,
+ "step": 1978,
+ "text_loss": 0.38512736558914185
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 9.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11279296875,
+ "learning_rate": 0.0009485190471934844,
+ "loss": 0.0196,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3193131.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.002264744369313121,
+ "skip_count": 0.0,
+ "step": 1980,
+ "text_loss": 0.4171289801597595
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.305253889051952,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09033203125,
+ "learning_rate": 0.00094838216944096,
+ "loss": 0.0219,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3196668.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.042320676147937775,
+ "skip_count": 1.0,
+ "step": 1982,
+ "text_loss": 0.19008000195026398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 9.314646316407396,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 0.0009482451198675424,
+ "loss": 0.0151,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 3200282.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01796630397439003,
+ "skip_count": 1.0,
+ "step": 1984,
+ "text_loss": 0.5009249448776245
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.324038743762841,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061767578125,
+ "learning_rate": 0.0009481078985257494,
+ "loss": 0.0147,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3204439.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01052347756922245,
+ "skip_count": 1.0,
+ "step": 1986,
+ "text_loss": 0.15319275856018066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.333431171118287,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0732421875,
+ "learning_rate": 0.0009479705054681644,
+ "loss": 0.015,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 3207590.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09640293568372726,
+ "skip_count": 3.0,
+ "step": 1988,
+ "text_loss": 0.3654652535915375
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.34282359847373,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009478329407474366,
+ "loss": 0.0183,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3211172.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012670112773776054,
+ "skip_count": 1.0,
+ "step": 1990,
+ "text_loss": 0.5817596316337585
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 9.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05859375,
+ "learning_rate": 0.000947695204416281,
+ "loss": 0.0121,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3214050.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005263707600533962,
+ "skip_count": 0.0,
+ "step": 1992,
+ "text_loss": 0.5985888242721558
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.361608453184619,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0009475572965274787,
+ "loss": 0.0144,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3217318.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0682850033044815,
+ "skip_count": 0.0,
+ "step": 1994,
+ "text_loss": 0.316506564617157
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.371000880540064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0595703125,
+ "learning_rate": 0.000947419217133876,
+ "loss": 0.019,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3220012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008508823812007904,
+ "skip_count": 2.0,
+ "step": 1996,
+ "text_loss": 0.09665893763303757
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053466796875,
+ "learning_rate": 0.0009472809662883852,
+ "loss": 0.0155,
+ "macro_f1": 1.0,
+ "num_tokens": 3223019.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01100847590714693,
+ "skip_count": 2.0,
+ "step": 1998,
+ "text_loss": 0.4938808083534241
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.389785735250953,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0009471425440439844,
+ "loss": 0.0135,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 3226013.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04953207075595856,
+ "skip_count": 3.0,
+ "step": 2000,
+ "text_loss": 0.22258254885673523
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 9.399178162606399,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07568359375,
+ "learning_rate": 0.0009470039504537173,
+ "loss": 0.0186,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 3230031.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.052884332835674286,
+ "skip_count": 2.0,
+ "step": 2002,
+ "text_loss": 0.1741616576910019
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 9.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0869140625,
+ "learning_rate": 0.0009468651855706931,
+ "loss": 0.0204,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3232991.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008056716993451118,
+ "skip_count": 0.0,
+ "step": 2004,
+ "text_loss": 0.3173636198043823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0009467262494480868,
+ "loss": 0.0136,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3236390.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0053409393876791,
+ "skip_count": 0.0,
+ "step": 2006,
+ "text_loss": 0.5806330442428589
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.427355444672733,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.000946587142139139,
+ "loss": 0.0147,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3239267.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015652200672775507,
+ "skip_count": 0.0,
+ "step": 2008,
+ "text_loss": 0.6214317679405212
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.436747872028178,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11376953125,
+ "learning_rate": 0.000946447863697156,
+ "loss": 0.0151,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 3242569.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011673987843096256,
+ "skip_count": 2.0,
+ "step": 2010,
+ "text_loss": 0.532565712928772
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.446140299383622,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0009463084141755093,
+ "loss": 0.0159,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3245669.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.028480790555477142,
+ "skip_count": 1.0,
+ "step": 2012,
+ "text_loss": 0.25210800766944885
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.455532726739067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0869140625,
+ "learning_rate": 0.0009461687936276364,
+ "loss": 0.0132,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3248751.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007234727032482624,
+ "skip_count": 0.0,
+ "step": 2014,
+ "text_loss": 0.35922971367836
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 9.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0009460290021070402,
+ "loss": 0.0195,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3252614.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.014691276475787163,
+ "skip_count": 0.0,
+ "step": 2016,
+ "text_loss": 0.2747853398323059
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0009458890396672888,
+ "loss": 0.0186,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3256374.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002385235857218504,
+ "skip_count": 0.0,
+ "step": 2018,
+ "text_loss": 0.5268719792366028
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 9.483710008805401,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0009457489063620164,
+ "loss": 0.0133,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 3259792.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.047268565744161606,
+ "skip_count": 2.0,
+ "step": 2020,
+ "text_loss": 0.7785539627075195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.493102436160845,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1494140625,
+ "learning_rate": 0.0009456086022449221,
+ "loss": 0.0218,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3262833.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015878718346357346,
+ "skip_count": 1.0,
+ "step": 2022,
+ "text_loss": 0.42270028591156006
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.50249486351629,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08935546875,
+ "learning_rate": 0.0009454681273697711,
+ "loss": 0.0117,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3265718.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.030749641358852386,
+ "skip_count": 0.0,
+ "step": 2024,
+ "text_loss": 0.18668225407600403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.511887290871735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.0009453274817903931,
+ "loss": 0.012,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3268158.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011538166552782059,
+ "skip_count": 1.0,
+ "step": 2026,
+ "text_loss": 0.34090787172317505
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.521279718227179,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.099609375,
+ "learning_rate": 0.000945186665560684,
+ "loss": 0.0218,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3271082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009527760557830334,
+ "skip_count": 0.0,
+ "step": 2028,
+ "text_loss": 0.2110334187746048
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.530672145582624,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.119140625,
+ "learning_rate": 0.000945045678734605,
+ "loss": 0.0175,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 3273488.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03317151218652725,
+ "skip_count": 3.0,
+ "step": 2030,
+ "text_loss": 0.2233227640390396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.540064572938068,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12451171875,
+ "learning_rate": 0.0009449045213661822,
+ "loss": 0.0201,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3276646.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018510591238737106,
+ "skip_count": 1.0,
+ "step": 2032,
+ "text_loss": 0.16100332140922546
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 9.549457000293513,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.1318359375,
+ "learning_rate": 0.0009447631935095077,
+ "loss": 0.0185,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 3279441.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.028113311156630516,
+ "skip_count": 4.0,
+ "step": 2034,
+ "text_loss": 0.29208317399024963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.558849427648958,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.0009446216952187384,
+ "loss": 0.0164,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3282697.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008379172533750534,
+ "skip_count": 0.0,
+ "step": 2036,
+ "text_loss": 0.16026398539543152
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0009444800265480967,
+ "loss": 0.0178,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3285574.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00941354501992464,
+ "skip_count": 0.0,
+ "step": 2038,
+ "text_loss": 0.29523080587387085
+ },
+ {
+ "acc_repeat": 0.75,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 9.577634282359847,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.8571428656578064,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.076171875,
+ "learning_rate": 0.0009443381875518703,
+ "loss": 0.0197,
+ "macro_f1": 0.8600732684135437,
+ "num_tokens": 3289159.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.04974055662751198,
+ "skip_count": 6.0,
+ "step": 2040,
+ "text_loss": 0.23033179342746735
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.587026709715293,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0537109375,
+ "learning_rate": 0.0009441961782844123,
+ "loss": 0.0146,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3293598.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022241825237870216,
+ "skip_count": 1.0,
+ "step": 2042,
+ "text_loss": 0.8299165368080139
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 0.0009440539988001408,
+ "loss": 0.0159,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3296648.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011019332334399223,
+ "skip_count": 0.0,
+ "step": 2044,
+ "text_loss": 0.18207129836082458
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.605811564426181,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0009439116491535394,
+ "loss": 0.0118,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3300058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002889640862122178,
+ "skip_count": 0.0,
+ "step": 2046,
+ "text_loss": 0.7051978707313538
+ },
+ {
+ "acc_repeat": 0.3333333432674408,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 9.615203991781627,
+ "f1_execute": 0.9333333373069763,
+ "f1_repeat": 0.5,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.078125,
+ "learning_rate": 0.0009437691293991563,
+ "loss": 0.0192,
+ "macro_f1": 0.7634921073913574,
+ "num_tokens": 3303296.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.07741832733154297,
+ "skip_count": 4.0,
+ "step": 2048,
+ "text_loss": 0.15563532710075378
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.09521484375,
+ "learning_rate": 0.0009436264395916061,
+ "loss": 0.0209,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3306204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014225383289158344,
+ "skip_count": 2.0,
+ "step": 2050,
+ "text_loss": 0.18117287755012512
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.633988846492516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1416015625,
+ "learning_rate": 0.0009434835797855672,
+ "loss": 0.0165,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3309444.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023932650219649076,
+ "skip_count": 0.0,
+ "step": 2052,
+ "text_loss": 0.4645874798297882
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.643381273847961,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0009433405500357839,
+ "loss": 0.0153,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3312488.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03193361684679985,
+ "skip_count": 1.0,
+ "step": 2054,
+ "text_loss": 0.5291082859039307
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0009431973503970655,
+ "loss": 0.0134,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3315765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020529816392809153,
+ "skip_count": 0.0,
+ "step": 2056,
+ "text_loss": 0.5877931118011475
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.66216612855885,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.0009430539809242864,
+ "loss": 0.0185,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3318877.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.07907948642969131,
+ "skip_count": 0.0,
+ "step": 2058,
+ "text_loss": 0.3836737871170044
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 9.671558555914293,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009429104416723862,
+ "loss": 0.0163,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3322576.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.003006070153787732,
+ "skip_count": 0.0,
+ "step": 2060,
+ "text_loss": 0.3480920195579529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.680950983269739,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 0.0009427667326963689,
+ "loss": 0.0127,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3325974.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005013179033994675,
+ "skip_count": 0.0,
+ "step": 2062,
+ "text_loss": 0.931358814239502
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.690343410625184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0986328125,
+ "learning_rate": 0.0009426228540513047,
+ "loss": 0.0206,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3329398.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0059848143719136715,
+ "skip_count": 0.0,
+ "step": 2064,
+ "text_loss": 0.47568953037261963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.699735837980628,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0830078125,
+ "learning_rate": 0.0009424788057923277,
+ "loss": 0.0131,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3332029.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00783882662653923,
+ "skip_count": 0.0,
+ "step": 2066,
+ "text_loss": 0.22887596487998962
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 9.709128265336073,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0712890625,
+ "learning_rate": 0.0009423345879746376,
+ "loss": 0.0128,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3334858.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01866884157061577,
+ "skip_count": 2.0,
+ "step": 2068,
+ "text_loss": 0.17724967002868652
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.718520692691518,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06591796875,
+ "learning_rate": 0.000942190200653499,
+ "loss": 0.0162,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3338094.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.028636593371629715,
+ "skip_count": 2.0,
+ "step": 2070,
+ "text_loss": 0.34344956278800964
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 9.727913120046962,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.07568359375,
+ "learning_rate": 0.0009420456438842413,
+ "loss": 0.0165,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3340526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023245645686984062,
+ "skip_count": 2.0,
+ "step": 2072,
+ "text_loss": 0.7276164293289185
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.737305547402407,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11328125,
+ "learning_rate": 0.000941900917722259,
+ "loss": 0.0143,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3343303.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01565689593553543,
+ "skip_count": 0.0,
+ "step": 2074,
+ "text_loss": 0.5665070414543152
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1201171875,
+ "learning_rate": 0.0009417560222230115,
+ "loss": 0.0245,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3346409.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035056080669164658,
+ "skip_count": 0.0,
+ "step": 2076,
+ "text_loss": 0.5112795233726501
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.756090402113296,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06982421875,
+ "learning_rate": 0.0009416109574420229,
+ "loss": 0.0132,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3349220.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027565446216613054,
+ "skip_count": 0.0,
+ "step": 2078,
+ "text_loss": 0.5240910053253174
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 9.765482829468741,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.0009414657234348823,
+ "loss": 0.0186,
+ "macro_f1": 1.0,
+ "num_tokens": 3352627.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.01652451977133751,
+ "skip_count": 2.0,
+ "step": 2080,
+ "text_loss": 1.0217112302780151
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.774875256824185,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1630859375,
+ "learning_rate": 0.0009413203202572438,
+ "loss": 0.0179,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3355392.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.1012420505285263,
+ "skip_count": 2.0,
+ "step": 2082,
+ "text_loss": 0.4085482358932495
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08251953125,
+ "learning_rate": 0.000941174747964826,
+ "loss": 0.0154,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3358425.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004962718114256859,
+ "skip_count": 0.0,
+ "step": 2084,
+ "text_loss": 0.5833504796028137
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 9.793660111535075,
+ "f1_execute": 0.9583333134651184,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.11376953125,
+ "learning_rate": 0.0009410290066134124,
+ "loss": 0.0211,
+ "macro_f1": 0.8083333373069763,
+ "num_tokens": 3361925.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.07889176905155182,
+ "skip_count": 3.0,
+ "step": 2086,
+ "text_loss": 0.38126569986343384
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.803052538890519,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0009408830962588517,
+ "loss": 0.0195,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 3365963.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.033715736120939255,
+ "skip_count": 2.0,
+ "step": 2088,
+ "text_loss": 0.23213914036750793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.812444966245964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0732421875,
+ "learning_rate": 0.0009407370169570567,
+ "loss": 0.0169,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3369422.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014188943896442652,
+ "skip_count": 0.0,
+ "step": 2090,
+ "text_loss": 0.4648318886756897
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.82183739360141,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0712890625,
+ "learning_rate": 0.0009405907687640054,
+ "loss": 0.013,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3372506.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015339684672653675,
+ "skip_count": 1.0,
+ "step": 2092,
+ "text_loss": 0.2563800811767578
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 9.831229820956853,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.054443359375,
+ "learning_rate": 0.0009404443517357404,
+ "loss": 0.0146,
+ "macro_f1": 0.542222261428833,
+ "num_tokens": 3375653.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.06562861055135727,
+ "skip_count": 0.0,
+ "step": 2094,
+ "text_loss": 0.797835111618042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.840622248312298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.000940297765928369,
+ "loss": 0.0136,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3379018.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005745889153331518,
+ "skip_count": 0.0,
+ "step": 2096,
+ "text_loss": 0.4238114655017853
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0712890625,
+ "learning_rate": 0.0009401510113980631,
+ "loss": 0.0207,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3382855.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026634482201188803,
+ "skip_count": 0.0,
+ "step": 2098,
+ "text_loss": 0.4967166483402252
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0791015625,
+ "learning_rate": 0.0009400040882010592,
+ "loss": 0.0166,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3386386.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020642587915062904,
+ "skip_count": 0.0,
+ "step": 2100,
+ "text_loss": 0.44390562176704407
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.868799530378633,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056640625,
+ "learning_rate": 0.0009398569963936589,
+ "loss": 0.017,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3389958.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013722737319767475,
+ "skip_count": 1.0,
+ "step": 2102,
+ "text_loss": 0.7207565903663635
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.878191957734076,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08837890625,
+ "learning_rate": 0.0009397097360322276,
+ "loss": 0.017,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3392892.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002051608171314001,
+ "skip_count": 0.0,
+ "step": 2104,
+ "text_loss": 0.3196398913860321
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.887584385089522,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.000939562307173196,
+ "loss": 0.022,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3396636.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007085663266479969,
+ "skip_count": 0.0,
+ "step": 2106,
+ "text_loss": 0.5663776397705078
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 9.896976812444967,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.11328125,
+ "learning_rate": 0.0009394147098730592,
+ "loss": 0.02,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3399475.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019473131746053696,
+ "skip_count": 2.0,
+ "step": 2108,
+ "text_loss": 0.7708223462104797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0009392669441883767,
+ "loss": 0.0134,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3402350.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028328890912234783,
+ "skip_count": 0.0,
+ "step": 2110,
+ "text_loss": 0.5888006091117859
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10693359375,
+ "learning_rate": 0.0009391190101757724,
+ "loss": 0.0166,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3405561.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023098422214388847,
+ "skip_count": 2.0,
+ "step": 2112,
+ "text_loss": 0.09865197539329529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.925154094511301,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10107421875,
+ "learning_rate": 0.000938970907891935,
+ "loss": 0.0247,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3408513.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002896632067859173,
+ "skip_count": 0.0,
+ "step": 2114,
+ "text_loss": 0.6613234281539917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.934546521866745,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0947265625,
+ "learning_rate": 0.0009388226373936179,
+ "loss": 0.0211,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3411195.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015814457088708878,
+ "skip_count": 0.0,
+ "step": 2116,
+ "text_loss": 0.17363053560256958
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.94393894922219,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12451171875,
+ "learning_rate": 0.0009386741987376381,
+ "loss": 0.015,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 3414875.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02676783688366413,
+ "skip_count": 0.0,
+ "step": 2118,
+ "text_loss": 0.674056887626648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0009385255919808778,
+ "loss": 0.0203,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3418410.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01022857241332531,
+ "skip_count": 1.0,
+ "step": 2120,
+ "text_loss": 0.235092431306839
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 9.962723803933079,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0888671875,
+ "learning_rate": 0.0009383768171802836,
+ "loss": 0.0244,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3421289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013572212308645248,
+ "skip_count": 2.0,
+ "step": 2122,
+ "text_loss": 0.5992844104766846
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0009382278743928659,
+ "loss": 0.0201,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3424781.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0051873656921088696,
+ "skip_count": 2.0,
+ "step": 2124,
+ "text_loss": 0.29915499687194824
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 9.981508658643968,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.07421875,
+ "learning_rate": 0.0009380787636757001,
+ "loss": 0.0155,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 3427942.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030079292133450508,
+ "skip_count": 4.0,
+ "step": 2126,
+ "text_loss": 0.24181491136550903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.990901085999413,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0009379294850859256,
+ "loss": 0.0141,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3431314.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002675612922757864,
+ "skip_count": 0.0,
+ "step": 2128,
+ "text_loss": 0.4669873118400574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0009377800386807465,
+ "loss": 0.0177,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3435020.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009334275498986244,
+ "skip_count": 0.0,
+ "step": 2130,
+ "text_loss": 0.6478219628334045
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 10.009392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.134765625,
+ "learning_rate": 0.0009376304245174306,
+ "loss": 0.0137,
+ "macro_f1": 0.6000000238418579,
+ "num_tokens": 3438276.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.038227908313274384,
+ "skip_count": 2.0,
+ "step": 2132,
+ "text_loss": 0.4401201903820038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.018784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0009374806426533104,
+ "loss": 0.0113,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3440938.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006901399698108435,
+ "skip_count": 0.0,
+ "step": 2134,
+ "text_loss": 0.5948942303657532
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.0009373306931457827,
+ "loss": 0.0121,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3444028.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037061909679323435,
+ "skip_count": 0.0,
+ "step": 2136,
+ "text_loss": 0.5349751114845276
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056884765625,
+ "learning_rate": 0.0009371805760523086,
+ "loss": 0.0111,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3448331.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025877030566334724,
+ "skip_count": 0.0,
+ "step": 2138,
+ "text_loss": 0.4591051936149597
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 10.046962136777223,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.0009370302914304129,
+ "loss": 0.0144,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 3451434.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018742674961686134,
+ "skip_count": 3.0,
+ "step": 2140,
+ "text_loss": 0.23470863699913025
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.056354564132668,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0009368798393376851,
+ "loss": 0.0122,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3454375.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02382594160735607,
+ "skip_count": 1.0,
+ "step": 2142,
+ "text_loss": 0.6077954769134521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 10.065746991488112,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.05517578125,
+ "learning_rate": 0.0009367292198317787,
+ "loss": 0.0164,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3457591.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03331060707569122,
+ "skip_count": 2.0,
+ "step": 2144,
+ "text_loss": 0.3691073954105377
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.075139418843557,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0009365784329704115,
+ "loss": 0.0186,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3460895.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016955457394942641,
+ "skip_count": 0.0,
+ "step": 2146,
+ "text_loss": 0.3947436511516571
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 10.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.0009364274788113651,
+ "loss": 0.0096,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3464101.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006169239990413189,
+ "skip_count": 0.0,
+ "step": 2148,
+ "text_loss": 0.3348555266857147
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 10.093924273554446,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0009362763574124858,
+ "loss": 0.019,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 3467417.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.024033790454268456,
+ "skip_count": 1.0,
+ "step": 2150,
+ "text_loss": 0.496633380651474
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.103316700909891,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0009361250688316829,
+ "loss": 0.0142,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3470917.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024986129719763994,
+ "skip_count": 0.0,
+ "step": 2152,
+ "text_loss": 0.6857671737670898
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 10.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0546875,
+ "learning_rate": 0.0009359736131269312,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3473624.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008183322846889496,
+ "skip_count": 1.0,
+ "step": 2154,
+ "text_loss": 0.13883116841316223
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 10.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0009358219903562684,
+ "loss": 0.0106,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3476472.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011198793537914753,
+ "skip_count": 3.0,
+ "step": 2156,
+ "text_loss": 0.24243666231632233
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.131493982976226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0009356702005777969,
+ "loss": 0.0125,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3479688.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002520184963941574,
+ "skip_count": 0.0,
+ "step": 2158,
+ "text_loss": 0.6407818794250488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.140886410331671,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0791015625,
+ "learning_rate": 0.0009355182438496825,
+ "loss": 0.0142,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3482598.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011065017897635698,
+ "skip_count": 0.0,
+ "step": 2160,
+ "text_loss": 0.7214245796203613
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0009353661202301557,
+ "loss": 0.0144,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3486271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017824085662141442,
+ "skip_count": 0.0,
+ "step": 2162,
+ "text_loss": 0.5140969157218933
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.053466796875,
+ "learning_rate": 0.0009352138297775101,
+ "loss": 0.0145,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3489206.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001542879967018962,
+ "skip_count": 0.0,
+ "step": 2164,
+ "text_loss": 0.7956416606903076
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 10.169063692398003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0771484375,
+ "learning_rate": 0.000935061372550104,
+ "loss": 0.0134,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3492003.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01420794241130352,
+ "skip_count": 3.0,
+ "step": 2166,
+ "text_loss": 0.27489882707595825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 10.178456119753449,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0009349087486063594,
+ "loss": 0.0166,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3494784.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003614309709519148,
+ "skip_count": 1.0,
+ "step": 2168,
+ "text_loss": 0.2962227761745453
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 10.187848547108894,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1259765625,
+ "learning_rate": 0.0009347559580047618,
+ "loss": 0.0175,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 3497886.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.02122853323817253,
+ "skip_count": 4.0,
+ "step": 2170,
+ "text_loss": 0.5919580459594727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 10.197240974464338,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.000934603000803861,
+ "loss": 0.0135,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3500939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02042219042778015,
+ "skip_count": 1.0,
+ "step": 2172,
+ "text_loss": 0.28722381591796875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0009344498770622704,
+ "loss": 0.013,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3504852.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004345106892287731,
+ "skip_count": 0.0,
+ "step": 2174,
+ "text_loss": 0.603236734867096
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.216025829175228,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1064453125,
+ "learning_rate": 0.0009342965868386673,
+ "loss": 0.0101,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3508320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00368050136603415,
+ "skip_count": 0.0,
+ "step": 2176,
+ "text_loss": 0.6020491719245911
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.225418256530672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060302734375,
+ "learning_rate": 0.000934143130191793,
+ "loss": 0.0108,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3511278.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013425769284367561,
+ "skip_count": 0.0,
+ "step": 2178,
+ "text_loss": 0.5954724550247192
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060546875,
+ "learning_rate": 0.000933989507180452,
+ "loss": 0.0149,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3514361.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002896249992772937,
+ "skip_count": 0.0,
+ "step": 2180,
+ "text_loss": 0.39175131916999817
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 10.244203111241562,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 0.0009338357178635135,
+ "loss": 0.0147,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 3517962.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011538350023329258,
+ "skip_count": 1.0,
+ "step": 2182,
+ "text_loss": 0.4482830762863159
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.253595538597006,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0869140625,
+ "learning_rate": 0.0009336817622999093,
+ "loss": 0.011,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3521299.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.022787930443882942,
+ "skip_count": 0.0,
+ "step": 2184,
+ "text_loss": 0.35177817940711975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.262987965952451,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0009335276405486357,
+ "loss": 0.0139,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3524611.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011597735807299614,
+ "skip_count": 1.0,
+ "step": 2186,
+ "text_loss": 0.24868851900100708
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11181640625,
+ "learning_rate": 0.0009333733526687524,
+ "loss": 0.0196,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3528012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014253967441618443,
+ "skip_count": 0.0,
+ "step": 2188,
+ "text_loss": 0.3970910310745239
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.054931640625,
+ "learning_rate": 0.000933218898719383,
+ "loss": 0.0162,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3530908.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001659149187617004,
+ "skip_count": 0.0,
+ "step": 2190,
+ "text_loss": 0.7618573307991028
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0009330642787597141,
+ "loss": 0.0159,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3533993.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005574346985667944,
+ "skip_count": 0.0,
+ "step": 2192,
+ "text_loss": 0.16470147669315338
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.300557675374229,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0791015625,
+ "learning_rate": 0.0009329094928489969,
+ "loss": 0.0121,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3537310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026400673668831587,
+ "skip_count": 0.0,
+ "step": 2194,
+ "text_loss": 0.3400416374206543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 10.309950102729674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0849609375,
+ "learning_rate": 0.0009327545410465452,
+ "loss": 0.0124,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3540045.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008448398672044277,
+ "skip_count": 3.0,
+ "step": 2196,
+ "text_loss": 0.3110542297363281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.31934253008512,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0009325994234117372,
+ "loss": 0.0122,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3544097.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.037553198635578156,
+ "skip_count": 2.0,
+ "step": 2198,
+ "text_loss": 0.36126700043678284
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 10.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09716796875,
+ "learning_rate": 0.000932444140004014,
+ "loss": 0.0124,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3547054.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006464479025453329,
+ "skip_count": 0.0,
+ "step": 2200,
+ "text_loss": 0.4947047233581543
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 10.338127384796008,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1015625,
+ "learning_rate": 0.0009322886908828805,
+ "loss": 0.0138,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3549903.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005384812597185373,
+ "skip_count": 0.0,
+ "step": 2202,
+ "text_loss": 0.5923738479614258
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 10.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0009321330761079052,
+ "loss": 0.0149,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3553745.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015346619300544262,
+ "skip_count": 2.0,
+ "step": 2204,
+ "text_loss": 0.1904175877571106
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 10.356912239506897,
+ "f1_execute": 0.9268292784690857,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 0.00093197729573872,
+ "loss": 0.0203,
+ "macro_f1": 0.8422764539718628,
+ "num_tokens": 3557235.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.1207597479224205,
+ "skip_count": 6.0,
+ "step": 2206,
+ "text_loss": 0.3904837667942047
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.366304666862343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0771484375,
+ "learning_rate": 0.0009318213498350202,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3560795.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003334777895361185,
+ "skip_count": 0.0,
+ "step": 2208,
+ "text_loss": 0.4268290102481842
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.375697094217786,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0537109375,
+ "learning_rate": 0.0009316652384565645,
+ "loss": 0.0123,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3563754.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004230072256177664,
+ "skip_count": 0.0,
+ "step": 2210,
+ "text_loss": 0.40049710869789124
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.385089521573232,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046875,
+ "learning_rate": 0.0009315089616631751,
+ "loss": 0.0106,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3567173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006645230459980667,
+ "skip_count": 0.0,
+ "step": 2212,
+ "text_loss": 0.42568323016166687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.394481948928677,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.0009313525195147376,
+ "loss": 0.0126,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3570831.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0097877848893404,
+ "skip_count": 0.0,
+ "step": 2214,
+ "text_loss": 0.45808279514312744
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 28.0,
+ "epoch": 10.40387437628412,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.5,
+ "grad_norm": 0.076171875,
+ "learning_rate": 0.000931195912071201,
+ "loss": 0.0187,
+ "macro_f1": 0.7018141150474548,
+ "num_tokens": 3573745.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.07351134717464447,
+ "skip_count": 3.0,
+ "step": 2216,
+ "text_loss": 0.285696804523468
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0009310391393925775,
+ "loss": 0.0125,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3576785.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033160944003611803,
+ "skip_count": 0.0,
+ "step": 2218,
+ "text_loss": 0.17516443133354187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 10.422659230995011,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 0.0009308822015389424,
+ "loss": 0.0241,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 3580695.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.052930232137441635,
+ "skip_count": 1.0,
+ "step": 2220,
+ "text_loss": 0.5918155908584595
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 10.432051658350455,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.072265625,
+ "learning_rate": 0.0009307250985704352,
+ "loss": 0.0128,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 3583729.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025454653427004814,
+ "skip_count": 4.0,
+ "step": 2222,
+ "text_loss": 0.2652169466018677
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.0009305678305472575,
+ "loss": 0.0158,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3586775.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011279845610260963,
+ "skip_count": 0.0,
+ "step": 2224,
+ "text_loss": 0.3511691987514496
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10791015625,
+ "learning_rate": 0.000930410397529675,
+ "loss": 0.017,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3589676.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002700264798477292,
+ "skip_count": 0.0,
+ "step": 2226,
+ "text_loss": 0.24045433104038239
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 10.460228940416789,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 0.000930252799578016,
+ "loss": 0.0146,
+ "macro_f1": 1.0,
+ "num_tokens": 3593242.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00826631672680378,
+ "skip_count": 2.0,
+ "step": 2228,
+ "text_loss": 0.3777645528316498
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 10.469621367772234,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0009300950367526728,
+ "loss": 0.0131,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 3596807.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.036221496760845184,
+ "skip_count": 2.0,
+ "step": 2230,
+ "text_loss": 0.502962589263916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0009299371091141001,
+ "loss": 0.0131,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3600150.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006449893582612276,
+ "skip_count": 0.0,
+ "step": 2232,
+ "text_loss": 0.20256924629211426
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 10.488406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0009297790167228161,
+ "loss": 0.012,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3602988.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007872486487030983,
+ "skip_count": 2.0,
+ "step": 2234,
+ "text_loss": 0.42476826906204224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.497798649838568,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0009296207596394022,
+ "loss": 0.0101,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3606071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.027397040277719498,
+ "skip_count": 2.0,
+ "step": 2236,
+ "text_loss": 0.23432791233062744
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0595703125,
+ "learning_rate": 0.0009294623379245028,
+ "loss": 0.0117,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3609389.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01042645052075386,
+ "skip_count": 0.0,
+ "step": 2238,
+ "text_loss": 0.16665785014629364
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.516583504549457,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 0.0009293037516388252,
+ "loss": 0.0161,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3612105.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012458425480872393,
+ "skip_count": 0.0,
+ "step": 2240,
+ "text_loss": 0.59421306848526
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 10.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0009291450008431404,
+ "loss": 0.0185,
+ "macro_f1": 1.0,
+ "num_tokens": 3615439.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005781981628388166,
+ "skip_count": 1.0,
+ "step": 2242,
+ "text_loss": 0.510798454284668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 10.535368359260346,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.0966796875,
+ "learning_rate": 0.0009289860855982814,
+ "loss": 0.0166,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 3618842.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.031195320188999176,
+ "skip_count": 3.0,
+ "step": 2244,
+ "text_loss": 0.7574363350868225
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.544760786615791,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.0009288270059651454,
+ "loss": 0.0133,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3621823.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001746491645462811,
+ "skip_count": 0.0,
+ "step": 2246,
+ "text_loss": 0.5125683546066284
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 10.554153213971237,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.220703125,
+ "learning_rate": 0.0009286677620046918,
+ "loss": 0.0159,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3624502.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03792348504066467,
+ "skip_count": 2.0,
+ "step": 2248,
+ "text_loss": 0.7533677220344543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07763671875,
+ "learning_rate": 0.0009285083537779429,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3627057.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009684451506473124,
+ "skip_count": 0.0,
+ "step": 2250,
+ "text_loss": 0.2219279706478119
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 10.572938068682125,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.11767578125,
+ "learning_rate": 0.0009283487813459845,
+ "loss": 0.0148,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3629720.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022757573053240776,
+ "skip_count": 2.0,
+ "step": 2252,
+ "text_loss": 0.6903313994407654
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 10.582330496037569,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1376953125,
+ "learning_rate": 0.0009281890447699652,
+ "loss": 0.015,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3633234.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003613058477640152,
+ "skip_count": 0.0,
+ "step": 2254,
+ "text_loss": 0.6278893351554871
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0009280291441110961,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3636289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006214062683284283,
+ "skip_count": 0.0,
+ "step": 2256,
+ "text_loss": 0.3011114001274109
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 10.60111535074846,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0009278690794306517,
+ "loss": 0.014,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3640251.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.052556321024894714,
+ "skip_count": 2.0,
+ "step": 2258,
+ "text_loss": 0.19894185662269592
+ },
+ {
+ "acc_repeat": 0.75,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 10.610507778103903,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.8571428656578064,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08251953125,
+ "learning_rate": 0.0009277088507899689,
+ "loss": 0.0163,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 3643527.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.0572301521897316,
+ "skip_count": 1.0,
+ "step": 2260,
+ "text_loss": 0.5593410134315491
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.619900205459349,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.0009275484582504475,
+ "loss": 0.0104,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3646959.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008010074496269226,
+ "skip_count": 0.0,
+ "step": 2262,
+ "text_loss": 0.2128177285194397
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 28.0,
+ "epoch": 10.629292632814794,
+ "f1_execute": 0.95652174949646,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0009273879018735505,
+ "loss": 0.0138,
+ "macro_f1": 0.8521739840507507,
+ "num_tokens": 3651298.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.035729870200157166,
+ "skip_count": 3.0,
+ "step": 2264,
+ "text_loss": 0.2987811267375946
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.638685060170237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.0009272271817208031,
+ "loss": 0.0182,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3655609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002379779238253832,
+ "skip_count": 0.0,
+ "step": 2266,
+ "text_loss": 0.6024088263511658
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0009270662978537939,
+ "loss": 0.0098,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3658444.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008943650871515274,
+ "skip_count": 0.0,
+ "step": 2268,
+ "text_loss": 0.1741207242012024
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 10.657469914881126,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0009269052503341736,
+ "loss": 0.0161,
+ "macro_f1": 0.6595745086669922,
+ "num_tokens": 3662282.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.030201267451047897,
+ "skip_count": 4.0,
+ "step": 2270,
+ "text_loss": 0.7300035953521729
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.666862342236572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0009267440392236562,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3665531.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026635683607310057,
+ "skip_count": 0.0,
+ "step": 2272,
+ "text_loss": 0.31535038352012634
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0009265826645840178,
+ "loss": 0.0151,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3668407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004258926957845688,
+ "skip_count": 0.0,
+ "step": 2274,
+ "text_loss": 0.7272579073905945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 10.68564719694746,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.125,
+ "learning_rate": 0.0009264211264770976,
+ "loss": 0.0154,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 3671503.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.038987524807453156,
+ "skip_count": 4.0,
+ "step": 2276,
+ "text_loss": 0.7488982677459717
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 10.695039624302906,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.099609375,
+ "learning_rate": 0.0009262594249647975,
+ "loss": 0.0164,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3674107.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007211760152131319,
+ "skip_count": 1.0,
+ "step": 2278,
+ "text_loss": 0.1992369294166565
+ },
+ {
+ "acc_repeat": 0.75,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 10.704432051658351,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.8571428656578064,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0546875,
+ "learning_rate": 0.0009260975601090815,
+ "loss": 0.0112,
+ "macro_f1": 0.9446290731430054,
+ "num_tokens": 3677184.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.02538592554628849,
+ "skip_count": 3.0,
+ "step": 2280,
+ "text_loss": 0.46402135491371155
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0009259355319719768,
+ "loss": 0.0162,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3680683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038464947137981653,
+ "skip_count": 0.0,
+ "step": 2282,
+ "text_loss": 0.5804527401924133
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009257733406155726,
+ "loss": 0.0169,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3683928.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004841136280447245,
+ "skip_count": 0.0,
+ "step": 2284,
+ "text_loss": 0.4834538400173187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0009256109861020212,
+ "loss": 0.0115,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3687101.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002191900508478284,
+ "skip_count": 0.0,
+ "step": 2286,
+ "text_loss": 0.8199604749679565
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 10.742001761080129,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.000925448468493537,
+ "loss": 0.0162,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 3690490.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03488675877451897,
+ "skip_count": 2.0,
+ "step": 2288,
+ "text_loss": 0.33263635635375977
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 10.751394188435574,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0009252857878523971,
+ "loss": 0.0134,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3694109.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002897309372201562,
+ "skip_count": 0.0,
+ "step": 2290,
+ "text_loss": 0.47494807839393616
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 10.760786615791018,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05810546875,
+ "learning_rate": 0.000925122944240941,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3697233.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01842675730586052,
+ "skip_count": 2.0,
+ "step": 2292,
+ "text_loss": 0.14693495631217957
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 10.770179043146463,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0009249599377215707,
+ "loss": 0.0146,
+ "macro_f1": 0.5866667032241821,
+ "num_tokens": 3700376.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04169808700680733,
+ "skip_count": 3.0,
+ "step": 2294,
+ "text_loss": 0.38051268458366394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.779571470501908,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.0009247967683567507,
+ "loss": 0.0112,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3703212.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012183113023638725,
+ "skip_count": 1.0,
+ "step": 2296,
+ "text_loss": 0.23789077997207642
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 10.788963897857352,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.05712890625,
+ "learning_rate": 0.0009246334362090077,
+ "loss": 0.0137,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 3706490.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01880069635808468,
+ "skip_count": 2.0,
+ "step": 2298,
+ "text_loss": 0.29067978262901306
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.798356325212797,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.000924469941340931,
+ "loss": 0.0173,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3709804.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.027359159663319588,
+ "skip_count": 0.0,
+ "step": 2300,
+ "text_loss": 0.67828369140625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.807748752568243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.000924306283815172,
+ "loss": 0.0153,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3712824.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003152279881760478,
+ "skip_count": 0.0,
+ "step": 2302,
+ "text_loss": 0.8333184719085693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 10.817141179923686,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0009241424636944445,
+ "loss": 0.0159,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3715385.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0442950464785099,
+ "skip_count": 2.0,
+ "step": 2304,
+ "text_loss": 0.41893699765205383
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 10.826533607279131,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.058837890625,
+ "learning_rate": 0.0009239784810415249,
+ "loss": 0.0137,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 3719080.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015729321166872978,
+ "skip_count": 2.0,
+ "step": 2306,
+ "text_loss": 0.13360483944416046
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 10.835926034634575,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.06787109375,
+ "learning_rate": 0.0009238143359192514,
+ "loss": 0.0136,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 3722439.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.028816604986786842,
+ "skip_count": 3.0,
+ "step": 2308,
+ "text_loss": 0.39594101905822754
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 10.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.000923650028390525,
+ "loss": 0.0166,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3725092.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036455015651881695,
+ "skip_count": 2.0,
+ "step": 2310,
+ "text_loss": 0.6169708371162415
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 10.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.09814453125,
+ "learning_rate": 0.0009234855585183086,
+ "loss": 0.014,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3728412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007565604057163,
+ "skip_count": 1.0,
+ "step": 2312,
+ "text_loss": 0.21257059276103973
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 28.0,
+ "epoch": 10.86410331670091,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.0009233209263656273,
+ "loss": 0.0184,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 3731467.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.02510629966855049,
+ "skip_count": 3.0,
+ "step": 2314,
+ "text_loss": 0.21639840304851532
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.873495744056354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.057861328125,
+ "learning_rate": 0.0009231561319955684,
+ "loss": 0.0154,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3734906.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00872227642685175,
+ "skip_count": 0.0,
+ "step": 2316,
+ "text_loss": 0.35639774799346924
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08349609375,
+ "learning_rate": 0.0009229911754712815,
+ "loss": 0.0176,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3737943.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004695790819823742,
+ "skip_count": 0.0,
+ "step": 2318,
+ "text_loss": 0.5269573330879211
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.892280598767243,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0009228260568559781,
+ "loss": 0.0115,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3741833.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0217357836663723,
+ "skip_count": 0.0,
+ "step": 2320,
+ "text_loss": 0.5110208988189697
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.901673026122689,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1953125,
+ "learning_rate": 0.0009226607762129322,
+ "loss": 0.0201,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3744642.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05595960095524788,
+ "skip_count": 1.0,
+ "step": 2322,
+ "text_loss": 0.6291998624801636
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056884765625,
+ "learning_rate": 0.0009224953336054796,
+ "loss": 0.0161,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3748127.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0071634589694440365,
+ "skip_count": 0.0,
+ "step": 2324,
+ "text_loss": 0.7404762506484985
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.000922329729097018,
+ "loss": 0.0169,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3751373.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011676300782710314,
+ "skip_count": 0.0,
+ "step": 2326,
+ "text_loss": 0.2915459871292114
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.929850308189023,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.061279296875,
+ "learning_rate": 0.0009221639627510075,
+ "loss": 0.0126,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3754518.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01039792038500309,
+ "skip_count": 0.0,
+ "step": 2328,
+ "text_loss": 0.22066321969032288
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0009219980346309702,
+ "loss": 0.0128,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3757621.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032070958986878395,
+ "skip_count": 0.0,
+ "step": 2330,
+ "text_loss": 0.5558560490608215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.948635162899912,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.076171875,
+ "learning_rate": 0.0009218319448004899,
+ "loss": 0.0118,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3760885.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007085457909852266,
+ "skip_count": 0.0,
+ "step": 2332,
+ "text_loss": 0.4348253607749939
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 10.958027590255357,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1103515625,
+ "learning_rate": 0.0009216656933232129,
+ "loss": 0.016,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3764462.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005504854489117861,
+ "skip_count": 1.0,
+ "step": 2334,
+ "text_loss": 0.35828644037246704
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0009214992802628463,
+ "loss": 0.0131,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3767159.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013970810687169433,
+ "skip_count": 0.0,
+ "step": 2336,
+ "text_loss": 0.2956557869911194
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.976812444966246,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.0009213327056831607,
+ "loss": 0.0181,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3770408.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0427570566534996,
+ "skip_count": 1.0,
+ "step": 2338,
+ "text_loss": 0.14883014559745789
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.986204872321691,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0009211659696479875,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3773474.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011273405980318785,
+ "skip_count": 0.0,
+ "step": 2340,
+ "text_loss": 0.26011669635772705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.995597299677135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.059814453125,
+ "learning_rate": 0.00092099907222122,
+ "loss": 0.0148,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3776909.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016178421210497618,
+ "skip_count": 0.0,
+ "step": 2342,
+ "text_loss": 0.49078530073165894
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.004696213677722,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.000920832013466814,
+ "loss": 0.0129,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3780741.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005510095041245222,
+ "skip_count": 0.0,
+ "step": 2344,
+ "text_loss": 0.4870249927043915
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.014088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0009206647934487866,
+ "loss": 0.0114,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3784673.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0047357892617583275,
+ "skip_count": 0.0,
+ "step": 2346,
+ "text_loss": 0.3251725733280182
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0009204974122312167,
+ "loss": 0.0142,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3787503.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00795028731226921,
+ "skip_count": 1.0,
+ "step": 2348,
+ "text_loss": 0.18282145261764526
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060546875,
+ "learning_rate": 0.0009203298698782452,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3790528.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0009506374481134117,
+ "skip_count": 0.0,
+ "step": 2350,
+ "text_loss": 0.4093080461025238
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.042265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0009201621664540747,
+ "loss": 0.0155,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3794134.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005159572698175907,
+ "skip_count": 0.0,
+ "step": 2352,
+ "text_loss": 0.5451981425285339
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.051658350454945,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0009199943020229694,
+ "loss": 0.0148,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3797414.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002356168581172824,
+ "skip_count": 0.0,
+ "step": 2354,
+ "text_loss": 0.3070453405380249
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0810546875,
+ "learning_rate": 0.0009198262766492554,
+ "loss": 0.0141,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3800094.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0051761893555521965,
+ "skip_count": 1.0,
+ "step": 2356,
+ "text_loss": 0.5880904197692871
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.070443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.00091965809039732,
+ "loss": 0.0132,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3803280.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025952060241252184,
+ "skip_count": 0.0,
+ "step": 2358,
+ "text_loss": 0.5210731625556946
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.079835632521279,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06787109375,
+ "learning_rate": 0.0009194897433316127,
+ "loss": 0.0125,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3805866.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0042560105212032795,
+ "skip_count": 2.0,
+ "step": 2360,
+ "text_loss": 0.6472984552383423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07568359375,
+ "learning_rate": 0.0009193212355166446,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3808952.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026232977397739887,
+ "skip_count": 0.0,
+ "step": 2362,
+ "text_loss": 0.450063556432724
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009191525670169881,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3812080.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034355956595391035,
+ "skip_count": 0.0,
+ "step": 2364,
+ "text_loss": 0.49727216362953186
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.108012914587613,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.000918983737897277,
+ "loss": 0.0112,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3815282.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0055653867311775684,
+ "skip_count": 1.0,
+ "step": 2366,
+ "text_loss": 0.6336377859115601
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 11.117405341943059,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0009188147482222071,
+ "loss": 0.008,
+ "macro_f1": 1.0,
+ "num_tokens": 3818106.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.011016021482646465,
+ "skip_count": 2.0,
+ "step": 2368,
+ "text_loss": 0.22513329982757568
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.126797769298504,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0009186455980565358,
+ "loss": 0.0105,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3821228.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.014039464294910431,
+ "skip_count": 0.0,
+ "step": 2370,
+ "text_loss": 0.21331638097763062
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.136190196653947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0009184762874650816,
+ "loss": 0.0128,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3825048.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001088051125407219,
+ "skip_count": 0.0,
+ "step": 2372,
+ "text_loss": 0.6031543612480164
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009183068165127245,
+ "loss": 0.013,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3828781.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006263940595090389,
+ "skip_count": 1.0,
+ "step": 2374,
+ "text_loss": 0.6249601244926453
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06982421875,
+ "learning_rate": 0.0009181371852644062,
+ "loss": 0.0133,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3832507.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001987969037145376,
+ "skip_count": 0.0,
+ "step": 2376,
+ "text_loss": 0.37972065806388855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.164367478720282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0908203125,
+ "learning_rate": 0.0009179673937851299,
+ "loss": 0.0158,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3835644.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007635094691067934,
+ "skip_count": 1.0,
+ "step": 2378,
+ "text_loss": 0.46319663524627686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0830078125,
+ "learning_rate": 0.0009177974421399598,
+ "loss": 0.0137,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3838700.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01617279462516308,
+ "skip_count": 2.0,
+ "step": 2380,
+ "text_loss": 0.32141056656837463
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 0.0009176273303940217,
+ "loss": 0.011,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3841953.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022273799404501915,
+ "skip_count": 2.0,
+ "step": 2382,
+ "text_loss": 0.5908139944076538
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.192544760786616,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0009174570586125026,
+ "loss": 0.0122,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 3845763.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.030915161594748497,
+ "skip_count": 0.0,
+ "step": 2384,
+ "text_loss": 0.41400137543678284
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.201937188142061,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.0009172866268606513,
+ "loss": 0.0122,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3848984.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010480951517820358,
+ "skip_count": 2.0,
+ "step": 2386,
+ "text_loss": 0.2560874819755554
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 0.0009171160352037775,
+ "loss": 0.0124,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3852118.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00809961836785078,
+ "skip_count": 1.0,
+ "step": 2388,
+ "text_loss": 0.28236693143844604
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 11.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0009169452837072521,
+ "loss": 0.0105,
+ "macro_f1": 1.0,
+ "num_tokens": 3855314.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005569872446358204,
+ "skip_count": 1.0,
+ "step": 2390,
+ "text_loss": 0.4578137695789337
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1123046875,
+ "learning_rate": 0.0009167743724365073,
+ "loss": 0.0105,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3858301.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038610948249697685,
+ "skip_count": 1.0,
+ "step": 2392,
+ "text_loss": 0.14082716405391693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.239506897563839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1376953125,
+ "learning_rate": 0.0009166033014570368,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3861296.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017607157351449132,
+ "skip_count": 0.0,
+ "step": 2394,
+ "text_loss": 0.384442001581192
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 11.248899324919284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.0009164320708343954,
+ "loss": 0.0131,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3863985.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009627950377762318,
+ "skip_count": 0.0,
+ "step": 2396,
+ "text_loss": 0.6969521045684814
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.258291752274728,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0009162606806341989,
+ "loss": 0.0107,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3866636.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006915586534887552,
+ "skip_count": 0.0,
+ "step": 2398,
+ "text_loss": 0.48069697618484497
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.0009160891309221242,
+ "loss": 0.0149,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3870867.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013031222624704242,
+ "skip_count": 0.0,
+ "step": 2400,
+ "text_loss": 0.3882075846195221
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.277076606985618,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0009159174217639096,
+ "loss": 0.0112,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 3873663.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.06621067970991135,
+ "skip_count": 1.0,
+ "step": 2402,
+ "text_loss": 0.5740041136741638
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.286469034341062,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0009157455532253547,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3876788.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005957918707281351,
+ "skip_count": 0.0,
+ "step": 2404,
+ "text_loss": 0.26025933027267456
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 11.295861461696507,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.08642578125,
+ "learning_rate": 0.0009155735253723191,
+ "loss": 0.0126,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 3879942.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.039429809898138046,
+ "skip_count": 4.0,
+ "step": 2406,
+ "text_loss": 1.1349908113479614
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.305253889051952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0009154013382707251,
+ "loss": 0.0113,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3882682.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012570557883009315,
+ "skip_count": 0.0,
+ "step": 2408,
+ "text_loss": 0.5611135363578796
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.314646316407396,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0009152289919865543,
+ "loss": 0.0123,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3886425.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017455556662753224,
+ "skip_count": 0.0,
+ "step": 2410,
+ "text_loss": 0.7523751854896545
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.324038743762841,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0009150564865858506,
+ "loss": 0.0114,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3889273.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011178011074662209,
+ "skip_count": 1.0,
+ "step": 2412,
+ "text_loss": 0.26942551136016846
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 11.333431171118287,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.0009148838221347182,
+ "loss": 0.0107,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 3892199.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.019628092646598816,
+ "skip_count": 0.0,
+ "step": 2414,
+ "text_loss": 0.5492315888404846
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0009147109986993225,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3895362.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012255983427166939,
+ "skip_count": 0.0,
+ "step": 2416,
+ "text_loss": 0.23798216879367828
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11669921875,
+ "learning_rate": 0.0009145380163458899,
+ "loss": 0.0178,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3898476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007018954027444124,
+ "skip_count": 0.0,
+ "step": 2418,
+ "text_loss": 0.1923145055770874
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.361608453184619,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0009143648751407074,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3901817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008574824314564466,
+ "skip_count": 0.0,
+ "step": 2420,
+ "text_loss": 0.4001806974411011
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 11.371000880540064,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.11328125,
+ "learning_rate": 0.0009141915751501231,
+ "loss": 0.0102,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3905461.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01572350226342678,
+ "skip_count": 2.0,
+ "step": 2422,
+ "text_loss": 0.19519129395484924
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0009140181164405458,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3908878.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005503420252352953,
+ "skip_count": 0.0,
+ "step": 2424,
+ "text_loss": 0.6937088370323181
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0009138444990784454,
+ "loss": 0.013,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3912053.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007556677330285311,
+ "skip_count": 0.0,
+ "step": 2426,
+ "text_loss": 0.35431069135665894
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.399178162606399,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 0.000913670723130352,
+ "loss": 0.0117,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3915192.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013609991874545813,
+ "skip_count": 0.0,
+ "step": 2428,
+ "text_loss": 0.5171207189559937
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 11.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0009134967886628573,
+ "loss": 0.0115,
+ "macro_f1": 1.0,
+ "num_tokens": 3917927.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010895746760070324,
+ "skip_count": 2.0,
+ "step": 2430,
+ "text_loss": 0.2852934002876282
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.417963017317287,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0009133226957426133,
+ "loss": 0.0132,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3921460.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04196908697485924,
+ "skip_count": 0.0,
+ "step": 2432,
+ "text_loss": 0.4864770770072937
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.427355444672733,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1025390625,
+ "learning_rate": 0.0009131484444363324,
+ "loss": 0.0155,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3924662.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004484197124838829,
+ "skip_count": 0.0,
+ "step": 2434,
+ "text_loss": 0.7568684220314026
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.0009129740348107882,
+ "loss": 0.0114,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3927337.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004351360257714987,
+ "skip_count": 2.0,
+ "step": 2436,
+ "text_loss": 0.5953161716461182
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 11.446140299383622,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 0.0009127994669328151,
+ "loss": 0.0085,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 3930407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01664198748767376,
+ "skip_count": 4.0,
+ "step": 2438,
+ "text_loss": 0.5320524573326111
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.455532726739067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0595703125,
+ "learning_rate": 0.0009126247408693071,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3933184.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017819046042859554,
+ "skip_count": 1.0,
+ "step": 2440,
+ "text_loss": 0.6051273345947266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0009124498566872204,
+ "loss": 0.0105,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3936620.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005519696045666933,
+ "skip_count": 0.0,
+ "step": 2442,
+ "text_loss": 0.12987950444221497
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.474317581449956,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 0.0009122748144535704,
+ "loss": 0.0111,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3940010.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04543351009488106,
+ "skip_count": 2.0,
+ "step": 2444,
+ "text_loss": 0.4642033576965332
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.483710008805401,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0009120996142354338,
+ "loss": 0.0121,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3943135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00550565542653203,
+ "skip_count": 0.0,
+ "step": 2446,
+ "text_loss": 0.5697627067565918
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.493102436160845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 0.0009119242560999477,
+ "loss": 0.0132,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3946650.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008842485956847668,
+ "skip_count": 0.0,
+ "step": 2448,
+ "text_loss": 0.17046524584293365
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08154296875,
+ "learning_rate": 0.0009117487401143095,
+ "loss": 0.0154,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3949470.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005900127813220024,
+ "skip_count": 0.0,
+ "step": 2450,
+ "text_loss": 0.37260866165161133
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 11.511887290871735,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0009115730663457773,
+ "loss": 0.0137,
+ "macro_f1": 1.0,
+ "num_tokens": 3952546.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003409258322790265,
+ "skip_count": 1.0,
+ "step": 2452,
+ "text_loss": 0.5308008193969727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.521279718227179,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0009113972348616698,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3955817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010098597034811974,
+ "skip_count": 1.0,
+ "step": 2454,
+ "text_loss": 0.39226648211479187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 11.530672145582624,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1640625,
+ "learning_rate": 0.0009112212457293658,
+ "loss": 0.0102,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3958911.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08184818178415298,
+ "skip_count": 0.0,
+ "step": 2456,
+ "text_loss": 0.45411455631256104
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0009110450990163047,
+ "loss": 0.0127,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3962584.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009352223132736981,
+ "skip_count": 0.0,
+ "step": 2458,
+ "text_loss": 0.47292324900627136
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.549457000293513,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0009108687947899863,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 3965597.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008150188252329826,
+ "skip_count": 2.0,
+ "step": 2460,
+ "text_loss": 0.33208340406417847
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 11.558849427648958,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0009106923331179707,
+ "loss": 0.0125,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3968664.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.050999004393815994,
+ "skip_count": 2.0,
+ "step": 2462,
+ "text_loss": 0.2459995150566101
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0009105157140678782,
+ "loss": 0.0126,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3971772.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006196586415171623,
+ "skip_count": 1.0,
+ "step": 2464,
+ "text_loss": 0.23956991732120514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.577634282359847,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0009103389377073896,
+ "loss": 0.01,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3976224.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008181816898286343,
+ "skip_count": 0.0,
+ "step": 2466,
+ "text_loss": 0.3235875070095062
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.057373046875,
+ "learning_rate": 0.0009101620041042462,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3978876.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015451472718268633,
+ "skip_count": 0.0,
+ "step": 2468,
+ "text_loss": 0.4038759469985962
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.596419137070736,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09130859375,
+ "learning_rate": 0.000909984913326249,
+ "loss": 0.0131,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3981992.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021785033866763115,
+ "skip_count": 1.0,
+ "step": 2470,
+ "text_loss": 0.6346460580825806
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.605811564426181,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0712890625,
+ "learning_rate": 0.0009098076654412595,
+ "loss": 0.0094,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3984560.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011462471447885036,
+ "skip_count": 0.0,
+ "step": 2472,
+ "text_loss": 0.3449646532535553
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0009096302605171996,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3987548.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014367027906700969,
+ "skip_count": 0.0,
+ "step": 2474,
+ "text_loss": 0.5918350219726562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 0.0009094526986220513,
+ "loss": 0.0124,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3990727.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008977655088528991,
+ "skip_count": 0.0,
+ "step": 2476,
+ "text_loss": 0.463350385427475
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.633988846492516,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0009092749798238563,
+ "loss": 0.015,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3993757.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016712551936507225,
+ "skip_count": 0.0,
+ "step": 2478,
+ "text_loss": 0.5621229410171509
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.643381273847961,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.000909097104190717,
+ "loss": 0.0172,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3997259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04134179651737213,
+ "skip_count": 2.0,
+ "step": 2480,
+ "text_loss": 0.375476598739624
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0009089190717907956,
+ "loss": 0.0117,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4000563.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003462378401309252,
+ "skip_count": 0.0,
+ "step": 2482,
+ "text_loss": 0.5553798675537109
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06982421875,
+ "learning_rate": 0.0009087408826923146,
+ "loss": 0.0182,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4004065.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008057428523898125,
+ "skip_count": 2.0,
+ "step": 2484,
+ "text_loss": 0.4329465329647064
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.671558555914293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0009085625369635564,
+ "loss": 0.0114,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4007119.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005759050603955984,
+ "skip_count": 0.0,
+ "step": 2486,
+ "text_loss": 0.501268744468689
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.680950983269739,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1240234375,
+ "learning_rate": 0.0009083840346728631,
+ "loss": 0.0122,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4010547.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.020763102918863297,
+ "skip_count": 0.0,
+ "step": 2488,
+ "text_loss": 0.480196475982666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.690343410625184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.0009082053758886374,
+ "loss": 0.0117,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4014600.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005801836494356394,
+ "skip_count": 1.0,
+ "step": 2490,
+ "text_loss": 0.18249782919883728
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 11.699735837980628,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0009080265606793416,
+ "loss": 0.0128,
+ "macro_f1": 1.0,
+ "num_tokens": 4017964.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004226063843816519,
+ "skip_count": 1.0,
+ "step": 2492,
+ "text_loss": 0.6573076248168945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.000907847589113498,
+ "loss": 0.0125,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4020694.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004281101748347282,
+ "skip_count": 2.0,
+ "step": 2494,
+ "text_loss": 0.3944586217403412
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.718520692691518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061279296875,
+ "learning_rate": 0.000907668461259689,
+ "loss": 0.0152,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4023757.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008786370046436787,
+ "skip_count": 1.0,
+ "step": 2496,
+ "text_loss": 0.6452898979187012
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.727913120046962,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0009074891771865566,
+ "loss": 0.0125,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4026601.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005209595896303654,
+ "skip_count": 0.0,
+ "step": 2498,
+ "text_loss": 0.9633619785308838
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 11.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0009073097369628028,
+ "loss": 0.013,
+ "macro_f1": 1.0,
+ "num_tokens": 4030321.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.00860709697008133,
+ "skip_count": 1.0,
+ "step": 2500,
+ "text_loss": 0.48566827178001404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0009071301406571893,
+ "loss": 0.0132,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4033234.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035277456045150757,
+ "skip_count": 0.0,
+ "step": 2502,
+ "text_loss": 0.3771554231643677
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.756090402113296,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.000906950388338538,
+ "loss": 0.0136,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4036417.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013424850767478347,
+ "skip_count": 0.0,
+ "step": 2504,
+ "text_loss": 0.8962806463241577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.765482829468741,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09912109375,
+ "learning_rate": 0.0009067704800757301,
+ "loss": 0.0095,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4039564.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010423909407109022,
+ "skip_count": 0.0,
+ "step": 2506,
+ "text_loss": 0.43170279264450073
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.774875256824185,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.000906590415937707,
+ "loss": 0.0094,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4043212.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021780289709568024,
+ "skip_count": 1.0,
+ "step": 2508,
+ "text_loss": 0.41495826840400696
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0009064101959934696,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4046687.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007261929102241993,
+ "skip_count": 1.0,
+ "step": 2510,
+ "text_loss": 0.21821187436580658
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.057861328125,
+ "learning_rate": 0.0009062298203120783,
+ "loss": 0.0102,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4050735.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007447180338203907,
+ "skip_count": 2.0,
+ "step": 2512,
+ "text_loss": 0.1818767935037613
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.803052538890519,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 0.0009060492889626535,
+ "loss": 0.0142,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4054426.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0718490406870842,
+ "skip_count": 0.0,
+ "step": 2514,
+ "text_loss": 0.22798970341682434
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.812444966245964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.099609375,
+ "learning_rate": 0.0009058686020143753,
+ "loss": 0.0183,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4057615.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0052676633931696415,
+ "skip_count": 0.0,
+ "step": 2516,
+ "text_loss": 0.1712338626384735
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0009056877595364832,
+ "loss": 0.0137,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4060338.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018052728846669197,
+ "skip_count": 0.0,
+ "step": 2518,
+ "text_loss": 0.6811438798904419
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.083984375,
+ "learning_rate": 0.0009055067615982761,
+ "loss": 0.0113,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4062887.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009029926732182503,
+ "skip_count": 0.0,
+ "step": 2520,
+ "text_loss": 0.5480356812477112
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.840622248312298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.0009053256082691133,
+ "loss": 0.0106,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4065357.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027515271212905645,
+ "skip_count": 0.0,
+ "step": 2522,
+ "text_loss": 0.5234101414680481
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.0009051442996184127,
+ "loss": 0.0174,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4068111.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002199822571128607,
+ "skip_count": 0.0,
+ "step": 2524,
+ "text_loss": 0.2418575882911682
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0625,
+ "learning_rate": 0.0009049628357156521,
+ "loss": 0.0143,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4071284.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006303096655756235,
+ "skip_count": 2.0,
+ "step": 2526,
+ "text_loss": 0.7948065996170044
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.868799530378633,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.000904781216630369,
+ "loss": 0.0068,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 4074750.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01791904680430889,
+ "skip_count": 2.0,
+ "step": 2528,
+ "text_loss": 0.809726357460022
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 11.878191957734076,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0009045994424321602,
+ "loss": 0.0102,
+ "macro_f1": 1.0,
+ "num_tokens": 4078617.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.016553178429603577,
+ "skip_count": 2.0,
+ "step": 2530,
+ "text_loss": 0.8755000829696655
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.887584385089522,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.061767578125,
+ "learning_rate": 0.0009044175131906817,
+ "loss": 0.0145,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4080936.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00884837657213211,
+ "skip_count": 0.0,
+ "step": 2532,
+ "text_loss": 0.795871913433075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.896976812444967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 0.0009042354289756491,
+ "loss": 0.0122,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4084459.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024387789890170097,
+ "skip_count": 0.0,
+ "step": 2534,
+ "text_loss": 0.18875400722026825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0625,
+ "learning_rate": 0.0009040531898568379,
+ "loss": 0.0171,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4088464.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00491489190608263,
+ "skip_count": 0.0,
+ "step": 2536,
+ "text_loss": 0.334369033575058
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.091796875,
+ "learning_rate": 0.000903870795904082,
+ "loss": 0.0145,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4091659.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004592662677168846,
+ "skip_count": 2.0,
+ "step": 2538,
+ "text_loss": 0.21298295259475708
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 11.925154094511301,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 0.000903688247187275,
+ "loss": 0.0137,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4095496.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011647242121398449,
+ "skip_count": 2.0,
+ "step": 2540,
+ "text_loss": 0.2985081672668457
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.934546521866745,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0009035055437763704,
+ "loss": 0.0124,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4098663.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021238960325717926,
+ "skip_count": 0.0,
+ "step": 2542,
+ "text_loss": 0.35359489917755127
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05859375,
+ "learning_rate": 0.0009033226857413803,
+ "loss": 0.0163,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4101588.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024701557122170925,
+ "skip_count": 0.0,
+ "step": 2544,
+ "text_loss": 1.1577601432800293
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.080078125,
+ "learning_rate": 0.000903139673152376,
+ "loss": 0.012,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4104643.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002499542199075222,
+ "skip_count": 0.0,
+ "step": 2546,
+ "text_loss": 1.0173401832580566
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.962723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.059814453125,
+ "learning_rate": 0.0009029565060794885,
+ "loss": 0.0165,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4109247.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034200598020106554,
+ "skip_count": 0.0,
+ "step": 2548,
+ "text_loss": 0.5690504312515259
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.972116231288524,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.0009027731845929079,
+ "loss": 0.0155,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 4112597.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015981333330273628,
+ "skip_count": 1.0,
+ "step": 2550,
+ "text_loss": 0.294549822807312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 11.981508658643968,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.06103515625,
+ "learning_rate": 0.0009025897087628829,
+ "loss": 0.0064,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4115844.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02606951631605625,
+ "skip_count": 2.0,
+ "step": 2552,
+ "text_loss": 0.22692419588565826
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.990901085999413,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.080078125,
+ "learning_rate": 0.0009024060786597222,
+ "loss": 0.0202,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4118634.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001026194542646408,
+ "skip_count": 0.0,
+ "step": 2554,
+ "text_loss": 0.6807059645652771
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.000902222294353793,
+ "loss": 0.0124,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4122024.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001974924933165312,
+ "skip_count": 0.0,
+ "step": 2556,
+ "text_loss": 0.7373668551445007
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.009392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04833984375,
+ "learning_rate": 0.0009020383559155219,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 4124803.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004662613850086927,
+ "skip_count": 2.0,
+ "step": 2558,
+ "text_loss": 0.21808166801929474
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.018784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0009018542634153943,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4127680.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006881687790155411,
+ "skip_count": 0.0,
+ "step": 2560,
+ "text_loss": 0.25192978978157043
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 12.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0009016700169239551,
+ "loss": 0.0105,
+ "macro_f1": 1.0,
+ "num_tokens": 4130431.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005977808032184839,
+ "skip_count": 1.0,
+ "step": 2562,
+ "text_loss": 0.4700816869735718
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0009014856165118075,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4133535.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007005698047578335,
+ "skip_count": 1.0,
+ "step": 2564,
+ "text_loss": 0.6558199524879456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.046962136777223,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0009013010622496144,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4136534.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007262171246111393,
+ "skip_count": 0.0,
+ "step": 2566,
+ "text_loss": 0.2565421462059021
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 12.056354564132668,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0009011163542080971,
+ "loss": 0.0088,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 4139762.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05431923270225525,
+ "skip_count": 3.0,
+ "step": 2568,
+ "text_loss": 0.19896510243415833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0009009314924580363,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4143398.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003667369019240141,
+ "skip_count": 0.0,
+ "step": 2570,
+ "text_loss": 0.6581419110298157
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.075139418843557,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 0.0009007464770702712,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4146248.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00132099783513695,
+ "skip_count": 0.0,
+ "step": 2572,
+ "text_loss": 0.5316711068153381
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0009005613081157002,
+ "loss": 0.0132,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4149455.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020061524119228125,
+ "skip_count": 0.0,
+ "step": 2574,
+ "text_loss": 0.5400773882865906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05517578125,
+ "learning_rate": 0.0009003759856652802,
+ "loss": 0.0111,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4152774.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002621434163302183,
+ "skip_count": 1.0,
+ "step": 2576,
+ "text_loss": 0.3672606945037842
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.103316700909891,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0009001905097900273,
+ "loss": 0.0121,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4155835.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005290219560265541,
+ "skip_count": 0.0,
+ "step": 2578,
+ "text_loss": 0.8159038424491882
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0009000048805610161,
+ "loss": 0.0119,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4158874.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013576085912063718,
+ "skip_count": 0.0,
+ "step": 2580,
+ "text_loss": 0.5518951416015625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.138671875,
+ "learning_rate": 0.00089981909804938,
+ "loss": 0.0143,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4162076.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021483441814780235,
+ "skip_count": 0.0,
+ "step": 2582,
+ "text_loss": 0.43552228808403015
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.25,
+ "avg_layers": 28.0,
+ "epoch": 12.131493982976226,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0008996331623263114,
+ "loss": 0.0117,
+ "macro_f1": 0.7795917987823486,
+ "num_tokens": 4165041.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0544300302863121,
+ "skip_count": 4.0,
+ "step": 2584,
+ "text_loss": 0.24812501668930054
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.140886410331671,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0008994470734630611,
+ "loss": 0.0101,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4168290.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017150711501017213,
+ "skip_count": 0.0,
+ "step": 2586,
+ "text_loss": 0.6392097473144531
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 12.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0008992608315309388,
+ "loss": 0.015,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4171310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0046473173424601555,
+ "skip_count": 2.0,
+ "step": 2588,
+ "text_loss": 0.6534156799316406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.15967126504256,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06591796875,
+ "learning_rate": 0.0008990744366013125,
+ "loss": 0.0105,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 4174042.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.060913100838661194,
+ "skip_count": 1.0,
+ "step": 2590,
+ "text_loss": 0.5365690588951111
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 12.169063692398003,
+ "f1_execute": 0.9583333134651184,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.0008988878887456093,
+ "loss": 0.0118,
+ "macro_f1": 0.6051587462425232,
+ "num_tokens": 4177666.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06268956512212753,
+ "skip_count": 4.0,
+ "step": 2592,
+ "text_loss": 0.226226806640625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.178456119753449,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0008987011880353149,
+ "loss": 0.0089,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 4180490.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030141465365886688,
+ "skip_count": 2.0,
+ "step": 2594,
+ "text_loss": 0.2581401765346527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 12.187848547108894,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0008985143345419729,
+ "loss": 0.0082,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4183300.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018745863810181618,
+ "skip_count": 2.0,
+ "step": 2596,
+ "text_loss": 0.7778542637825012
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 12.197240974464338,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0008983273283371862,
+ "loss": 0.0096,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4186535.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026792079210281372,
+ "skip_count": 2.0,
+ "step": 2598,
+ "text_loss": 0.34700271487236023
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0008981401694926159,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4189082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001914160675369203,
+ "skip_count": 0.0,
+ "step": 2600,
+ "text_loss": 0.6879339218139648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 12.216025829175228,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0008979528580799815,
+ "loss": 0.0136,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4192330.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007978348061442375,
+ "skip_count": 2.0,
+ "step": 2602,
+ "text_loss": 0.3524550497531891
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 12.225418256530672,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.0008977653941710613,
+ "loss": 0.0134,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4196117.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0035376469604671,
+ "skip_count": 0.0,
+ "step": 2604,
+ "text_loss": 0.42356348037719727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05810546875,
+ "learning_rate": 0.0008975777778376916,
+ "loss": 0.0156,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4200423.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008262477815151215,
+ "skip_count": 1.0,
+ "step": 2606,
+ "text_loss": 0.5272893905639648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.244203111241562,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0732421875,
+ "learning_rate": 0.0008973900091517675,
+ "loss": 0.0114,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4203257.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022957922890782356,
+ "skip_count": 1.0,
+ "step": 2608,
+ "text_loss": 0.2713734805583954
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 12.253595538597006,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043701171875,
+ "learning_rate": 0.000897202088185242,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4206243.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006623407825827599,
+ "skip_count": 2.0,
+ "step": 2610,
+ "text_loss": 0.5920525789260864
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.262987965952451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.0008970140150101274,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4209264.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008602747693657875,
+ "skip_count": 0.0,
+ "step": 2612,
+ "text_loss": 0.33421996235847473
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0008968257896984932,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4212058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024653903674334288,
+ "skip_count": 1.0,
+ "step": 2614,
+ "text_loss": 0.37923356890678406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0008966374123224677,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4214929.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010878405533730984,
+ "skip_count": 0.0,
+ "step": 2616,
+ "text_loss": 0.4350503981113434
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.291165248018785,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0008964488829542376,
+ "loss": 0.0083,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4219170.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02864212542772293,
+ "skip_count": 1.0,
+ "step": 2618,
+ "text_loss": 0.26250728964805603
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 12.300557675374229,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0008962602016660478,
+ "loss": 0.0096,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4222077.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010444172658026218,
+ "skip_count": 2.0,
+ "step": 2620,
+ "text_loss": 0.4718937575817108
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.309950102729674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 0.0008960713685302011,
+ "loss": 0.0105,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4225383.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006409442983567715,
+ "skip_count": 1.0,
+ "step": 2622,
+ "text_loss": 0.30420538783073425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.31934253008512,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0008958823836190588,
+ "loss": 0.005,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4228349.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009996986016631126,
+ "skip_count": 1.0,
+ "step": 2624,
+ "text_loss": 0.5392362475395203
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0008956932470050404,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4232007.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014383369125425816,
+ "skip_count": 0.0,
+ "step": 2626,
+ "text_loss": 0.7112401127815247
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 12.338127384796008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0008955039587606233,
+ "loss": 0.0109,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4235122.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00781513936817646,
+ "skip_count": 3.0,
+ "step": 2628,
+ "text_loss": 0.17802883684635162
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 12.347519812151454,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.0008953145189583429,
+ "loss": 0.0126,
+ "macro_f1": 0.542222261428833,
+ "num_tokens": 4238248.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.062252625823020935,
+ "skip_count": 4.0,
+ "step": 2630,
+ "text_loss": 0.5551572442054749
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0008951249276707933,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4241042.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011421777307987213,
+ "skip_count": 0.0,
+ "step": 2632,
+ "text_loss": 0.7092233896255493
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.366304666862343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07177734375,
+ "learning_rate": 0.0008949351849706261,
+ "loss": 0.0117,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4243939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032689040526747704,
+ "skip_count": 0.0,
+ "step": 2634,
+ "text_loss": 0.19925718009471893
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.375697094217786,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0008947452909305509,
+ "loss": 0.0109,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4247535.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002066014800220728,
+ "skip_count": 0.0,
+ "step": 2636,
+ "text_loss": 0.5249715447425842
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 29.0,
+ "epoch": 12.385089521573232,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.09326171875,
+ "learning_rate": 0.0008945552456233356,
+ "loss": 0.0169,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 4251441.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.029332537204027176,
+ "skip_count": 2.0,
+ "step": 2638,
+ "text_loss": 0.19229578971862793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.394481948928677,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.078125,
+ "learning_rate": 0.0008943650491218058,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4254314.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0075911120511591434,
+ "skip_count": 0.0,
+ "step": 2640,
+ "text_loss": 0.27059751749038696
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.0008941747014988453,
+ "loss": 0.0156,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4257442.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009030844084918499,
+ "skip_count": 0.0,
+ "step": 2642,
+ "text_loss": 0.36747801303863525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.123046875,
+ "learning_rate": 0.0008939842028273956,
+ "loss": 0.0112,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4260386.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007844001986086369,
+ "skip_count": 1.0,
+ "step": 2644,
+ "text_loss": 0.6397647857666016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.422659230995011,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.0008937935531804562,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4263516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018789108144119382,
+ "skip_count": 0.0,
+ "step": 2646,
+ "text_loss": 0.4795534908771515
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.432051658350455,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 0.0008936027526310844,
+ "loss": 0.0098,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4266744.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0348590686917305,
+ "skip_count": 1.0,
+ "step": 2648,
+ "text_loss": 0.27691999077796936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.000893411801252395,
+ "loss": 0.015,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4269766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004543309565633535,
+ "skip_count": 1.0,
+ "step": 2650,
+ "text_loss": 0.18867231905460358
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0008932206991175615,
+ "loss": 0.0141,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4273513.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035277456045150757,
+ "skip_count": 1.0,
+ "step": 2652,
+ "text_loss": 0.45613357424736023
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.460228940416789,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055908203125,
+ "learning_rate": 0.0008930294462998143,
+ "loss": 0.015,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4276878.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011337592266499996,
+ "skip_count": 0.0,
+ "step": 2654,
+ "text_loss": 0.24733254313468933
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0869140625,
+ "learning_rate": 0.0008928380428724419,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4279915.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010295971296727657,
+ "skip_count": 1.0,
+ "step": 2656,
+ "text_loss": 0.41722849011421204
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 12.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0008926464889087903,
+ "loss": 0.0116,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4282888.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017198545392602682,
+ "skip_count": 2.0,
+ "step": 2658,
+ "text_loss": 0.738322377204895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.488406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0008924547844822634,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4285805.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001339946174994111,
+ "skip_count": 0.0,
+ "step": 2660,
+ "text_loss": 0.4802379906177521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.497798649838568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05322265625,
+ "learning_rate": 0.000892262929666323,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4290282.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022340165451169014,
+ "skip_count": 0.0,
+ "step": 2662,
+ "text_loss": 0.6503544449806213
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0008920709245344878,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4294106.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005288850050419569,
+ "skip_count": 1.0,
+ "step": 2664,
+ "text_loss": 0.12312037497758865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.516583504549457,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0008918787691603347,
+ "loss": 0.0121,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4298013.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004259659443050623,
+ "skip_count": 1.0,
+ "step": 2666,
+ "text_loss": 0.3070000112056732
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.000891686463617498,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4300799.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009489355608820915,
+ "skip_count": 1.0,
+ "step": 2668,
+ "text_loss": 0.18535588681697845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055908203125,
+ "learning_rate": 0.0008914940079796696,
+ "loss": 0.0114,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4304641.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025417013093829155,
+ "skip_count": 0.0,
+ "step": 2670,
+ "text_loss": 0.482585072517395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.544760786615791,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0008913014023205988,
+ "loss": 0.0108,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4307462.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006371749565005302,
+ "skip_count": 0.0,
+ "step": 2672,
+ "text_loss": 0.7064456939697266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0008911086467140925,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4310396.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027512952219694853,
+ "skip_count": 0.0,
+ "step": 2674,
+ "text_loss": 0.23532851040363312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05712890625,
+ "learning_rate": 0.000890915741234015,
+ "loss": 0.0133,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4314781.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008253013715147972,
+ "skip_count": 1.0,
+ "step": 2676,
+ "text_loss": 0.30950358510017395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 12.572938068682125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0008907226859542879,
+ "loss": 0.0105,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4317988.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005409995559602976,
+ "skip_count": 2.0,
+ "step": 2678,
+ "text_loss": 0.4930732846260071
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 12.582330496037569,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.060546875,
+ "learning_rate": 0.0008905294809488907,
+ "loss": 0.0084,
+ "macro_f1": 1.0,
+ "num_tokens": 4321014.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0029942214023321867,
+ "skip_count": 1.0,
+ "step": 2680,
+ "text_loss": 0.6224040389060974
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06982421875,
+ "learning_rate": 0.0008903361262918595,
+ "loss": 0.0115,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4324268.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008411120623350143,
+ "skip_count": 1.0,
+ "step": 2682,
+ "text_loss": 0.16296671330928802
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 12.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05126953125,
+ "learning_rate": 0.0008901426220572884,
+ "loss": 0.0138,
+ "macro_f1": 1.0,
+ "num_tokens": 4327494.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01039006095379591,
+ "skip_count": 4.0,
+ "step": 2684,
+ "text_loss": 0.43866512179374695
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.610507778103903,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060791015625,
+ "learning_rate": 0.0008899489683193286,
+ "loss": 0.0107,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4330936.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009329111780971289,
+ "skip_count": 0.0,
+ "step": 2686,
+ "text_loss": 0.44250962138175964
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.619900205459349,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07421875,
+ "learning_rate": 0.0008897551651521885,
+ "loss": 0.0111,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4334123.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003197216661646962,
+ "skip_count": 0.0,
+ "step": 2688,
+ "text_loss": 0.48313501477241516
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.629292632814794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09716796875,
+ "learning_rate": 0.0008895612126301339,
+ "loss": 0.0157,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4337610.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033548236824572086,
+ "skip_count": 0.0,
+ "step": 2690,
+ "text_loss": 0.4715327322483063
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.638685060170237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0008893671108274877,
+ "loss": 0.0118,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4341026.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024757643695920706,
+ "skip_count": 0.0,
+ "step": 2692,
+ "text_loss": 0.43402785062789917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0008891728598186302,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4344422.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003317243419587612,
+ "skip_count": 0.0,
+ "step": 2694,
+ "text_loss": 0.8498559594154358
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 12.657469914881126,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0008889784596779986,
+ "loss": 0.009,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 4347507.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01577926240861416,
+ "skip_count": 3.0,
+ "step": 2696,
+ "text_loss": 0.5646669864654541
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.666862342236572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11328125,
+ "learning_rate": 0.0008887839104800876,
+ "loss": 0.0124,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4350414.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002953822258859873,
+ "skip_count": 0.0,
+ "step": 2698,
+ "text_loss": 0.5145012140274048
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 0.0008885892122994486,
+ "loss": 0.0112,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4354110.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005849295295774937,
+ "skip_count": 0.0,
+ "step": 2700,
+ "text_loss": 0.580982506275177
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.0008883943652106903,
+ "loss": 0.0086,
+ "macro_f1": 1.0,
+ "num_tokens": 4357323.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012347398325800896,
+ "skip_count": 2.0,
+ "step": 2702,
+ "text_loss": 0.2234988808631897
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.695039624302906,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 0.0008881993692884787,
+ "loss": 0.0128,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4360228.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003574999049305916,
+ "skip_count": 1.0,
+ "step": 2704,
+ "text_loss": 0.4261806607246399
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.704432051658351,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0008880042246075365,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4363905.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031574300955981016,
+ "skip_count": 0.0,
+ "step": 2706,
+ "text_loss": 0.691118061542511
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.0008878089312426433,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4366736.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003195564029738307,
+ "skip_count": 0.0,
+ "step": 2708,
+ "text_loss": 0.613926112651825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6000000238418579,
+ "avg_layers": 25.0,
+ "epoch": 12.72321690636924,
+ "f1_execute": 0.9583333134651184,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.75,
+ "grad_norm": 0.054443359375,
+ "learning_rate": 0.0008876134892686363,
+ "loss": 0.011,
+ "macro_f1": 0.5694444179534912,
+ "num_tokens": 4370146.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.038784291595220566,
+ "skip_count": 5.0,
+ "step": 2710,
+ "text_loss": 0.2723451852798462
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0830078125,
+ "learning_rate": 0.000887417898760409,
+ "loss": 0.0126,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4373653.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006457131239585578,
+ "skip_count": 0.0,
+ "step": 2712,
+ "text_loss": 0.31667640805244446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.742001761080129,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10498046875,
+ "learning_rate": 0.000887222159792912,
+ "loss": 0.0155,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 4376993.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.045078590512275696,
+ "skip_count": 1.0,
+ "step": 2714,
+ "text_loss": 0.5872798562049866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.751394188435574,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0008870262724411528,
+ "loss": 0.012,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4380160.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003628545207902789,
+ "skip_count": 0.0,
+ "step": 2716,
+ "text_loss": 0.7468157410621643
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 12.760786615791018,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11181640625,
+ "learning_rate": 0.0008868302367801962,
+ "loss": 0.0118,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 4383100.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05404464527964592,
+ "skip_count": 3.0,
+ "step": 2718,
+ "text_loss": 0.2970244884490967
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0008866340528851629,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4386700.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007000274024903774,
+ "skip_count": 0.0,
+ "step": 2720,
+ "text_loss": 0.34521186351776123
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 12.779571470501908,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 0.0008864377208312313,
+ "loss": 0.0082,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 4390299.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02025366574525833,
+ "skip_count": 2.0,
+ "step": 2722,
+ "text_loss": 1.0536936521530151
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.788963897857352,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.000886241240693636,
+ "loss": 0.0098,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4393353.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00251673418097198,
+ "skip_count": 0.0,
+ "step": 2724,
+ "text_loss": 0.5678093433380127
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.0008860446125476686,
+ "loss": 0.0135,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4396446.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009532532654702663,
+ "skip_count": 0.0,
+ "step": 2726,
+ "text_loss": 0.23775041103363037
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.807748752568243,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.091796875,
+ "learning_rate": 0.0008858478364686776,
+ "loss": 0.0099,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4399977.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008062181062996387,
+ "skip_count": 0.0,
+ "step": 2728,
+ "text_loss": 0.18888695538043976
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.817141179923686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0008856509125320678,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4404406.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007731119985692203,
+ "skip_count": 0.0,
+ "step": 2730,
+ "text_loss": 0.47331541776657104
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.826533607279131,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.0008854538408133006,
+ "loss": 0.0114,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4407165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003115242812782526,
+ "skip_count": 1.0,
+ "step": 2732,
+ "text_loss": 0.491370290517807
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0008852566213878947,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4410101.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008958528051152825,
+ "skip_count": 0.0,
+ "step": 2734,
+ "text_loss": 0.42188262939453125
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 12.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07763671875,
+ "learning_rate": 0.0008850592543314246,
+ "loss": 0.0118,
+ "macro_f1": 1.0,
+ "num_tokens": 4413015.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01139112375676632,
+ "skip_count": 1.0,
+ "step": 2736,
+ "text_loss": 0.4716498553752899
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.854710889345466,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0008848617397195218,
+ "loss": 0.0084,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 4416404.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01609630137681961,
+ "skip_count": 1.0,
+ "step": 2738,
+ "text_loss": 0.19490821659564972
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0008846640776278745,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4419408.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001489170710556209,
+ "skip_count": 0.0,
+ "step": 2740,
+ "text_loss": 0.6443108320236206
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.873495744056354,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0008844662681322269,
+ "loss": 0.0144,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4422067.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0014755792217329144,
+ "skip_count": 0.0,
+ "step": 2742,
+ "text_loss": 0.9150356650352478
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.0008842683113083801,
+ "loss": 0.0149,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4425647.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008962674997746944,
+ "skip_count": 1.0,
+ "step": 2744,
+ "text_loss": 0.7103227972984314
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 12.892280598767243,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0008840702072321915,
+ "loss": 0.0104,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 4428855.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02554207295179367,
+ "skip_count": 3.0,
+ "step": 2746,
+ "text_loss": 0.27141591906547546
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.901673026122689,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.0008838719559795751,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4432838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011747616808861494,
+ "skip_count": 0.0,
+ "step": 2748,
+ "text_loss": 0.4007738530635834
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 12.911065453478134,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0008836735576265009,
+ "loss": 0.0073,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4435793.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017564335837960243,
+ "skip_count": 2.0,
+ "step": 2750,
+ "text_loss": 0.5972410440444946
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044921875,
+ "learning_rate": 0.0008834750122489956,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4438871.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007004009559750557,
+ "skip_count": 0.0,
+ "step": 2752,
+ "text_loss": 0.2294853925704956
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.929850308189023,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0008832763199231423,
+ "loss": 0.0107,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4441846.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014562139986082911,
+ "skip_count": 0.0,
+ "step": 2754,
+ "text_loss": 0.722432017326355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.939242735544468,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0008830774807250802,
+ "loss": 0.013,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4444786.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.024773593991994858,
+ "skip_count": 0.0,
+ "step": 2756,
+ "text_loss": 0.507905125617981
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 12.948635162899912,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0008828784947310049,
+ "loss": 0.0129,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 4448442.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04959975928068161,
+ "skip_count": 2.0,
+ "step": 2758,
+ "text_loss": 0.3617522418498993
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.958027590255357,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1025390625,
+ "learning_rate": 0.000882679362017168,
+ "loss": 0.0149,
+ "macro_f1": 1.0,
+ "num_tokens": 4451401.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005783245898783207,
+ "skip_count": 2.0,
+ "step": 2760,
+ "text_loss": 0.49187400937080383
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0791015625,
+ "learning_rate": 0.0008824800826598778,
+ "loss": 0.0127,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4454537.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00656260596588254,
+ "skip_count": 0.0,
+ "step": 2762,
+ "text_loss": 0.6823583245277405
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0546875,
+ "learning_rate": 0.0008822806567354983,
+ "loss": 0.0111,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4457706.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005298966076225042,
+ "skip_count": 0.0,
+ "step": 2764,
+ "text_loss": 0.554322361946106
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.986204872321691,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 0.0008820810843204501,
+ "loss": 0.0096,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4460710.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03164982795715332,
+ "skip_count": 1.0,
+ "step": 2766,
+ "text_loss": 0.1656961441040039
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.995597299677135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.072265625,
+ "learning_rate": 0.0008818813654912095,
+ "loss": 0.0162,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4464001.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000715116853825748,
+ "skip_count": 0.0,
+ "step": 2768,
+ "text_loss": 0.5818144083023071
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.004696213677722,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 0.0008816815003243093,
+ "loss": 0.0133,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4467364.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002851625671610236,
+ "skip_count": 0.0,
+ "step": 2770,
+ "text_loss": 0.6068631410598755
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.014088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0008814814888963383,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4470681.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004729873035103083,
+ "skip_count": 1.0,
+ "step": 2772,
+ "text_loss": 0.5386646389961243
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.000881281331283941,
+ "loss": 0.0091,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4473734.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031853127293288708,
+ "skip_count": 1.0,
+ "step": 2774,
+ "text_loss": 0.5695263147354126
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0008810810275638182,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4478404.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008977465913631022,
+ "skip_count": 0.0,
+ "step": 2776,
+ "text_loss": 0.4750773310661316
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.042265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0008808805778127269,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4481287.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00469845999032259,
+ "skip_count": 0.0,
+ "step": 2778,
+ "text_loss": 0.14078612625598907
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 13.051658350454945,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0008806799821074796,
+ "loss": 0.0079,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4483929.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01789761893451214,
+ "skip_count": 2.0,
+ "step": 2780,
+ "text_loss": 0.2167191207408905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 0.0008804792405249451,
+ "loss": 0.0123,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4487468.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001018838956952095,
+ "skip_count": 0.0,
+ "step": 2782,
+ "text_loss": 0.5424665212631226
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 28.0,
+ "epoch": 13.070443205165835,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.000880278353142048,
+ "loss": 0.0077,
+ "macro_f1": 0.8200000524520874,
+ "num_tokens": 4490942.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03260354697704315,
+ "skip_count": 3.0,
+ "step": 2784,
+ "text_loss": 0.20994654297828674
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.079835632521279,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05322265625,
+ "learning_rate": 0.0008800773200357683,
+ "loss": 0.0122,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4493986.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003019835101440549,
+ "skip_count": 0.0,
+ "step": 2786,
+ "text_loss": 0.5709528923034668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0008798761412831429,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4498232.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00285192858427763,
+ "skip_count": 0.0,
+ "step": 2788,
+ "text_loss": 0.5103896260261536
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044921875,
+ "learning_rate": 0.0008796748169612634,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4501231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012469831854104996,
+ "skip_count": 0.0,
+ "step": 2790,
+ "text_loss": 0.43669697642326355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.108012914587613,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0008794733471472778,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4504208.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011512776836752892,
+ "skip_count": 1.0,
+ "step": 2792,
+ "text_loss": 0.2299770563840866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.117405341943059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0008792717319183899,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4507013.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00834917277097702,
+ "skip_count": 0.0,
+ "step": 2794,
+ "text_loss": 0.2130603939294815
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.126797769298504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0008790699713518587,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4510286.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008616939187049866,
+ "skip_count": 2.0,
+ "step": 2796,
+ "text_loss": 0.4377101957798004
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.136190196653947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0008788680655249994,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4513762.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003408568911254406,
+ "skip_count": 0.0,
+ "step": 2798,
+ "text_loss": 0.435138463973999
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 13.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0008786660145151826,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4516696.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0029398901388049126,
+ "skip_count": 0.0,
+ "step": 2800,
+ "text_loss": 0.3195655047893524
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0008784638183998348,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4519760.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013777425047010183,
+ "skip_count": 0.0,
+ "step": 2802,
+ "text_loss": 0.8129430413246155
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.164367478720282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0008782614772564379,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4522106.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031694830395281315,
+ "skip_count": 0.0,
+ "step": 2804,
+ "text_loss": 0.18083660304546356
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0008780589911625293,
+ "loss": 0.0114,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4525743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002161208540201187,
+ "skip_count": 0.0,
+ "step": 2806,
+ "text_loss": 0.8228182792663574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07177734375,
+ "learning_rate": 0.0008778563601957021,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4529573.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028444856870919466,
+ "skip_count": 1.0,
+ "step": 2808,
+ "text_loss": 0.3715563118457794
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.192544760786616,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0008776535844336049,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4532452.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003807213855907321,
+ "skip_count": 0.0,
+ "step": 2810,
+ "text_loss": 0.6012523174285889
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.201937188142061,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0008774506639539417,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4536077.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006698979996144772,
+ "skip_count": 0.0,
+ "step": 2812,
+ "text_loss": 0.27097949385643005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.099609375,
+ "learning_rate": 0.0008772475988344722,
+ "loss": 0.013,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4539057.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004849409218877554,
+ "skip_count": 1.0,
+ "step": 2814,
+ "text_loss": 1.026973843574524
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 13.22072204285295,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0008770443891530109,
+ "loss": 0.0115,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 4542253.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019148651510477066,
+ "skip_count": 3.0,
+ "step": 2816,
+ "text_loss": 0.2717585563659668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.230114470208395,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 0.0008768410349874286,
+ "loss": 0.0098,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 4545047.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02231316640973091,
+ "skip_count": 2.0,
+ "step": 2818,
+ "text_loss": 0.274346262216568
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.239506897563839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0008766375364156508,
+ "loss": 0.0091,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4548371.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008014129474759102,
+ "skip_count": 2.0,
+ "step": 2820,
+ "text_loss": 0.22850871086120605
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.248899324919284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.0008764338935156586,
+ "loss": 0.0095,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4551276.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014544493751600385,
+ "skip_count": 0.0,
+ "step": 2822,
+ "text_loss": 0.6308462023735046
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 13.258291752274728,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.000876230106365488,
+ "loss": 0.0123,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4554143.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00818584579974413,
+ "skip_count": 3.0,
+ "step": 2824,
+ "text_loss": 0.3484207093715668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 13.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0008760261750432312,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4557256.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006275608204305172,
+ "skip_count": 3.0,
+ "step": 2826,
+ "text_loss": 0.1927330046892166
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 13.277076606985618,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0008758220996270348,
+ "loss": 0.0103,
+ "macro_f1": 1.0,
+ "num_tokens": 4560202.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0055974251590669155,
+ "skip_count": 2.0,
+ "step": 2828,
+ "text_loss": 0.7796496748924255
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.286469034341062,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.0008756178801951007,
+ "loss": 0.0129,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4563508.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019799957517534494,
+ "skip_count": 0.0,
+ "step": 2830,
+ "text_loss": 0.49633297324180603
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 0.0008754135168256865,
+ "loss": 0.0095,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4566776.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004538947716355324,
+ "skip_count": 0.0,
+ "step": 2832,
+ "text_loss": 0.5346745252609253
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.305253889051952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0008752090095971044,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4569787.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001663343166001141,
+ "skip_count": 0.0,
+ "step": 2834,
+ "text_loss": 0.5524004697799683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.314646316407396,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.000875004358587722,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4572813.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022988212294876575,
+ "skip_count": 0.0,
+ "step": 2836,
+ "text_loss": 0.4232870042324066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.324038743762841,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.000874799563875962,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4575563.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007781553082168102,
+ "skip_count": 1.0,
+ "step": 2838,
+ "text_loss": 0.19239822030067444
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 13.333431171118287,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0008745946255403021,
+ "loss": 0.0072,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4578117.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01872488670051098,
+ "skip_count": 2.0,
+ "step": 2840,
+ "text_loss": 0.2148810178041458
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 13.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0008743895436592749,
+ "loss": 0.0078,
+ "macro_f1": 1.0,
+ "num_tokens": 4582330.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005634195636957884,
+ "skip_count": 1.0,
+ "step": 2842,
+ "text_loss": 0.4929640591144562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048583984375,
+ "learning_rate": 0.0008741843183114685,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4585765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008928569150157273,
+ "skip_count": 0.0,
+ "step": 2844,
+ "text_loss": 0.32702967524528503
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 13.361608453184619,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.0008739789495755253,
+ "loss": 0.0094,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4589000.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014715569093823433,
+ "skip_count": 4.0,
+ "step": 2846,
+ "text_loss": 0.25125816464424133
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.371000880540064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0008737734375301433,
+ "loss": 0.0135,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4592391.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017551190685480833,
+ "skip_count": 0.0,
+ "step": 2848,
+ "text_loss": 0.6595172882080078
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0008735677822540749,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4596662.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006456313421949744,
+ "skip_count": 0.0,
+ "step": 2850,
+ "text_loss": 0.6290773153305054
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0008733619838261276,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4599682.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00765060493722558,
+ "skip_count": 2.0,
+ "step": 2852,
+ "text_loss": 0.3268161416053772
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.399178162606399,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0008731560423251637,
+ "loss": 0.01,
+ "macro_f1": 1.0,
+ "num_tokens": 4603324.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01161442045122385,
+ "skip_count": 2.0,
+ "step": 2854,
+ "text_loss": 0.3029932975769043
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 25.0,
+ "epoch": 13.408570589961844,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.0008729499578301005,
+ "loss": 0.0098,
+ "macro_f1": 0.9555556178092957,
+ "num_tokens": 4606975.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02055389992892742,
+ "skip_count": 5.0,
+ "step": 2856,
+ "text_loss": 0.6268532872200012
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.00087274373041991,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4609629.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013911726418882608,
+ "skip_count": 0.0,
+ "step": 2858,
+ "text_loss": 0.534355640411377
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 13.427355444672733,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0008725373601736188,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4612913.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01010701060295105,
+ "skip_count": 0.0,
+ "step": 2860,
+ "text_loss": 0.3391380310058594
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.0008723308471703085,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4616718.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005969462916254997,
+ "skip_count": 1.0,
+ "step": 2862,
+ "text_loss": 0.47250816226005554
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.446140299383622,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 0.0008721241914891152,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4619680.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027780034579336643,
+ "skip_count": 0.0,
+ "step": 2864,
+ "text_loss": 0.3249278664588928
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.455532726739067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.0008719173932092295,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4622700.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015912104863673449,
+ "skip_count": 0.0,
+ "step": 2866,
+ "text_loss": 0.7789985537528992
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05126953125,
+ "learning_rate": 0.0008717104524098973,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4626637.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036539011634886265,
+ "skip_count": 0.0,
+ "step": 2868,
+ "text_loss": 0.619088351726532
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10400390625,
+ "learning_rate": 0.0008715033691704187,
+ "loss": 0.0118,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4629863.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008402476087212563,
+ "skip_count": 1.0,
+ "step": 2870,
+ "text_loss": 0.5550018548965454
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.483710008805401,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0008712961435701479,
+ "loss": 0.0161,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4632657.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01400839351117611,
+ "skip_count": 1.0,
+ "step": 2872,
+ "text_loss": 0.17368625104427338
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.493102436160845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.0008710887756884947,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4635885.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014573842054232955,
+ "skip_count": 0.0,
+ "step": 2874,
+ "text_loss": 0.5138643383979797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0008708812656049225,
+ "loss": 0.009,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4639341.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002810224425047636,
+ "skip_count": 1.0,
+ "step": 2876,
+ "text_loss": 0.70310378074646
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 27.0,
+ "epoch": 13.511887290871735,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0008706736133989497,
+ "loss": 0.0105,
+ "macro_f1": 0.9449735879898071,
+ "num_tokens": 4642163.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.029783209785819054,
+ "skip_count": 4.0,
+ "step": 2878,
+ "text_loss": 0.26898008584976196
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.521279718227179,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0008704658191501491,
+ "loss": 0.0095,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4645858.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009193966398015618,
+ "skip_count": 0.0,
+ "step": 2880,
+ "text_loss": 0.6047570705413818
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 13.530672145582624,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.0008702578829381475,
+ "loss": 0.0131,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 4649237.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.05698608607053757,
+ "skip_count": 4.0,
+ "step": 2882,
+ "text_loss": 0.10695219784975052
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0008700498048426269,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4652362.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011786938412114978,
+ "skip_count": 0.0,
+ "step": 2884,
+ "text_loss": 0.4442957937717438
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 13.549457000293513,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.0008698415849433229,
+ "loss": 0.0092,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4655616.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.02142646163702011,
+ "skip_count": 0.0,
+ "step": 2886,
+ "text_loss": 0.5820964574813843
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.558849427648958,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0008696332233200262,
+ "loss": 0.0121,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4659294.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004038636106997728,
+ "skip_count": 0.0,
+ "step": 2888,
+ "text_loss": 0.11847645789384842
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 0.0008694247200525806,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4662512.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013256469974294305,
+ "skip_count": 0.0,
+ "step": 2890,
+ "text_loss": 0.4873582720756531
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.577634282359847,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0008692160752208856,
+ "loss": 0.0129,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4666190.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04477972164750099,
+ "skip_count": 1.0,
+ "step": 2892,
+ "text_loss": 0.44243401288986206
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.09521484375,
+ "learning_rate": 0.0008690072889048941,
+ "loss": 0.0127,
+ "macro_f1": 1.0,
+ "num_tokens": 4668884.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004407547414302826,
+ "skip_count": 2.0,
+ "step": 2894,
+ "text_loss": 0.6847127079963684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0008687983611846133,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4672093.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005245382897555828,
+ "skip_count": 1.0,
+ "step": 2896,
+ "text_loss": 0.25583332777023315
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.605811564426181,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 0.0008685892921401049,
+ "loss": 0.0108,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4674917.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010470855049788952,
+ "skip_count": 0.0,
+ "step": 2898,
+ "text_loss": 0.41998377442359924
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0008683800818514844,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4677739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009026622399687767,
+ "skip_count": 2.0,
+ "step": 2900,
+ "text_loss": 0.303053081035614
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09619140625,
+ "learning_rate": 0.0008681707303989215,
+ "loss": 0.0108,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4680721.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004500916693359613,
+ "skip_count": 0.0,
+ "step": 2902,
+ "text_loss": 0.5573288798332214
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.633988846492516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06982421875,
+ "learning_rate": 0.0008679612378626404,
+ "loss": 0.0098,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4683339.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005047840531915426,
+ "skip_count": 1.0,
+ "step": 2904,
+ "text_loss": 0.321353554725647
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.643381273847961,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0008677516043229187,
+ "loss": 0.0083,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4686453.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010256914421916008,
+ "skip_count": 1.0,
+ "step": 2906,
+ "text_loss": 0.4300784468650818
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 13.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 0.0008675418298600883,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4689645.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0022669637110084295,
+ "skip_count": 0.0,
+ "step": 2908,
+ "text_loss": 0.5064885020256042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0008673319145545358,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4692320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011188550852239132,
+ "skip_count": 0.0,
+ "step": 2910,
+ "text_loss": 0.7114819884300232
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.671558555914293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0008671218584867003,
+ "loss": 0.0102,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4695116.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002966561820358038,
+ "skip_count": 2.0,
+ "step": 2912,
+ "text_loss": 0.5662392973899841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.680950983269739,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0008669116617370762,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4698040.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012894890969619155,
+ "skip_count": 0.0,
+ "step": 2914,
+ "text_loss": 0.718977689743042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.690343410625184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0008667013243862111,
+ "loss": 0.0162,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4700963.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007232456118799746,
+ "skip_count": 0.0,
+ "step": 2916,
+ "text_loss": 0.3447718024253845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.699735837980628,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.000866490846514707,
+ "loss": 0.0075,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4704471.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015166680328547955,
+ "skip_count": 0.0,
+ "step": 2918,
+ "text_loss": 0.454946368932724
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 13.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 0.000866280228203219,
+ "loss": 0.0073,
+ "macro_f1": 1.0,
+ "num_tokens": 4707238.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0061312485486269,
+ "skip_count": 1.0,
+ "step": 2920,
+ "text_loss": 0.721788227558136
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.718520692691518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055908203125,
+ "learning_rate": 0.0008660694695324564,
+ "loss": 0.0125,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4711323.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00169933564029634,
+ "skip_count": 0.0,
+ "step": 2922,
+ "text_loss": 0.7562121748924255
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.727913120046962,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0008658585705831829,
+ "loss": 0.0128,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4714417.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022731393110007048,
+ "skip_count": 0.0,
+ "step": 2924,
+ "text_loss": 0.5726147890090942
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.737305547402407,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0008656475314362148,
+ "loss": 0.0131,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 4717445.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.06477782875299454,
+ "skip_count": 3.0,
+ "step": 2926,
+ "text_loss": 0.4505867660045624
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 27.0,
+ "epoch": 13.74669797475785,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0008654363521724229,
+ "loss": 0.0129,
+ "macro_f1": 0.9449735879898071,
+ "num_tokens": 4722253.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.027405790984630585,
+ "skip_count": 4.0,
+ "step": 2928,
+ "text_loss": 0.24767601490020752
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.756090402113296,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0537109375,
+ "learning_rate": 0.0008652250328727315,
+ "loss": 0.0112,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4725465.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006544729229062796,
+ "skip_count": 2.0,
+ "step": 2930,
+ "text_loss": 0.4478724002838135
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 13.765482829468741,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.0008650135736181184,
+ "loss": 0.0134,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4729213.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0055119614116847515,
+ "skip_count": 0.0,
+ "step": 2932,
+ "text_loss": 0.6749323010444641
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 0.0008648019744896154,
+ "loss": 0.0101,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4732280.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008374541997909546,
+ "skip_count": 0.0,
+ "step": 2934,
+ "text_loss": 0.4647359251976013
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 13.78426768417963,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 0.0008645902355683077,
+ "loss": 0.0091,
+ "macro_f1": 0.6595745086669922,
+ "num_tokens": 4736244.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.068686343729496,
+ "skip_count": 4.0,
+ "step": 2936,
+ "text_loss": 0.5356017351150513
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 13.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0008643783569353339,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4739810.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.017954571172595024,
+ "skip_count": 0.0,
+ "step": 2938,
+ "text_loss": 0.3145926296710968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.803052538890519,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.054443359375,
+ "learning_rate": 0.0008641663386718863,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4742720.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006261351052671671,
+ "skip_count": 1.0,
+ "step": 2940,
+ "text_loss": 0.3200613856315613
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 13.812444966245964,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0008639541808592109,
+ "loss": 0.0093,
+ "macro_f1": 1.0,
+ "num_tokens": 4745870.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0025341357104480267,
+ "skip_count": 1.0,
+ "step": 2942,
+ "text_loss": 0.5020416378974915
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0008637418835786067,
+ "loss": 0.0094,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4748943.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008970048278570175,
+ "skip_count": 2.0,
+ "step": 2944,
+ "text_loss": 0.14517110586166382
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055908203125,
+ "learning_rate": 0.0008635294469114265,
+ "loss": 0.0112,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4751360.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002133632078766823,
+ "skip_count": 0.0,
+ "step": 2946,
+ "text_loss": 0.5367856025695801
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.840622248312298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08837890625,
+ "learning_rate": 0.0008633168709390766,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4754403.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011866620043292642,
+ "skip_count": 0.0,
+ "step": 2948,
+ "text_loss": 0.38302522897720337
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 13.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0008631041557430163,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4757867.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0026854004245251417,
+ "skip_count": 0.0,
+ "step": 2950,
+ "text_loss": 0.43433454632759094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05859375,
+ "learning_rate": 0.0008628913014047585,
+ "loss": 0.0102,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4761171.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002433479530736804,
+ "skip_count": 0.0,
+ "step": 2952,
+ "text_loss": 0.4725971519947052
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.868799530378633,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0008626783080058696,
+ "loss": 0.0066,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4764752.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017182493582367897,
+ "skip_count": 0.0,
+ "step": 2954,
+ "text_loss": 0.460641473531723
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.878191957734076,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12353515625,
+ "learning_rate": 0.0008624651756279687,
+ "loss": 0.0198,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4767453.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018134774873033166,
+ "skip_count": 0.0,
+ "step": 2956,
+ "text_loss": 0.4091459810733795
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.887584385089522,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053466796875,
+ "learning_rate": 0.000862251904352729,
+ "loss": 0.0108,
+ "macro_f1": 0.9259259104728699,
+ "num_tokens": 4771110.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.0365753099322319,
+ "skip_count": 3.0,
+ "step": 2958,
+ "text_loss": 0.22408585250377655
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.896976812444967,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 0.000862038494261876,
+ "loss": 0.0109,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4774464.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.024343067780137062,
+ "skip_count": 1.0,
+ "step": 2960,
+ "text_loss": 0.16483014822006226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0008618249454371891,
+ "loss": 0.01,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4777894.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008310087723657489,
+ "skip_count": 0.0,
+ "step": 2962,
+ "text_loss": 0.5573428869247437
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0008616112579605006,
+ "loss": 0.0117,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4781116.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0065494864247739315,
+ "skip_count": 0.0,
+ "step": 2964,
+ "text_loss": 0.18816794455051422
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.925154094511301,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.0008613974319136957,
+ "loss": 0.009,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4784886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019726944155991077,
+ "skip_count": 0.0,
+ "step": 2966,
+ "text_loss": 0.5097305774688721
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.934546521866745,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0849609375,
+ "learning_rate": 0.0008611834673787134,
+ "loss": 0.0118,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4787563.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006327496841549873,
+ "skip_count": 0.0,
+ "step": 2968,
+ "text_loss": 0.6953814029693604
+ },
+ {
+ "acc_repeat": 0.3333333432674408,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 13.94393894922219,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.5,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056884765625,
+ "learning_rate": 0.0008609693644375449,
+ "loss": 0.0086,
+ "macro_f1": 0.8200000524520874,
+ "num_tokens": 4790421.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.042896661907434464,
+ "skip_count": 1.0,
+ "step": 2970,
+ "text_loss": 0.2573051154613495
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 13.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.14453125,
+ "learning_rate": 0.000860755123172235,
+ "loss": 0.0096,
+ "macro_f1": 1.0,
+ "num_tokens": 4793786.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.013228793628513813,
+ "skip_count": 1.0,
+ "step": 2972,
+ "text_loss": 0.46614497900009155
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.962723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.0008605407436648815,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4796864.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007294759154319763,
+ "skip_count": 2.0,
+ "step": 2974,
+ "text_loss": 0.21555091440677643
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 13.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.057861328125,
+ "learning_rate": 0.0008603262259976348,
+ "loss": 0.0129,
+ "macro_f1": 1.0,
+ "num_tokens": 4800080.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024024227168411016,
+ "skip_count": 5.0,
+ "step": 2976,
+ "text_loss": 0.7855485081672668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0008601115702526987,
+ "loss": 0.0113,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4802899.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001433031284250319,
+ "skip_count": 0.0,
+ "step": 2978,
+ "text_loss": 0.6777765154838562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.990901085999413,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.0008598967765123293,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4805835.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003073975909501314,
+ "skip_count": 0.0,
+ "step": 2980,
+ "text_loss": 0.5926910638809204
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 14.0,
+ "f1_execute": 0.9333333373069763,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.05322265625,
+ "learning_rate": 0.0008596818448588364,
+ "loss": 0.0139,
+ "macro_f1": 0.8666667342185974,
+ "num_tokens": 4809028.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06438573449850082,
+ "skip_count": 6.0,
+ "step": 2982,
+ "text_loss": 0.23975612223148346
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.009392427355445,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0008594667753745821,
+ "loss": 0.0054,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4812831.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014817612245678902,
+ "skip_count": 1.0,
+ "step": 2984,
+ "text_loss": 0.17292268574237823
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 14.018784854710889,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07421875,
+ "learning_rate": 0.0008592515681419813,
+ "loss": 0.0078,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4816005.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.025407327339053154,
+ "skip_count": 0.0,
+ "step": 2986,
+ "text_loss": 0.6403061151504517
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0008590362232435018,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4818901.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006826757453382015,
+ "skip_count": 0.0,
+ "step": 2988,
+ "text_loss": 0.2572069466114044
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0008588207407616644,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4823120.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009054148104041815,
+ "skip_count": 0.0,
+ "step": 2990,
+ "text_loss": 0.4827076196670532
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.046962136777223,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0008586051207790422,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4825774.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012294676853343844,
+ "skip_count": 0.0,
+ "step": 2992,
+ "text_loss": 0.40157821774482727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 14.056354564132668,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.052734375,
+ "learning_rate": 0.0008583893633782612,
+ "loss": 0.0084,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4828841.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011474622413516045,
+ "skip_count": 2.0,
+ "step": 2994,
+ "text_loss": 0.14842072129249573
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.058837890625,
+ "learning_rate": 0.0008581734686419999,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4831458.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009154081344604492,
+ "skip_count": 2.0,
+ "step": 2996,
+ "text_loss": 0.365400105714798
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.075139418843557,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.00085795743665299,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4834609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002899336162954569,
+ "skip_count": 0.0,
+ "step": 2998,
+ "text_loss": 0.5574684143066406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.0008577412674940152,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4838324.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034664268605411053,
+ "skip_count": 0.0,
+ "step": 3000,
+ "text_loss": 0.6752855777740479
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0008575249612479117,
+ "loss": 0.0127,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4841877.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036425739526748657,
+ "skip_count": 2.0,
+ "step": 3002,
+ "text_loss": 0.6332980394363403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.103316700909891,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 0.0008573085179975685,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4845840.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013783496106043458,
+ "skip_count": 0.0,
+ "step": 3004,
+ "text_loss": 0.4219617545604706
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0008570919378259274,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4848766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004823608323931694,
+ "skip_count": 1.0,
+ "step": 3006,
+ "text_loss": 0.7987180948257446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.000856875220815982,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4852310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014760984340682626,
+ "skip_count": 0.0,
+ "step": 3008,
+ "text_loss": 0.35592713952064514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.131493982976226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0008566583670507788,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4856146.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031717263627797365,
+ "skip_count": 1.0,
+ "step": 3010,
+ "text_loss": 0.19379083812236786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.140886410331671,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.0008564413766134164,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4859386.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003361492184922099,
+ "skip_count": 0.0,
+ "step": 3012,
+ "text_loss": 0.39129266142845154
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048583984375,
+ "learning_rate": 0.0008562242495870463,
+ "loss": 0.0113,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4862661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010563990799710155,
+ "skip_count": 0.0,
+ "step": 3014,
+ "text_loss": 0.5966938734054565
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0008560069860548716,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4865410.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001233913702890277,
+ "skip_count": 0.0,
+ "step": 3016,
+ "text_loss": 0.3386077880859375
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.169063692398003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.0008557895861001484,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4868931.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018066301709041,
+ "skip_count": 0.0,
+ "step": 3018,
+ "text_loss": 0.5222050547599792
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.178456119753449,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0008555720498061845,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4873492.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0050385501235723495,
+ "skip_count": 1.0,
+ "step": 3020,
+ "text_loss": 0.4558849334716797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.187848547108894,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0008553543772563403,
+ "loss": 0.009,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4877026.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004828717093914747,
+ "skip_count": 0.0,
+ "step": 3022,
+ "text_loss": 0.36598992347717285
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 25.0,
+ "epoch": 14.197240974464338,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.06103515625,
+ "learning_rate": 0.0008551365685340285,
+ "loss": 0.0084,
+ "macro_f1": 0.9555556178092957,
+ "num_tokens": 4879655.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02049369551241398,
+ "skip_count": 5.0,
+ "step": 3024,
+ "text_loss": 0.5069093704223633
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 14.206633401819783,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0008549186237227138,
+ "loss": 0.0088,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 4882606.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03947242721915245,
+ "skip_count": 2.0,
+ "step": 3026,
+ "text_loss": 0.2600715458393097
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 14.216025829175228,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.0008547005429059128,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4885246.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0026363315992057323,
+ "skip_count": 0.0,
+ "step": 3028,
+ "text_loss": 0.37642326951026917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.225418256530672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0008544823261671948,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4888109.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003858231008052826,
+ "skip_count": 0.0,
+ "step": 3030,
+ "text_loss": 0.5875385999679565
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 14.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061279296875,
+ "learning_rate": 0.0008542639735901804,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 4891168.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004789089784026146,
+ "skip_count": 1.0,
+ "step": 3032,
+ "text_loss": 0.6417325139045715
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.244203111241562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0008540454852585434,
+ "loss": 0.0115,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4894355.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007334680762141943,
+ "skip_count": 2.0,
+ "step": 3034,
+ "text_loss": 0.23697198927402496
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 14.253595538597006,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0008538268612560084,
+ "loss": 0.0058,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 4897543.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022096361964941025,
+ "skip_count": 3.0,
+ "step": 3036,
+ "text_loss": 0.1989550143480301
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.262987965952451,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 0.0008536081016663527,
+ "loss": 0.0101,
+ "macro_f1": 1.0,
+ "num_tokens": 4900752.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037680594250559807,
+ "skip_count": 2.0,
+ "step": 3038,
+ "text_loss": 0.5001366138458252
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0008533892065734055,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4903581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032373068388551474,
+ "skip_count": 1.0,
+ "step": 3040,
+ "text_loss": 0.5019411444664001
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0008531701760610476,
+ "loss": 0.0121,
+ "macro_f1": 1.0,
+ "num_tokens": 4907108.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0078013185411691666,
+ "skip_count": 2.0,
+ "step": 3042,
+ "text_loss": 0.3460627794265747
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 28.0,
+ "epoch": 14.291165248018785,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.04833984375,
+ "learning_rate": 0.000852951010213212,
+ "loss": 0.0089,
+ "macro_f1": 0.8200000524520874,
+ "num_tokens": 4911269.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03576689213514328,
+ "skip_count": 3.0,
+ "step": 3044,
+ "text_loss": 0.268994003534317
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 14.300557675374229,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0008527317091138835,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 4914203.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0032140621915459633,
+ "skip_count": 1.0,
+ "step": 3046,
+ "text_loss": 0.9998719692230225
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.309950102729674,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0008525122728470987,
+ "loss": 0.0102,
+ "macro_f1": 1.0,
+ "num_tokens": 4918562.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008559177629649639,
+ "skip_count": 3.0,
+ "step": 3048,
+ "text_loss": 0.3062439560890198
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0008522927014969459,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4921940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008735597133636475,
+ "skip_count": 2.0,
+ "step": 3050,
+ "text_loss": 0.3637430965900421
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05517578125,
+ "learning_rate": 0.0008520729951475652,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4925416.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012709591537714005,
+ "skip_count": 0.0,
+ "step": 3052,
+ "text_loss": 0.542036235332489
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.338127384796008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0008518531538831488,
+ "loss": 0.0096,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4928695.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010660928674042225,
+ "skip_count": 1.0,
+ "step": 3054,
+ "text_loss": 0.43144503235816956
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.059326171875,
+ "learning_rate": 0.00085163317778794,
+ "loss": 0.0096,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4931504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004558971151709557,
+ "skip_count": 2.0,
+ "step": 3056,
+ "text_loss": 0.5257010459899902
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.0008514130669462341,
+ "loss": 0.0105,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4934935.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010774781927466393,
+ "skip_count": 2.0,
+ "step": 3058,
+ "text_loss": 0.26061776280403137
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.366304666862343,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0008511928214423782,
+ "loss": 0.0103,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 4938047.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.014763157814741135,
+ "skip_count": 2.0,
+ "step": 3060,
+ "text_loss": 0.2856905460357666
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 14.375697094217786,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0008509724413607705,
+ "loss": 0.0087,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4941041.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004613345488905907,
+ "skip_count": 0.0,
+ "step": 3062,
+ "text_loss": 0.2870287001132965
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.385089521573232,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0008507519267858612,
+ "loss": 0.015,
+ "macro_f1": 1.0,
+ "num_tokens": 4944708.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008584189228713512,
+ "skip_count": 2.0,
+ "step": 3064,
+ "text_loss": 0.15828095376491547
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.394481948928677,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0008505312778021519,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4948295.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014670816017314792,
+ "skip_count": 0.0,
+ "step": 3066,
+ "text_loss": 0.36697930097579956
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.0008503104944941958,
+ "loss": 0.0107,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4951983.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005348859820514917,
+ "skip_count": 2.0,
+ "step": 3068,
+ "text_loss": 0.21612997353076935
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0008500895769465972,
+ "loss": 0.0111,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4955023.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013203793205320835,
+ "skip_count": 0.0,
+ "step": 3070,
+ "text_loss": 0.9757798314094543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.422659230995011,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 0.0008498685252440124,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4957600.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006907356437295675,
+ "skip_count": 0.0,
+ "step": 3072,
+ "text_loss": 0.356107234954834
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.432051658350455,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061279296875,
+ "learning_rate": 0.0008496473394711487,
+ "loss": 0.0116,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4960746.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027704904787242413,
+ "skip_count": 1.0,
+ "step": 3074,
+ "text_loss": 0.6812908053398132
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0008494260197127649,
+ "loss": 0.0093,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4963845.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036796489730477333,
+ "skip_count": 2.0,
+ "step": 3076,
+ "text_loss": 0.7215370535850525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0556640625,
+ "learning_rate": 0.0008492045660536712,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4966887.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037137691397219896,
+ "skip_count": 1.0,
+ "step": 3078,
+ "text_loss": 0.8700299859046936
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 14.460228940416789,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0008489829785787291,
+ "loss": 0.0078,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 4969859.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016492314636707306,
+ "skip_count": 2.0,
+ "step": 3080,
+ "text_loss": 0.6520360112190247
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043701171875,
+ "learning_rate": 0.0008487612573728513,
+ "loss": 0.0094,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4972628.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004022917244583368,
+ "skip_count": 2.0,
+ "step": 3082,
+ "text_loss": 0.17498187720775604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0008485394025210016,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4975475.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009141159243881702,
+ "skip_count": 1.0,
+ "step": 3084,
+ "text_loss": 0.5975366234779358
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.488406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 0.0008483174141081956,
+ "loss": 0.0113,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4978858.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031561285723000765,
+ "skip_count": 0.0,
+ "step": 3086,
+ "text_loss": 0.18748866021633148
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.497798649838568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0008480952922194991,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4982142.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007894713780842721,
+ "skip_count": 0.0,
+ "step": 3088,
+ "text_loss": 0.42083197832107544
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.0008478730369400302,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4984872.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005908289458602667,
+ "skip_count": 0.0,
+ "step": 3090,
+ "text_loss": 0.45337188243865967
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.516583504549457,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0008476506483549573,
+ "loss": 0.0101,
+ "macro_f1": 1.0,
+ "num_tokens": 4988137.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016509373672306538,
+ "skip_count": 2.0,
+ "step": 3092,
+ "text_loss": 0.6397262811660767
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0008474281265495002,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4991164.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004088304936885834,
+ "skip_count": 1.0,
+ "step": 3094,
+ "text_loss": 0.18352322280406952
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0008472054716089295,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4993876.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005200014915317297,
+ "skip_count": 0.0,
+ "step": 3096,
+ "text_loss": 0.2776511013507843
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.544760786615791,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0008469826836185673,
+ "loss": 0.01,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 4997068.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012686059810221195,
+ "skip_count": 2.0,
+ "step": 3098,
+ "text_loss": 0.23209233582019806
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.0008467597626637858,
+ "loss": 0.0074,
+ "macro_f1": 1.0,
+ "num_tokens": 5000038.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006401528604328632,
+ "skip_count": 2.0,
+ "step": 3100,
+ "text_loss": 0.45936745405197144
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.56354564132668,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0008465367088300093,
+ "loss": 0.0075,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5002870.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016640547662973404,
+ "skip_count": 1.0,
+ "step": 3102,
+ "text_loss": 0.44502779841423035
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.572938068682125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0008463135222027124,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5006357.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008411331102252007,
+ "skip_count": 2.0,
+ "step": 3104,
+ "text_loss": 0.3414570391178131
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.582330496037569,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0008460902028674204,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5009059.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010406570509076118,
+ "skip_count": 0.0,
+ "step": 3106,
+ "text_loss": 0.5931221842765808
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0008458667509097098,
+ "loss": 0.0115,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5012327.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001959054498001933,
+ "skip_count": 0.0,
+ "step": 3108,
+ "text_loss": 0.5191171169281006
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0008456431664152078,
+ "loss": 0.0127,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5015472.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000994380097836256,
+ "skip_count": 0.0,
+ "step": 3110,
+ "text_loss": 0.4455361068248749
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.610507778103903,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0008454194494695923,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5018901.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037662344984710217,
+ "skip_count": 0.0,
+ "step": 3112,
+ "text_loss": 0.5335362553596497
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 14.619900205459349,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0008451956001585923,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5022520.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008664715103805065,
+ "skip_count": 3.0,
+ "step": 3114,
+ "text_loss": 0.16230148077011108
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.629292632814794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.000844971618567987,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5025505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015904927859082818,
+ "skip_count": 0.0,
+ "step": 3116,
+ "text_loss": 0.6989432573318481
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.638685060170237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0008447475047836068,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5028767.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005853322334587574,
+ "skip_count": 1.0,
+ "step": 3118,
+ "text_loss": 0.31420737504959106
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 14.648077487525683,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0008445232588913325,
+ "loss": 0.0115,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5032577.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012760105542838573,
+ "skip_count": 0.0,
+ "step": 3120,
+ "text_loss": 0.5534627437591553
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0008442988809770953,
+ "loss": 0.0095,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5035381.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022257440723478794,
+ "skip_count": 0.0,
+ "step": 3122,
+ "text_loss": 0.42492759227752686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.666862342236572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0008440743711268775,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5038743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004648433532565832,
+ "skip_count": 0.0,
+ "step": 3124,
+ "text_loss": 0.16404685378074646
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0008438497294267117,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5041492.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006313877180218697,
+ "skip_count": 0.0,
+ "step": 3126,
+ "text_loss": 0.23191484808921814
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 14.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0008436249559626807,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5043955.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0036270488053560257,
+ "skip_count": 0.0,
+ "step": 3128,
+ "text_loss": 0.5782018303871155
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.695039624302906,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0008434000508209187,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5047571.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003809858812019229,
+ "skip_count": 1.0,
+ "step": 3130,
+ "text_loss": 0.7129825949668884
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.704432051658351,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0008431750140876092,
+ "loss": 0.0128,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5051608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022369057405740023,
+ "skip_count": 0.0,
+ "step": 3132,
+ "text_loss": 0.4433445930480957
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.713824479013795,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.000842949845848987,
+ "loss": 0.0135,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 5054656.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0425117202103138,
+ "skip_count": 2.0,
+ "step": 3134,
+ "text_loss": 0.38721024990081787
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0712890625,
+ "learning_rate": 0.0008427245461913368,
+ "loss": 0.0121,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5059108.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018077283166348934,
+ "skip_count": 0.0,
+ "step": 3136,
+ "text_loss": 0.7496368885040283
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.12109375,
+ "learning_rate": 0.0008424991152009941,
+ "loss": 0.0111,
+ "macro_f1": 1.0,
+ "num_tokens": 5062371.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008801834657788277,
+ "skip_count": 2.0,
+ "step": 3138,
+ "text_loss": 0.5337086319923401
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 14.742001761080129,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0008422735529643444,
+ "loss": 0.0097,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5065593.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00548676960170269,
+ "skip_count": 3.0,
+ "step": 3140,
+ "text_loss": 0.2561623156070709
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.751394188435574,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0008420478595678233,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5068271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006389956455677748,
+ "skip_count": 0.0,
+ "step": 3142,
+ "text_loss": 0.15605193376541138
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.760786615791018,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07958984375,
+ "learning_rate": 0.0008418220350979175,
+ "loss": 0.0128,
+ "macro_f1": 1.0,
+ "num_tokens": 5071358.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012387622147798538,
+ "skip_count": 2.0,
+ "step": 3144,
+ "text_loss": 0.3085838258266449
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0008415960796411628,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5075584.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00311864772811532,
+ "skip_count": 1.0,
+ "step": 3146,
+ "text_loss": 0.4786977469921112
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.779571470501908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1591796875,
+ "learning_rate": 0.0008413699932841461,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5078388.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030679800547659397,
+ "skip_count": 0.0,
+ "step": 3148,
+ "text_loss": 0.5222916603088379
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.788963897857352,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0008411437761135039,
+ "loss": 0.011,
+ "macro_f1": 1.0,
+ "num_tokens": 5081584.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012907958589494228,
+ "skip_count": 2.0,
+ "step": 3150,
+ "text_loss": 0.5369884371757507
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0008409174282159232,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5084450.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012314042076468468,
+ "skip_count": 2.0,
+ "step": 3152,
+ "text_loss": 0.25685277581214905
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 14.807748752568243,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.000840690949678141,
+ "loss": 0.0091,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5087865.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00899206381291151,
+ "skip_count": 0.0,
+ "step": 3154,
+ "text_loss": 0.1717093288898468
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.817141179923686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06103515625,
+ "learning_rate": 0.0008404643405869441,
+ "loss": 0.0098,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5090857.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013312003575265408,
+ "skip_count": 0.0,
+ "step": 3156,
+ "text_loss": 0.27446436882019043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.826533607279131,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1533203125,
+ "learning_rate": 0.0008402376010291695,
+ "loss": 0.0126,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5093917.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002653320087119937,
+ "skip_count": 0.0,
+ "step": 3158,
+ "text_loss": 0.4237489402294159
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0008400107310917045,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5096656.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012976993806660175,
+ "skip_count": 2.0,
+ "step": 3160,
+ "text_loss": 0.42361980676651
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.000839783730861486,
+ "loss": 0.0097,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5099582.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006936746649444103,
+ "skip_count": 2.0,
+ "step": 3162,
+ "text_loss": 0.26656073331832886
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.0008395566004255008,
+ "loss": 0.0127,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5102908.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006619359832257032,
+ "skip_count": 1.0,
+ "step": 3164,
+ "text_loss": 0.590774416923523
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.0008393293398707858,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5105829.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010120268911123276,
+ "skip_count": 2.0,
+ "step": 3166,
+ "text_loss": 0.605930507183075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.873495744056354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.0008391019492844275,
+ "loss": 0.0108,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5109850.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004940980114042759,
+ "skip_count": 2.0,
+ "step": 3168,
+ "text_loss": 0.12973152101039886
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0008388744287535627,
+ "loss": 0.0094,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5113353.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031777634285390377,
+ "skip_count": 1.0,
+ "step": 3170,
+ "text_loss": 0.18577200174331665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052734375,
+ "learning_rate": 0.0008386467783653775,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5116421.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005431659985333681,
+ "skip_count": 0.0,
+ "step": 3172,
+ "text_loss": 0.2302747517824173
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 14.901673026122689,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.000838418998207108,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5119457.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0077286697924137115,
+ "skip_count": 4.0,
+ "step": 3174,
+ "text_loss": 0.19606637954711914
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.0008381910883660399,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5123201.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003982985392212868,
+ "skip_count": 0.0,
+ "step": 3176,
+ "text_loss": 0.716376006603241
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.09423828125,
+ "learning_rate": 0.0008379630489295089,
+ "loss": 0.0109,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5126035.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005626026075333357,
+ "skip_count": 1.0,
+ "step": 3178,
+ "text_loss": 0.5144625902175903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.929850308189023,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0008377348799849,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5129179.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015458245761692524,
+ "skip_count": 2.0,
+ "step": 3180,
+ "text_loss": 0.29887503385543823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 14.939242735544468,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0008375065816196479,
+ "loss": 0.0086,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 5132149.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012210468761622906,
+ "skip_count": 2.0,
+ "step": 3182,
+ "text_loss": 0.8981851935386658
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.948635162899912,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0008372781539212371,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5135287.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0052537876181304455,
+ "skip_count": 0.0,
+ "step": 3184,
+ "text_loss": 0.4245666563510895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.958027590255357,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0008370495969772014,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5138589.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012873421423137188,
+ "skip_count": 2.0,
+ "step": 3186,
+ "text_loss": 0.40581050515174866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 14.9674200176108,
+ "f1_execute": 0.95652174949646,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.0008368209108751244,
+ "loss": 0.0127,
+ "macro_f1": 0.6521739363670349,
+ "num_tokens": 5141635.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.07720445841550827,
+ "skip_count": 4.0,
+ "step": 3188,
+ "text_loss": 0.3755173981189728
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0008365920957026389,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5144728.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001440995605662465,
+ "skip_count": 0.0,
+ "step": 3190,
+ "text_loss": 0.5067034363746643
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 14.986204872321691,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0008363631515474275,
+ "loss": 0.0089,
+ "macro_f1": 0.6538461446762085,
+ "num_tokens": 5147963.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.018752984702587128,
+ "skip_count": 2.0,
+ "step": 3192,
+ "text_loss": 0.20224551856517792
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.995597299677135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0008361340784972217,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5151184.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005360354552976787,
+ "skip_count": 0.0,
+ "step": 3194,
+ "text_loss": 0.4588058292865753
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.004696213677722,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0008359048766398031,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5153889.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009184491937048733,
+ "skip_count": 1.0,
+ "step": 3196,
+ "text_loss": 0.2980220317840576
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.014088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.000835675546063002,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5156758.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001252970308996737,
+ "skip_count": 0.0,
+ "step": 3198,
+ "text_loss": 0.6775755882263184
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 0.0008354460868546985,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5160247.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037315806839615107,
+ "skip_count": 0.0,
+ "step": 3200,
+ "text_loss": 0.35867011547088623
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 15.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0008352164991028217,
+ "loss": 0.0092,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5163456.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001497485558502376,
+ "skip_count": 0.0,
+ "step": 3202,
+ "text_loss": 0.690290093421936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.042265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0008349867828953501,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5166139.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001051135826855898,
+ "skip_count": 0.0,
+ "step": 3204,
+ "text_loss": 0.3340415954589844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.051658350454945,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0008347569383203113,
+ "loss": 0.0098,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5169009.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010544003453105688,
+ "skip_count": 0.0,
+ "step": 3206,
+ "text_loss": 0.8584878444671631
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0008345269654657823,
+ "loss": 0.0085,
+ "macro_f1": 1.0,
+ "num_tokens": 5172618.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007312417030334473,
+ "skip_count": 1.0,
+ "step": 3208,
+ "text_loss": 0.19500218331813812
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.070443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0008342968644198892,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5175857.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00276504410430789,
+ "skip_count": 0.0,
+ "step": 3210,
+ "text_loss": 0.5446314215660095
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.079835632521279,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0008340666352708068,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5178585.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002669303445145488,
+ "skip_count": 0.0,
+ "step": 3212,
+ "text_loss": 0.3687484860420227
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0008338362781067596,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5181777.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031585274264216423,
+ "skip_count": 0.0,
+ "step": 3214,
+ "text_loss": 0.27325859665870667
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.000833605793016021,
+ "loss": 0.009,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5184312.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008807534351944923,
+ "skip_count": 2.0,
+ "step": 3216,
+ "text_loss": 0.4466548562049866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.108012914587613,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0008333751800869133,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5187497.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003171310294419527,
+ "skip_count": 0.0,
+ "step": 3218,
+ "text_loss": 0.5423526763916016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.117405341943059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0008331444394078076,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5190982.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016481258207932115,
+ "skip_count": 2.0,
+ "step": 3220,
+ "text_loss": 0.48984917998313904
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.126797769298504,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.000832913571067124,
+ "loss": 0.0107,
+ "macro_f1": 1.0,
+ "num_tokens": 5194044.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003957313951104879,
+ "skip_count": 1.0,
+ "step": 3222,
+ "text_loss": 0.4533331096172333
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.136190196653947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0008326825751533322,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5197092.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016904744552448392,
+ "skip_count": 0.0,
+ "step": 3224,
+ "text_loss": 0.5538802742958069
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0008324514517549501,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5199941.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005608258303254843,
+ "skip_count": 1.0,
+ "step": 3226,
+ "text_loss": 0.416242778301239
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 15.154975051364836,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0008322202009605444,
+ "loss": 0.0072,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 5202618.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.020965175703167915,
+ "skip_count": 2.0,
+ "step": 3228,
+ "text_loss": 0.17496295273303986
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 15.164367478720282,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0008319888228587311,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 5206414.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.021259209141135216,
+ "skip_count": 5.0,
+ "step": 3230,
+ "text_loss": 0.22471418976783752
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0008317573175381745,
+ "loss": 0.0115,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5209768.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018647604156285524,
+ "skip_count": 0.0,
+ "step": 3232,
+ "text_loss": 0.4415269196033478
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.0008315256850875881,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5213257.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002345515415072441,
+ "skip_count": 0.0,
+ "step": 3234,
+ "text_loss": 0.347247838973999
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 15.192544760786616,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0008312939255957336,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5215800.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007112892810255289,
+ "skip_count": 3.0,
+ "step": 3236,
+ "text_loss": 0.31091734766960144
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.201937188142061,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0008310620391514219,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5219205.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00432228296995163,
+ "skip_count": 0.0,
+ "step": 3238,
+ "text_loss": 0.3421775996685028
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0008308300258435124,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5222422.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0076514314860105515,
+ "skip_count": 2.0,
+ "step": 3240,
+ "text_loss": 0.22378318011760712
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0008305978857609128,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5225625.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007617069641128182,
+ "skip_count": 0.0,
+ "step": 3242,
+ "text_loss": 0.5880323648452759
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0008303656189925799,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5229113.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017418119823560119,
+ "skip_count": 0.0,
+ "step": 3244,
+ "text_loss": 0.3302813768386841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.239506897563839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0008301332256275183,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5232061.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026667986530810595,
+ "skip_count": 0.0,
+ "step": 3246,
+ "text_loss": 0.5679706335067749
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.248899324919284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0008299007057547821,
+ "loss": 0.0106,
+ "macro_f1": 1.0,
+ "num_tokens": 5235279.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011016624979674816,
+ "skip_count": 2.0,
+ "step": 3248,
+ "text_loss": 0.5081504583358765
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 15.258291752274728,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0008296680594634731,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5239655.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005492044147104025,
+ "skip_count": 0.0,
+ "step": 3250,
+ "text_loss": 0.14675180613994598
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0008294352868427418,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5243579.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00404445780441165,
+ "skip_count": 1.0,
+ "step": 3252,
+ "text_loss": 0.4201085865497589
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.277076606985618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.0008292023879817871,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5247059.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006886140909045935,
+ "skip_count": 1.0,
+ "step": 3254,
+ "text_loss": 0.2289208322763443
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.286469034341062,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.057861328125,
+ "learning_rate": 0.0008289693629698564,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5249940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005736657767556608,
+ "skip_count": 0.0,
+ "step": 3256,
+ "text_loss": 0.5670450925827026
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.295861461696507,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0008287362118962452,
+ "loss": 0.006,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5253580.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011349895037710667,
+ "skip_count": 1.0,
+ "step": 3258,
+ "text_loss": 0.5042323470115662
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.305253889051952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0008285029348502973,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5257080.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013626761501654983,
+ "skip_count": 0.0,
+ "step": 3260,
+ "text_loss": 0.3227672874927521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.314646316407396,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.0008282695319214053,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5259951.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00471635302528739,
+ "skip_count": 0.0,
+ "step": 3262,
+ "text_loss": 0.20773714780807495
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.324038743762841,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0008280360031990093,
+ "loss": 0.0107,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5263314.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010472415015101433,
+ "skip_count": 2.0,
+ "step": 3264,
+ "text_loss": 0.34397366642951965
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.333431171118287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.000827802348772598,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5267358.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007814752752892673,
+ "skip_count": 0.0,
+ "step": 3266,
+ "text_loss": 0.747342586517334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.0008275685687317084,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5270400.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000902949133887887,
+ "skip_count": 0.0,
+ "step": 3268,
+ "text_loss": 0.43782034516334534
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0008273346631659252,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5273147.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00043462219764478505,
+ "skip_count": 0.0,
+ "step": 3270,
+ "text_loss": 0.6358205080032349
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.361608453184619,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0008271006321648816,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5277638.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002211218234151602,
+ "skip_count": 0.0,
+ "step": 3272,
+ "text_loss": 0.20220105350017548
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 15.371000880540064,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0008268664758182589,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5280638.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010536720044910908,
+ "skip_count": 0.0,
+ "step": 3274,
+ "text_loss": 0.7579061388969421
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.0008266321942157859,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5283847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017158017726615071,
+ "skip_count": 0.0,
+ "step": 3276,
+ "text_loss": 0.669302761554718
+ },
+ {
+ "acc_repeat": 0.800000011920929,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.389785735250953,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.888888955116272,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06005859375,
+ "learning_rate": 0.0008263977874472399,
+ "loss": 0.0088,
+ "macro_f1": 0.9544159770011902,
+ "num_tokens": 5286627.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.011220700107514858,
+ "skip_count": 4.0,
+ "step": 3278,
+ "text_loss": 0.8703984022140503
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.399178162606399,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0008261632556024461,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5289766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020442772656679153,
+ "skip_count": 0.0,
+ "step": 3280,
+ "text_loss": 0.5009346008300781
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10107421875,
+ "learning_rate": 0.0008259285987712774,
+ "loss": 0.0106,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5293010.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005645765457302332,
+ "skip_count": 0.0,
+ "step": 3282,
+ "text_loss": 0.2546011209487915
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0008256938170436549,
+ "loss": 0.0111,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5296732.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027385836001485586,
+ "skip_count": 2.0,
+ "step": 3284,
+ "text_loss": 0.5244000554084778
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.427355444672733,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.0008254589105095473,
+ "loss": 0.0061,
+ "macro_f1": 1.0,
+ "num_tokens": 5299926.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007451715879142284,
+ "skip_count": 1.0,
+ "step": 3286,
+ "text_loss": 0.28979742527008057
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0008252238792589711,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5303006.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004805843345820904,
+ "skip_count": 2.0,
+ "step": 3288,
+ "text_loss": 0.5131978392601013
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.446140299383622,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.000824988723381991,
+ "loss": 0.0091,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5306953.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010639613494277,
+ "skip_count": 1.0,
+ "step": 3290,
+ "text_loss": 0.4901447296142578
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 15.455532726739067,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.0008247534429687191,
+ "loss": 0.007,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 5310516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013625577092170715,
+ "skip_count": 2.0,
+ "step": 3292,
+ "text_loss": 0.2124534696340561
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0008245180381093152,
+ "loss": 0.0114,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5313959.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004958513658493757,
+ "skip_count": 1.0,
+ "step": 3294,
+ "text_loss": 0.46682238578796387
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0008242825088939867,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5316609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003962756600230932,
+ "skip_count": 0.0,
+ "step": 3296,
+ "text_loss": 0.7010108232498169
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.483710008805401,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0008240468554129892,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5319638.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006996620795689523,
+ "skip_count": 0.0,
+ "step": 3298,
+ "text_loss": 0.4966355860233307
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.493102436160845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0008238110777566255,
+ "loss": 0.0101,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5323019.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016031896229833364,
+ "skip_count": 0.0,
+ "step": 3300,
+ "text_loss": 0.38668957352638245
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0008235751760152459,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 5326099.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00344281829893589,
+ "skip_count": 2.0,
+ "step": 3302,
+ "text_loss": 0.5330720543861389
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.511887290871735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06005859375,
+ "learning_rate": 0.0008233391502792484,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5328993.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007886730134487152,
+ "skip_count": 1.0,
+ "step": 3304,
+ "text_loss": 0.5470269322395325
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.521279718227179,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0008231030006390786,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5331554.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008180000819265842,
+ "skip_count": 1.0,
+ "step": 3306,
+ "text_loss": 0.4023340344429016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.0008228667271852294,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5335712.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002942821884062141,
+ "skip_count": 0.0,
+ "step": 3308,
+ "text_loss": 0.5306711792945862
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.0008226303300082414,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5338701.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006134595023468137,
+ "skip_count": 0.0,
+ "step": 3310,
+ "text_loss": 0.5906263589859009
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.549457000293513,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0008223938091987022,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5342274.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016656654188409448,
+ "skip_count": 0.0,
+ "step": 3312,
+ "text_loss": 0.5201764106750488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.558849427648958,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.0008221571648472472,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5345185.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038612703792750835,
+ "skip_count": 0.0,
+ "step": 3314,
+ "text_loss": 0.36633720993995667
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.568241855004402,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0008219203970445589,
+ "loss": 0.011,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5348804.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009782899171113968,
+ "skip_count": 1.0,
+ "step": 3316,
+ "text_loss": 0.3117460012435913
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.577634282359847,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055908203125,
+ "learning_rate": 0.0008216835058813672,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5351896.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007713229861110449,
+ "skip_count": 0.0,
+ "step": 3318,
+ "text_loss": 0.253496378660202
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0008214464914484492,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5355058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006227815989404917,
+ "skip_count": 2.0,
+ "step": 3320,
+ "text_loss": 0.32693132758140564
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0008212093538366292,
+ "loss": 0.0099,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5358365.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002601418411359191,
+ "skip_count": 0.0,
+ "step": 3322,
+ "text_loss": 0.40394455194473267
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 15.605811564426181,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.000820972093136779,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5360981.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005545300897210836,
+ "skip_count": 3.0,
+ "step": 3324,
+ "text_loss": 0.6758295893669128
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 15.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.0008207347094398172,
+ "loss": 0.0096,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5364018.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001924700103700161,
+ "skip_count": 0.0,
+ "step": 3326,
+ "text_loss": 0.5196860432624817
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0008204972028367097,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5366986.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012254828587174416,
+ "skip_count": 1.0,
+ "step": 3328,
+ "text_loss": 0.24661913514137268
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.633988846492516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0008202595734184694,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5371463.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005094083491712809,
+ "skip_count": 0.0,
+ "step": 3330,
+ "text_loss": 0.2525769770145416
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 15.643381273847961,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0008200218212761566,
+ "loss": 0.0108,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5374823.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0025883198250085115,
+ "skip_count": 0.0,
+ "step": 3332,
+ "text_loss": 0.21849912405014038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.000819783946500878,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5377640.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008240507915616035,
+ "skip_count": 0.0,
+ "step": 3334,
+ "text_loss": 0.2662734091281891
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 15.66216612855885,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.000819545949183788,
+ "loss": 0.01,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 5380593.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.038378193974494934,
+ "skip_count": 3.0,
+ "step": 3336,
+ "text_loss": 0.2431795746088028
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.671558555914293,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0008193078294160874,
+ "loss": 0.0097,
+ "macro_f1": 1.0,
+ "num_tokens": 5384487.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005926199723035097,
+ "skip_count": 1.0,
+ "step": 3338,
+ "text_loss": 0.5663705468177795
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.680950983269739,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0008190695872890242,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5387511.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010842559859156609,
+ "skip_count": 2.0,
+ "step": 3340,
+ "text_loss": 0.11517292261123657
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.690343410625184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.0008188312228938933,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5390698.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001304097007960081,
+ "skip_count": 0.0,
+ "step": 3342,
+ "text_loss": 0.4827076196670532
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 15.699735837980628,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0008185927363220363,
+ "loss": 0.0087,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5393778.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005354117136448622,
+ "skip_count": 0.0,
+ "step": 3344,
+ "text_loss": 0.44467049837112427
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0008183541276648418,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5396925.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004800073802471161,
+ "skip_count": 2.0,
+ "step": 3346,
+ "text_loss": 0.2032834142446518
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.718520692691518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0008181153970137449,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5400522.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021674633026123047,
+ "skip_count": 0.0,
+ "step": 3348,
+ "text_loss": 0.4507528841495514
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.727913120046962,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0008178765444602278,
+ "loss": 0.0117,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 5403526.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04263930395245552,
+ "skip_count": 2.0,
+ "step": 3350,
+ "text_loss": 0.3606615960597992
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 15.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0008176375700958194,
+ "loss": 0.0087,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5407127.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006953123956918716,
+ "skip_count": 0.0,
+ "step": 3352,
+ "text_loss": 0.2290353775024414
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0008173984740120948,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5410829.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014363783411681652,
+ "skip_count": 0.0,
+ "step": 3354,
+ "text_loss": 0.4220392405986786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.756090402113296,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0008171592563006762,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5414152.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00202389364130795,
+ "skip_count": 1.0,
+ "step": 3356,
+ "text_loss": 0.37729766964912415
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.765482829468741,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0008169199170532323,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5417312.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006253739818930626,
+ "skip_count": 2.0,
+ "step": 3358,
+ "text_loss": 0.1304289996623993
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0008166804563614785,
+ "loss": 0.0084,
+ "macro_f1": 1.0,
+ "num_tokens": 5421227.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01622140221297741,
+ "skip_count": 2.0,
+ "step": 3360,
+ "text_loss": 0.298664391040802
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0008164408743171763,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 5424646.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037176944315433502,
+ "skip_count": 2.0,
+ "step": 3362,
+ "text_loss": 0.12147632241249084
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 0.0008162011710121339,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5427897.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020403533708304167,
+ "skip_count": 1.0,
+ "step": 3364,
+ "text_loss": 0.2656533420085907
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.803052538890519,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0008159613465382066,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5430474.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018634048756211996,
+ "skip_count": 0.0,
+ "step": 3366,
+ "text_loss": 0.9133086204528809
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.812444966245964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0008157214009872951,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5433113.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012944488786160946,
+ "skip_count": 2.0,
+ "step": 3368,
+ "text_loss": 0.24352453649044037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05712890625,
+ "learning_rate": 0.0008154813344513472,
+ "loss": 0.0143,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5436259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002347963862121105,
+ "skip_count": 2.0,
+ "step": 3370,
+ "text_loss": 0.7601244449615479
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0008152411470223568,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5439126.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016609140438959002,
+ "skip_count": 0.0,
+ "step": 3372,
+ "text_loss": 0.5551947355270386
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.840622248312298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0008150008387923643,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5442739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008321396075189114,
+ "skip_count": 0.0,
+ "step": 3374,
+ "text_loss": 0.25028282403945923
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 15.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08544921875,
+ "learning_rate": 0.000814760409853456,
+ "loss": 0.0105,
+ "macro_f1": 1.0,
+ "num_tokens": 5445247.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009738070890307426,
+ "skip_count": 1.0,
+ "step": 3376,
+ "text_loss": 0.37271201610565186
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0008145198602977651,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5449044.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028421466704458,
+ "skip_count": 0.0,
+ "step": 3378,
+ "text_loss": 0.1458655595779419
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.868799530378633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11474609375,
+ "learning_rate": 0.0008142791902174701,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5453063.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015170135302469134,
+ "skip_count": 0.0,
+ "step": 3380,
+ "text_loss": 0.5548722743988037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.878191957734076,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0008140383997047966,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5455814.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022444510832428932,
+ "skip_count": 1.0,
+ "step": 3382,
+ "text_loss": 0.8034513592720032
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.887584385089522,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.000813797488852016,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5459392.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00038578867679461837,
+ "skip_count": 0.0,
+ "step": 3384,
+ "text_loss": 0.6940088868141174
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.896976812444967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0008135564577514458,
+ "loss": 0.011,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5462413.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019727381877601147,
+ "skip_count": 0.0,
+ "step": 3386,
+ "text_loss": 0.5124650597572327
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.099609375,
+ "learning_rate": 0.0008133153064954495,
+ "loss": 0.0107,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5465552.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019896167796105146,
+ "skip_count": 0.0,
+ "step": 3388,
+ "text_loss": 0.4292517900466919
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0008130740351764367,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 5468573.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030118159484118223,
+ "skip_count": 1.0,
+ "step": 3390,
+ "text_loss": 0.48903173208236694
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.925154094511301,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.000812832643886863,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5471547.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005084246397018433,
+ "skip_count": 2.0,
+ "step": 3392,
+ "text_loss": 0.35789889097213745
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.934546521866745,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0008125911327192299,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5474331.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008874498889781535,
+ "skip_count": 0.0,
+ "step": 3394,
+ "text_loss": 0.6267408728599548
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0008123495017660851,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5477633.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001794386887922883,
+ "skip_count": 0.0,
+ "step": 3396,
+ "text_loss": 0.3701885938644409
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0008121077511200221,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5481277.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002140481723472476,
+ "skip_count": 0.0,
+ "step": 3398,
+ "text_loss": 0.6362857818603516
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.962723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0556640625,
+ "learning_rate": 0.00081186588087368,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5484237.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000867189432028681,
+ "skip_count": 0.0,
+ "step": 3400,
+ "text_loss": 1.0847382545471191
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.0008116238911197442,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5487423.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029817656613886356,
+ "skip_count": 0.0,
+ "step": 3402,
+ "text_loss": 0.3813740313053131
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0008113817819509454,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5490155.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035141287371516228,
+ "skip_count": 0.0,
+ "step": 3404,
+ "text_loss": 0.2113083451986313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.990901085999413,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0008111395534600603,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5493415.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003317659953609109,
+ "skip_count": 0.0,
+ "step": 3406,
+ "text_loss": 0.5869330167770386
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.0008108972057399114,
+ "loss": 0.0123,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5496032.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003833734430372715,
+ "skip_count": 2.0,
+ "step": 3408,
+ "text_loss": 0.2938928008079529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11328125,
+ "learning_rate": 0.0008106547388833669,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5498890.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002622978063300252,
+ "skip_count": 1.0,
+ "step": 3410,
+ "text_loss": 0.3130980432033539
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0008104121529833402,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5502010.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007447598036378622,
+ "skip_count": 0.0,
+ "step": 3412,
+ "text_loss": 0.4413072466850281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.000810169448132791,
+ "loss": 0.0093,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5505212.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031087708193808794,
+ "skip_count": 1.0,
+ "step": 3414,
+ "text_loss": 0.2910428047180176
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.037569709421778,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0008099266244247243,
+ "loss": 0.0082,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5508755.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02510393038392067,
+ "skip_count": 1.0,
+ "step": 3416,
+ "text_loss": 0.33022749423980713
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0008096836819521903,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5512034.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020537273958325386,
+ "skip_count": 1.0,
+ "step": 3418,
+ "text_loss": 0.4731218218803406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0008094406208082853,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5515707.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004218162503093481,
+ "skip_count": 2.0,
+ "step": 3420,
+ "text_loss": 0.23429590463638306
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 16.065746991488112,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0869140625,
+ "learning_rate": 0.0008091974410861507,
+ "loss": 0.0069,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 5518436.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013488355092704296,
+ "skip_count": 3.0,
+ "step": 3422,
+ "text_loss": 0.45768749713897705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.07513941884356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0008089541428789733,
+ "loss": 0.0097,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5522368.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010335417464375496,
+ "skip_count": 1.0,
+ "step": 3424,
+ "text_loss": 0.43423423171043396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0008087107262799855,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5526061.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002134323585778475,
+ "skip_count": 0.0,
+ "step": 3426,
+ "text_loss": 0.4031757414340973
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1318359375,
+ "learning_rate": 0.0008084671913824651,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5529284.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0097216060385108,
+ "skip_count": 2.0,
+ "step": 3428,
+ "text_loss": 0.2836039960384369
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.000808223538279735,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5532159.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001684269867837429,
+ "skip_count": 0.0,
+ "step": 3430,
+ "text_loss": 0.5804527401924133
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0008079797670651637,
+ "loss": 0.008,
+ "macro_f1": 1.0,
+ "num_tokens": 5536050.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013918434269726276,
+ "skip_count": 1.0,
+ "step": 3432,
+ "text_loss": 0.31325826048851013
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0008077358778321647,
+ "loss": 0.011,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5538885.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007751787197776139,
+ "skip_count": 0.0,
+ "step": 3434,
+ "text_loss": 0.783108115196228
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.131493982976224,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0008074918706741966,
+ "loss": 0.0063,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 5541909.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.021819550544023514,
+ "skip_count": 2.0,
+ "step": 3436,
+ "text_loss": 0.6558083295822144
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.14088641033167,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0008072477456847638,
+ "loss": 0.0057,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5545101.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03309348225593567,
+ "skip_count": 0.0,
+ "step": 3438,
+ "text_loss": 0.9877075552940369
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.0008070035029574151,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 5548971.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008696741424500942,
+ "skip_count": 1.0,
+ "step": 3440,
+ "text_loss": 0.24766330420970917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 16.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.000806759142585745,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5552174.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004240929149091244,
+ "skip_count": 3.0,
+ "step": 3442,
+ "text_loss": 0.37255001068115234
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05322265625,
+ "learning_rate": 0.0008065146646633927,
+ "loss": 0.0088,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5555005.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014345484785735607,
+ "skip_count": 1.0,
+ "step": 3444,
+ "text_loss": 0.26157206296920776
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06005859375,
+ "learning_rate": 0.0008062700692840428,
+ "loss": 0.0083,
+ "macro_f1": 1.0,
+ "num_tokens": 5559127.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008315163664519787,
+ "skip_count": 2.0,
+ "step": 3446,
+ "text_loss": 0.21971040964126587
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 16.187848547108892,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 0.0008060253565414246,
+ "loss": 0.009,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 5562254.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009582413360476494,
+ "skip_count": 3.0,
+ "step": 3448,
+ "text_loss": 0.6758295893669128
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.19724097446434,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0008057805265293124,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5565515.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002429503947496414,
+ "skip_count": 0.0,
+ "step": 3450,
+ "text_loss": 0.696592390537262
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0008055355793415257,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5568392.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007724192109890282,
+ "skip_count": 0.0,
+ "step": 3452,
+ "text_loss": 0.7092870473861694
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.216025829175226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0008052905150719285,
+ "loss": 0.0099,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5571090.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010859938338398933,
+ "skip_count": 0.0,
+ "step": 3454,
+ "text_loss": 0.6593860387802124
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0008050453338144301,
+ "loss": 0.0072,
+ "macro_f1": 1.0,
+ "num_tokens": 5574552.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030258705373853445,
+ "skip_count": 1.0,
+ "step": 3456,
+ "text_loss": 0.3479384481906891
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0008048000356629844,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5577484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005052885971963406,
+ "skip_count": 2.0,
+ "step": 3458,
+ "text_loss": 0.21858671307563782
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0008045546207115901,
+ "loss": 0.0068,
+ "macro_f1": 1.0,
+ "num_tokens": 5581605.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009976249188184738,
+ "skip_count": 3.0,
+ "step": 3460,
+ "text_loss": 0.16868001222610474
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0008043090890542904,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5584994.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00270817126147449,
+ "skip_count": 0.0,
+ "step": 3462,
+ "text_loss": 0.785690426826477
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0008040634407851739,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5588067.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018436965765431523,
+ "skip_count": 0.0,
+ "step": 3464,
+ "text_loss": 0.5006644129753113
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0008038176759983731,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5590789.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008516279980540276,
+ "skip_count": 2.0,
+ "step": 3466,
+ "text_loss": 0.20963478088378906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0008035717947880659,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5593472.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016293043736368418,
+ "skip_count": 0.0,
+ "step": 3468,
+ "text_loss": 0.7376078963279724
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0008033257972484742,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5596108.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002364142332226038,
+ "skip_count": 0.0,
+ "step": 3470,
+ "text_loss": 0.5156455039978027
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0008030796834738649,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5599103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008872323669493198,
+ "skip_count": 0.0,
+ "step": 3472,
+ "text_loss": 0.2996419668197632
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 16.309950102729672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043701171875,
+ "learning_rate": 0.0008028334535585491,
+ "loss": 0.0087,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5602410.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011508257128298283,
+ "skip_count": 3.0,
+ "step": 3474,
+ "text_loss": 0.25438693165779114
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0008025871075968827,
+ "loss": 0.0106,
+ "macro_f1": 1.0,
+ "num_tokens": 5605424.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.017225435003638268,
+ "skip_count": 2.0,
+ "step": 3476,
+ "text_loss": 0.2549574077129364
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.328734957440563,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0008023406456832657,
+ "loss": 0.0111,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 5608266.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.039165645837783813,
+ "skip_count": 2.0,
+ "step": 3478,
+ "text_loss": 0.1797947734594345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0008020940679121429,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5611471.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009718866203911602,
+ "skip_count": 0.0,
+ "step": 3480,
+ "text_loss": 0.8267702460289001
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0008018473743780036,
+ "loss": 0.0093,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5615046.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006087122485041618,
+ "skip_count": 2.0,
+ "step": 3482,
+ "text_loss": 0.7267677187919617
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.000801600565175381,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5618350.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007539413054473698,
+ "skip_count": 0.0,
+ "step": 3484,
+ "text_loss": 0.5910211801528931
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.0008013536403988529,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5621381.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008076327503658831,
+ "skip_count": 0.0,
+ "step": 3486,
+ "text_loss": 0.30616798996925354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 16.375697094217788,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0008011066001430412,
+ "loss": 0.0086,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 5624617.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023835813626646996,
+ "skip_count": 4.0,
+ "step": 3488,
+ "text_loss": 0.3376443088054657
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.38508952157323,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0008008594445026122,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5627989.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004226419143378735,
+ "skip_count": 2.0,
+ "step": 3490,
+ "text_loss": 0.8185343146324158
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.394481948928675,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0008006121735722767,
+ "loss": 0.0084,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 5632286.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0366671048104763,
+ "skip_count": 2.0,
+ "step": 3492,
+ "text_loss": 0.2209547609090805
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.403874376284122,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0008003647874467892,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5635368.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012956378981471062,
+ "skip_count": 0.0,
+ "step": 3494,
+ "text_loss": 0.20468664169311523
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.059814453125,
+ "learning_rate": 0.0008001172862209485,
+ "loss": 0.0103,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5638440.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017375422175973654,
+ "skip_count": 0.0,
+ "step": 3496,
+ "text_loss": 0.6647221446037292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 16.42265923099501,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.0007998696699895976,
+ "loss": 0.0091,
+ "macro_f1": 0.6592592597007751,
+ "num_tokens": 5641996.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.025240756571292877,
+ "skip_count": 5.0,
+ "step": 3498,
+ "text_loss": 0.23892143368721008
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.432051658350456,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.0007996219388476236,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5645071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007436830550432205,
+ "skip_count": 1.0,
+ "step": 3500,
+ "text_loss": 0.7580804228782654
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.0007993740928899571,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5648175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001126602990552783,
+ "skip_count": 0.0,
+ "step": 3502,
+ "text_loss": 0.5281378626823425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0007991261322115737,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5650973.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007907263352535665,
+ "skip_count": 0.0,
+ "step": 3504,
+ "text_loss": 0.25220927596092224
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.000798878056907492,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 5654252.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006263538729399443,
+ "skip_count": 2.0,
+ "step": 3506,
+ "text_loss": 0.46569153666496277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 16.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0007986298670727752,
+ "loss": 0.0098,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5657229.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004049144219607115,
+ "skip_count": 3.0,
+ "step": 3508,
+ "text_loss": 0.15174436569213867
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 28.0,
+ "epoch": 16.479013795127678,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0791015625,
+ "learning_rate": 0.0007983815628025301,
+ "loss": 0.0074,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 5659974.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0471976138651371,
+ "skip_count": 3.0,
+ "step": 3510,
+ "text_loss": 0.39072203636169434
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.488406222483125,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.000798133144191907,
+ "loss": 0.0082,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5662893.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04030488431453705,
+ "skip_count": 1.0,
+ "step": 3512,
+ "text_loss": 0.3562147617340088
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0595703125,
+ "learning_rate": 0.0007978846113361009,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5666476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007475079502910376,
+ "skip_count": 1.0,
+ "step": 3514,
+ "text_loss": 0.26518192887306213
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.0007976359643303497,
+ "loss": 0.013,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5669647.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00558585487306118,
+ "skip_count": 2.0,
+ "step": 3516,
+ "text_loss": 0.29284560680389404
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0007973872032699354,
+ "loss": 0.0082,
+ "macro_f1": 1.0,
+ "num_tokens": 5673491.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0026981087867170572,
+ "skip_count": 1.0,
+ "step": 3518,
+ "text_loss": 0.35089045763015747
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.000797138328250184,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5676529.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0027328627184033394,
+ "skip_count": 0.0,
+ "step": 3520,
+ "text_loss": 0.41077399253845215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 16.535368359260346,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0007968893393664646,
+ "loss": 0.01,
+ "macro_f1": 0.6592592597007751,
+ "num_tokens": 5679987.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02695014327764511,
+ "skip_count": 5.0,
+ "step": 3522,
+ "text_loss": 0.44942837953567505
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007966402367141903,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5683185.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00817026849836111,
+ "skip_count": 2.0,
+ "step": 3524,
+ "text_loss": 0.14528048038482666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.0007963910203888176,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5686544.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021973433904349804,
+ "skip_count": 0.0,
+ "step": 3526,
+ "text_loss": 0.22358648478984833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.56354564132668,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0007961416904858469,
+ "loss": 0.0078,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5689579.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.033712416887283325,
+ "skip_count": 1.0,
+ "step": 3528,
+ "text_loss": 0.3083649277687073
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0007958922471008217,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5692869.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011182719841599464,
+ "skip_count": 2.0,
+ "step": 3530,
+ "text_loss": 0.21288011968135834
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0007956426903293292,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5696007.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015808293828740716,
+ "skip_count": 0.0,
+ "step": 3532,
+ "text_loss": 0.6068631410598755
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.591722923393014,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052734375,
+ "learning_rate": 0.0007953930202670001,
+ "loss": 0.0062,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 5699474.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03205178305506706,
+ "skip_count": 0.0,
+ "step": 3534,
+ "text_loss": 0.4317135512828827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.601115350748458,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0007951432370095084,
+ "loss": 0.0105,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5703483.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003518853336572647,
+ "skip_count": 0.0,
+ "step": 3536,
+ "text_loss": 0.5432273149490356
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.610507778103905,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11083984375,
+ "learning_rate": 0.0007948933406525715,
+ "loss": 0.01,
+ "macro_f1": 1.0,
+ "num_tokens": 5707301.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004982157610356808,
+ "skip_count": 1.0,
+ "step": 3538,
+ "text_loss": 0.40061065554618835
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.61990020545935,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0007946433312919502,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5710847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003067734418436885,
+ "skip_count": 0.0,
+ "step": 3540,
+ "text_loss": 0.5396234393119812
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 16.629292632814792,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0007943932090234486,
+ "loss": 0.0097,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 5713683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03728383034467697,
+ "skip_count": 2.0,
+ "step": 3542,
+ "text_loss": 0.18310914933681488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 16.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0007941429739429138,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5716397.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025092530995607376,
+ "skip_count": 3.0,
+ "step": 3544,
+ "text_loss": 0.5806207060813904
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0007938926261462366,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5719984.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002493767999112606,
+ "skip_count": 0.0,
+ "step": 3546,
+ "text_loss": 0.38606807589530945
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 16.657469914881126,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.0007936421657293507,
+ "loss": 0.0094,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 5723571.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.014810923486948013,
+ "skip_count": 2.0,
+ "step": 3548,
+ "text_loss": 0.49558472633361816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.666862342236573,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.0007933915927882327,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5726405.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00152928801253438,
+ "skip_count": 0.0,
+ "step": 3550,
+ "text_loss": 0.8674797415733337
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.000793140907418903,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5729955.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005522782914340496,
+ "skip_count": 2.0,
+ "step": 3552,
+ "text_loss": 0.3274473249912262
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0007928901097174248,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5733030.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009207013063132763,
+ "skip_count": 2.0,
+ "step": 3554,
+ "text_loss": 0.18237128853797913
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0007926391997799039,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5735978.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00695531303063035,
+ "skip_count": 0.0,
+ "step": 3556,
+ "text_loss": 0.3266434967517853
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0007923881777024898,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5738901.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002743212040513754,
+ "skip_count": 1.0,
+ "step": 3558,
+ "text_loss": 0.4971913695335388
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.0007921370435813741,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5741946.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007037297356873751,
+ "skip_count": 0.0,
+ "step": 3560,
+ "text_loss": 0.5645473599433899
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0007918857975127924,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5744987.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030746585689485073,
+ "skip_count": 0.0,
+ "step": 3562,
+ "text_loss": 0.17717665433883667
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0007916344395930224,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5747837.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004522138275206089,
+ "skip_count": 0.0,
+ "step": 3564,
+ "text_loss": 0.7676118612289429
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.000791382969918385,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5750716.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026240211445838213,
+ "skip_count": 0.0,
+ "step": 3566,
+ "text_loss": 0.4975173771381378
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.751394188435572,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.000791131388585244,
+ "loss": 0.011,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 5754368.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.021831991150975227,
+ "skip_count": 2.0,
+ "step": 3568,
+ "text_loss": 0.9670342206954956
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0007908796956900055,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5757076.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017586691537871957,
+ "skip_count": 0.0,
+ "step": 3570,
+ "text_loss": 0.3057977259159088
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.000790627891329119,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5760613.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005515786819159985,
+ "skip_count": 0.0,
+ "step": 3572,
+ "text_loss": 0.5860086679458618
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0007903759755990763,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5763557.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004096484277397394,
+ "skip_count": 0.0,
+ "step": 3574,
+ "text_loss": 0.17175781726837158
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.000790123948596412,
+ "loss": 0.0119,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5767430.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005216122139245272,
+ "skip_count": 0.0,
+ "step": 3576,
+ "text_loss": 0.7520374059677124
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07177734375,
+ "learning_rate": 0.0007898718104177031,
+ "loss": 0.0108,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5770175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037980107590556145,
+ "skip_count": 0.0,
+ "step": 3578,
+ "text_loss": 0.18117885291576385
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0007896195611595699,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5773032.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003672175807878375,
+ "skip_count": 2.0,
+ "step": 3580,
+ "text_loss": 0.7241058349609375
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.817141179923688,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0007893672009186744,
+ "loss": 0.0083,
+ "macro_f1": 1.0,
+ "num_tokens": 5776077.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01229850109666586,
+ "skip_count": 3.0,
+ "step": 3582,
+ "text_loss": 0.29140418767929077
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0007891147297917216,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5779088.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0035251814406365156,
+ "skip_count": 0.0,
+ "step": 3584,
+ "text_loss": 0.1727485954761505
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.055908203125,
+ "learning_rate": 0.000788862147875459,
+ "loss": 0.0094,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5782201.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004725661128759384,
+ "skip_count": 2.0,
+ "step": 3586,
+ "text_loss": 0.43512848019599915
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0007886094552666765,
+ "loss": 0.0106,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5785039.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005632172804325819,
+ "skip_count": 0.0,
+ "step": 3588,
+ "text_loss": 0.3534786105155945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0556640625,
+ "learning_rate": 0.0007883566520622062,
+ "loss": 0.0109,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5788017.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006249965168535709,
+ "skip_count": 1.0,
+ "step": 3590,
+ "text_loss": 0.2089710384607315
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0007881037383589229,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5791168.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013797614956274629,
+ "skip_count": 0.0,
+ "step": 3592,
+ "text_loss": 0.4349329471588135
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06982421875,
+ "learning_rate": 0.0007878507142537436,
+ "loss": 0.0091,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5793927.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019719740375876427,
+ "skip_count": 1.0,
+ "step": 3594,
+ "text_loss": 0.6087368726730347
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0007875975798436274,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5797214.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037070370744913816,
+ "skip_count": 0.0,
+ "step": 3596,
+ "text_loss": 0.4258122444152832
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048583984375,
+ "learning_rate": 0.0007873443352255764,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5800691.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008431311696767807,
+ "skip_count": 0.0,
+ "step": 3598,
+ "text_loss": 0.6006711721420288
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.0007870909804966337,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5804712.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017720256000757217,
+ "skip_count": 0.0,
+ "step": 3600,
+ "text_loss": 0.6055042743682861
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.911065453478134,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.0007868375157538861,
+ "loss": 0.0083,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5807670.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010697763413190842,
+ "skip_count": 0.0,
+ "step": 3602,
+ "text_loss": 0.8039056658744812
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0007865839410944611,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5810880.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030022128485143185,
+ "skip_count": 0.0,
+ "step": 3604,
+ "text_loss": 0.596110463142395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0007863302566155295,
+ "loss": 0.0093,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5814171.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006257854867726564,
+ "skip_count": 2.0,
+ "step": 3606,
+ "text_loss": 0.5700319409370422
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.0007860764624143031,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5817607.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004838473163545132,
+ "skip_count": 0.0,
+ "step": 3608,
+ "text_loss": 0.8319530487060547
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 16.94863516289991,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.08154296875,
+ "learning_rate": 0.0007858225585880369,
+ "loss": 0.0067,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 5821452.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02173662930727005,
+ "skip_count": 2.0,
+ "step": 3610,
+ "text_loss": 0.3738477826118469
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007855685452340269,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5824683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032719180453568697,
+ "skip_count": 0.0,
+ "step": 3612,
+ "text_loss": 0.4054839015007019
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.967420017610802,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0007853144224496118,
+ "loss": 0.0093,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5827860.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.032171256840229034,
+ "skip_count": 0.0,
+ "step": 3614,
+ "text_loss": 0.18112395703792572
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 0.0007850601903321716,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5831651.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013230946846306324,
+ "skip_count": 1.0,
+ "step": 3616,
+ "text_loss": 0.2698844075202942
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.000784805848979129,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5834369.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00162619655020535,
+ "skip_count": 0.0,
+ "step": 3618,
+ "text_loss": 0.2430931180715561
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.995597299677137,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.0007845513984879477,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5838102.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002781603019684553,
+ "skip_count": 0.0,
+ "step": 3620,
+ "text_loss": 0.4968300759792328
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0007842968389561337,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5841029.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023873315658420324,
+ "skip_count": 0.0,
+ "step": 3622,
+ "text_loss": 0.5842974781990051
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.014088641033165,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0007840421704812346,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5845158.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00400173757225275,
+ "skip_count": 1.0,
+ "step": 3624,
+ "text_loss": 0.8312450647354126
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.00078378739316084,
+ "loss": 0.0094,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5849175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004974664188921452,
+ "skip_count": 0.0,
+ "step": 3626,
+ "text_loss": 0.48637253046035767
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 25.0,
+ "epoch": 17.032873495744056,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.10693359375,
+ "learning_rate": 0.000783532507092581,
+ "loss": 0.0079,
+ "macro_f1": 0.9555556178092957,
+ "num_tokens": 5852020.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02555239573121071,
+ "skip_count": 5.0,
+ "step": 3628,
+ "text_loss": 0.5407033562660217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0007832775123741306,
+ "loss": 0.0106,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5854873.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025962977670133114,
+ "skip_count": 0.0,
+ "step": 3630,
+ "text_loss": 0.618230938911438
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.051658350454947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.000783022409103203,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5858086.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029271875973790884,
+ "skip_count": 0.0,
+ "step": 3632,
+ "text_loss": 0.21259798109531403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0007827671973775542,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5860886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004102068953216076,
+ "skip_count": 0.0,
+ "step": 3634,
+ "text_loss": 0.4991208016872406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.070443205165834,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0007825118772949819,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5864291.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023497689981013536,
+ "skip_count": 1.0,
+ "step": 3636,
+ "text_loss": 0.3878401517868042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.0007822564489533255,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5867155.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007680345326662064,
+ "skip_count": 2.0,
+ "step": 3638,
+ "text_loss": 0.6132124066352844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.053466796875,
+ "learning_rate": 0.0007820009124504653,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5870325.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008242831099778414,
+ "skip_count": 0.0,
+ "step": 3640,
+ "text_loss": 0.3552473187446594
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.098620487232168,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0007817452678843236,
+ "loss": 0.0073,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 5873301.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023831043392419815,
+ "skip_count": 2.0,
+ "step": 3642,
+ "text_loss": 0.18363867700099945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.108012914587615,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.0007814895153528635,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5876225.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001999989850446582,
+ "skip_count": 0.0,
+ "step": 3644,
+ "text_loss": 0.17581747472286224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.11740534194306,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0007812336549540903,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5879501.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001098626758903265,
+ "skip_count": 0.0,
+ "step": 3646,
+ "text_loss": 0.5040884613990784
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.126797769298502,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0007809776867860499,
+ "loss": 0.005,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5882608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012210183776915073,
+ "skip_count": 1.0,
+ "step": 3648,
+ "text_loss": 0.27114811539649963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00078072161094683,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5886106.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005191771313548088,
+ "skip_count": 2.0,
+ "step": 3650,
+ "text_loss": 0.5167917609214783
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0007804654275345591,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5889122.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016411367105320096,
+ "skip_count": 1.0,
+ "step": 3652,
+ "text_loss": 0.7691274285316467
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 17.154975051364836,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0007802091366474074,
+ "loss": 0.005,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 5892313.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.015627093613147736,
+ "skip_count": 1.0,
+ "step": 3654,
+ "text_loss": 0.4646325409412384
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0007799527383835858,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5895577.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009879748104140162,
+ "skip_count": 0.0,
+ "step": 3656,
+ "text_loss": 0.5587969422340393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0986328125,
+ "learning_rate": 0.0007796962328413469,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5898546.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004864919930696487,
+ "skip_count": 0.0,
+ "step": 3658,
+ "text_loss": 0.6981375813484192
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0007794396201189839,
+ "loss": 0.0078,
+ "macro_f1": 1.0,
+ "num_tokens": 5901618.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006617432460188866,
+ "skip_count": 2.0,
+ "step": 3660,
+ "text_loss": 0.22521957755088806
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.192544760786618,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0007791829003148312,
+ "loss": 0.0098,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 5904540.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0782252699136734,
+ "skip_count": 2.0,
+ "step": 3662,
+ "text_loss": 0.2649642825126648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 0.0007789260735272647,
+ "loss": 0.0114,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5907827.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012057392159476876,
+ "skip_count": 0.0,
+ "step": 3664,
+ "text_loss": 0.6943771243095398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 0.0007786691398547005,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5911163.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007476957980543375,
+ "skip_count": 2.0,
+ "step": 3666,
+ "text_loss": 0.1502683162689209
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 17.220722042852948,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0007784120993955962,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5913948.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004082011990249157,
+ "skip_count": 0.0,
+ "step": 3668,
+ "text_loss": 0.4127517640590668
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 17.230114470208395,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0007781549522484503,
+ "loss": 0.0066,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 5917360.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.027505695819854736,
+ "skip_count": 1.0,
+ "step": 3670,
+ "text_loss": 0.23892618715763092
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0007778976985118018,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5920524.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024977331049740314,
+ "skip_count": 2.0,
+ "step": 3672,
+ "text_loss": 0.5076471567153931
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.248899324919282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0007776403382842312,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5923632.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015700991498306394,
+ "skip_count": 0.0,
+ "step": 3674,
+ "text_loss": 0.6287924647331238
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.25829175227473,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05810546875,
+ "learning_rate": 0.0007773828716643591,
+ "loss": 0.0085,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5926438.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05108916014432907,
+ "skip_count": 0.0,
+ "step": 3676,
+ "text_loss": 0.26517006754875183
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0007771252987508474,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5930081.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003439917229115963,
+ "skip_count": 0.0,
+ "step": 3678,
+ "text_loss": 0.5189079642295837
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 17.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056884765625,
+ "learning_rate": 0.0007768676196423984,
+ "loss": 0.0064,
+ "macro_f1": 1.0,
+ "num_tokens": 5933463.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001935846172273159,
+ "skip_count": 1.0,
+ "step": 3680,
+ "text_loss": 0.6703575849533081
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 17.286469034341064,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007766098344377553,
+ "loss": 0.0082,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 5937098.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0384826585650444,
+ "skip_count": 2.0,
+ "step": 3682,
+ "text_loss": 0.6424444913864136
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.0007763519432357018,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5940436.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008654671837575734,
+ "skip_count": 0.0,
+ "step": 3684,
+ "text_loss": 0.4189988672733307
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.0007760939461350623,
+ "loss": 0.0111,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5943731.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007468715775758028,
+ "skip_count": 2.0,
+ "step": 3686,
+ "text_loss": 0.2875453233718872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.314646316407398,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0007758358432347019,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5946707.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001252831774763763,
+ "skip_count": 0.0,
+ "step": 3688,
+ "text_loss": 0.5093055367469788
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0007755776346335259,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5949833.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001680848654359579,
+ "skip_count": 0.0,
+ "step": 3690,
+ "text_loss": 0.4031114876270294
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.0007753193204304807,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5953095.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0047258250415325165,
+ "skip_count": 2.0,
+ "step": 3692,
+ "text_loss": 0.17632785439491272
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0007750609007245524,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 5955971.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.001980359200388193,
+ "skip_count": 4.0,
+ "step": 3694,
+ "text_loss": 0.3423727750778198
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0007748023756147679,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5958948.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00511702848598361,
+ "skip_count": 0.0,
+ "step": 3696,
+ "text_loss": 0.28279972076416016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0007745437452001949,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5961819.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005220443126745522,
+ "skip_count": 0.0,
+ "step": 3698,
+ "text_loss": 0.4793325662612915
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.371000880540066,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0007742850095799408,
+ "loss": 0.0084,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5964625.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06411020457744598,
+ "skip_count": 0.0,
+ "step": 3700,
+ "text_loss": 0.2825184464454651
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 17.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0007740261688531536,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5967134.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004408109001815319,
+ "skip_count": 3.0,
+ "step": 3702,
+ "text_loss": 0.690429151058197
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.0007737672231190215,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5969831.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006747521692886949,
+ "skip_count": 0.0,
+ "step": 3704,
+ "text_loss": 0.32556024193763733
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.399178162606397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0007735081724767732,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5973015.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020414739847183228,
+ "skip_count": 0.0,
+ "step": 3706,
+ "text_loss": 0.5876469612121582
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 17.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.072265625,
+ "learning_rate": 0.0007732490170256769,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5975778.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005610425490885973,
+ "skip_count": 0.0,
+ "step": 3708,
+ "text_loss": 0.2968577444553375
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0007729897568650422,
+ "loss": 0.0097,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5979115.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001248046406544745,
+ "skip_count": 0.0,
+ "step": 3710,
+ "text_loss": 0.626361608505249
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06787109375,
+ "learning_rate": 0.0007727303920942176,
+ "loss": 0.0102,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5982213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005791695322841406,
+ "skip_count": 2.0,
+ "step": 3712,
+ "text_loss": 0.4133484661579132
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 17.436747872028178,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.08740234375,
+ "learning_rate": 0.0007724709228125922,
+ "loss": 0.0105,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 5984930.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02114664763212204,
+ "skip_count": 2.0,
+ "step": 3714,
+ "text_loss": 0.4646461308002472
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 17.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0007722113491195952,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 5988017.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005913930479437113,
+ "skip_count": 5.0,
+ "step": 3716,
+ "text_loss": 0.15474505722522736
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0007719516711146957,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5991562.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0075925313867628574,
+ "skip_count": 2.0,
+ "step": 3718,
+ "text_loss": 0.5293686985969543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.464925154094512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.000771691888897403,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5994675.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012335237115621567,
+ "skip_count": 0.0,
+ "step": 3720,
+ "text_loss": 0.5210637450218201
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0771484375,
+ "learning_rate": 0.0007714320025672657,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5999070.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010582062415778637,
+ "skip_count": 2.0,
+ "step": 3722,
+ "text_loss": 0.2783571779727936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 17.4837100088054,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.000771172012223873,
+ "loss": 0.0078,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 6002702.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015008784830570221,
+ "skip_count": 3.0,
+ "step": 3724,
+ "text_loss": 0.358705073595047
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052734375,
+ "learning_rate": 0.0007709119179668538,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6005517.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00111615180503577,
+ "skip_count": 0.0,
+ "step": 3726,
+ "text_loss": 0.45202162861824036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 17.50249486351629,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0007706517198958764,
+ "loss": 0.0096,
+ "macro_f1": 0.6595745086669922,
+ "num_tokens": 6009111.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05215252563357353,
+ "skip_count": 4.0,
+ "step": 3728,
+ "text_loss": 0.20360413193702698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 17.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0007703914181106497,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6012989.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010039499960839748,
+ "skip_count": 3.0,
+ "step": 3730,
+ "text_loss": 0.20334361493587494
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.52127971822718,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.0007701310127109211,
+ "loss": 0.0062,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6016420.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01090205181390047,
+ "skip_count": 1.0,
+ "step": 3732,
+ "text_loss": 0.47959551215171814
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 24.0,
+ "epoch": 17.530672145582624,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0007698705037964791,
+ "loss": 0.0076,
+ "macro_f1": 0.6225374937057495,
+ "num_tokens": 6019551.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02677762135863304,
+ "skip_count": 5.0,
+ "step": 3734,
+ "text_loss": 0.2621438801288605
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 17.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056640625,
+ "learning_rate": 0.000769609891467151,
+ "loss": 0.0119,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6022262.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00460716662928462,
+ "skip_count": 0.0,
+ "step": 3736,
+ "text_loss": 0.3433022201061249
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0007693491758228037,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6025723.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036111194640398026,
+ "skip_count": 2.0,
+ "step": 3738,
+ "text_loss": 0.38703784346580505
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0007690883569633442,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6028652.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003299296135082841,
+ "skip_count": 0.0,
+ "step": 3740,
+ "text_loss": 0.24203069508075714
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0007688274349887188,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6032280.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003173880511894822,
+ "skip_count": 0.0,
+ "step": 3742,
+ "text_loss": 0.2827291488647461
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.57763428235985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0007685664099989131,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6035111.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008576177642680705,
+ "skip_count": 0.0,
+ "step": 3744,
+ "text_loss": 0.43613526225090027
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0007683052820939524,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6038428.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004335585981607437,
+ "skip_count": 2.0,
+ "step": 3746,
+ "text_loss": 1.0385624170303345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0007680440513739015,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6041185.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008210531086660922,
+ "skip_count": 0.0,
+ "step": 3748,
+ "text_loss": 0.7070431709289551
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 17.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056640625,
+ "learning_rate": 0.0007677827179388646,
+ "loss": 0.0089,
+ "macro_f1": 1.0,
+ "num_tokens": 6046333.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003778942162171006,
+ "skip_count": 1.0,
+ "step": 3750,
+ "text_loss": 0.3682238757610321
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 17.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08984375,
+ "learning_rate": 0.000767521281888985,
+ "loss": 0.009,
+ "macro_f1": 1.0,
+ "num_tokens": 6049528.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002767334459349513,
+ "skip_count": 1.0,
+ "step": 3752,
+ "text_loss": 0.7619418501853943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0007672597433244455,
+ "loss": 0.0108,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6053202.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004796457476913929,
+ "skip_count": 2.0,
+ "step": 3754,
+ "text_loss": 0.4157083034515381
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0007669981023454682,
+ "loss": 0.0126,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6056609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013067846884950995,
+ "skip_count": 0.0,
+ "step": 3756,
+ "text_loss": 0.4529118537902832
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0007667363590523142,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6060504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010285493917763233,
+ "skip_count": 0.0,
+ "step": 3758,
+ "text_loss": 0.8363246321678162
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 17.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.0007664745135452844,
+ "loss": 0.0092,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6063526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006289863493293524,
+ "skip_count": 3.0,
+ "step": 3760,
+ "text_loss": 0.5313657522201538
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.662166128558848,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05517578125,
+ "learning_rate": 0.0007662125659247183,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6067147.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028537956532090902,
+ "skip_count": 0.0,
+ "step": 3762,
+ "text_loss": 0.5668109059333801
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0007659505162909949,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6070350.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026814753655344248,
+ "skip_count": 0.0,
+ "step": 3764,
+ "text_loss": 0.4983512759208679
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056884765625,
+ "learning_rate": 0.0007656883647445318,
+ "loss": 0.0099,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6073091.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005981382913887501,
+ "skip_count": 1.0,
+ "step": 3766,
+ "text_loss": 0.30372318625450134
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0007654261113857863,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6076244.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000803640519734472,
+ "skip_count": 0.0,
+ "step": 3768,
+ "text_loss": 0.6100738048553467
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0007651637563152539,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6078936.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013324898900464177,
+ "skip_count": 0.0,
+ "step": 3770,
+ "text_loss": 0.4733821153640747
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 17.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0007649012996334701,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6081951.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0021543330512940884,
+ "skip_count": 0.0,
+ "step": 3772,
+ "text_loss": 0.6794875860214233
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0007646387414410085,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6085165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005426189745776355,
+ "skip_count": 0.0,
+ "step": 3774,
+ "text_loss": 0.5886107683181763
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0007643760818384819,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6088370.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002537576947361231,
+ "skip_count": 0.0,
+ "step": 3776,
+ "text_loss": 0.23591920733451843
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0007641133209265423,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6092319.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002613696036860347,
+ "skip_count": 0.0,
+ "step": 3778,
+ "text_loss": 0.3217754662036896
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 0.0007638504588058796,
+ "loss": 0.0105,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6095799.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007219464750960469,
+ "skip_count": 0.0,
+ "step": 3780,
+ "text_loss": 0.4276983141899109
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 17.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0007635874955772234,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6098789.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005965052172541618,
+ "skip_count": 3.0,
+ "step": 3782,
+ "text_loss": 0.30936646461486816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07177734375,
+ "learning_rate": 0.0007633244313413417,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6101631.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007469559786841273,
+ "skip_count": 0.0,
+ "step": 3784,
+ "text_loss": 0.44460123777389526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0007630612661990412,
+ "loss": 0.0097,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6105097.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004300760570913553,
+ "skip_count": 1.0,
+ "step": 3786,
+ "text_loss": 0.41950157284736633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.784267684179632,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0007627980002511672,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6107847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023050960153341293,
+ "skip_count": 1.0,
+ "step": 3788,
+ "text_loss": 0.48561373353004456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0007625346335986039,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6110546.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018124044872820377,
+ "skip_count": 0.0,
+ "step": 3790,
+ "text_loss": 0.20882295072078705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0007622711663422735,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6113600.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007613401976414025,
+ "skip_count": 0.0,
+ "step": 3792,
+ "text_loss": 0.31751760840415955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.812444966245963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0007620075985831375,
+ "loss": 0.0092,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6116916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005452962126582861,
+ "skip_count": 2.0,
+ "step": 3794,
+ "text_loss": 0.3246645927429199
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 17.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0007617439304221956,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6120056.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0043787881731987,
+ "skip_count": 0.0,
+ "step": 3796,
+ "text_loss": 0.4859195947647095
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0007614801619604856,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6122668.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033891722559928894,
+ "skip_count": 0.0,
+ "step": 3798,
+ "text_loss": 0.48194369673728943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.840622248312297,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.0007612162932990845,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6126792.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001883238204754889,
+ "skip_count": 0.0,
+ "step": 3800,
+ "text_loss": 0.3740062117576599
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0007609523245391068,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6129801.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00882677361369133,
+ "skip_count": 2.0,
+ "step": 3802,
+ "text_loss": 0.5759486556053162
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007606882557817062,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6133613.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009537030011415482,
+ "skip_count": 2.0,
+ "step": 3804,
+ "text_loss": 0.3217554986476898
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.0007604240871280742,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6137784.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023913346230983734,
+ "skip_count": 0.0,
+ "step": 3806,
+ "text_loss": 0.3718445599079132
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.878191957734078,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0007601598186794407,
+ "loss": 0.0081,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 6141356.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.033796411007642746,
+ "skip_count": 1.0,
+ "step": 3808,
+ "text_loss": 0.2717749774456024
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.000759895450537074,
+ "loss": 0.01,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6144448.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037919918540865183,
+ "skip_count": 2.0,
+ "step": 3810,
+ "text_loss": 0.5935076475143433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0007596309828022803,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6147526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008182782912626863,
+ "skip_count": 0.0,
+ "step": 3812,
+ "text_loss": 0.449336439371109
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 17.906369239800412,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0007593664155764044,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6150620.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001734903547912836,
+ "skip_count": 0.0,
+ "step": 3814,
+ "text_loss": 0.6647221446037292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.915761667155856,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0007591017489608286,
+ "loss": 0.0088,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6153714.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04721754416823387,
+ "skip_count": 0.0,
+ "step": 3816,
+ "text_loss": 0.25481200218200684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0007588369830569738,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6156974.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002484306460246444,
+ "skip_count": 0.0,
+ "step": 3818,
+ "text_loss": 0.7195295691490173
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.934546521866746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0007585721179662988,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6159660.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0051363613456487656,
+ "skip_count": 2.0,
+ "step": 3820,
+ "text_loss": 0.5073586702346802
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052734375,
+ "learning_rate": 0.0007583071537903005,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6163146.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006719176657497883,
+ "skip_count": 0.0,
+ "step": 3822,
+ "text_loss": 0.6950558423995972
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0007580420906305136,
+ "loss": 0.0073,
+ "macro_f1": 1.0,
+ "num_tokens": 6166257.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00871267355978489,
+ "skip_count": 3.0,
+ "step": 3824,
+ "text_loss": 0.2549148201942444
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.0007577769285885109,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6169624.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015642556827515364,
+ "skip_count": 0.0,
+ "step": 3826,
+ "text_loss": 0.3720305860042572
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0007575116677659029,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6172673.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011551049537956715,
+ "skip_count": 0.0,
+ "step": 3828,
+ "text_loss": 0.6819429397583008
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0007572463082643377,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6175414.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008922060951590538,
+ "skip_count": 0.0,
+ "step": 3830,
+ "text_loss": 0.5424665212631226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0007569808501855023,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6178701.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004167596809566021,
+ "skip_count": 1.0,
+ "step": 3832,
+ "text_loss": 0.4429764151573181
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.00075671529363112,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6183036.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008732969872653484,
+ "skip_count": 0.0,
+ "step": 3834,
+ "text_loss": 0.8015334010124207
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0007564496387029531,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6186325.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021374202333390713,
+ "skip_count": 1.0,
+ "step": 3836,
+ "text_loss": 0.4233771562576294
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.000756183885502801,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6189919.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004017227329313755,
+ "skip_count": 0.0,
+ "step": 3838,
+ "text_loss": 0.33691394329071045
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 0.0007559180341325005,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6193412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013120946241542697,
+ "skip_count": 0.0,
+ "step": 3840,
+ "text_loss": 0.14970099925994873
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 18.037569709421778,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0007556520846939265,
+ "loss": 0.0061,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 6196588.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011793316341936588,
+ "skip_count": 2.0,
+ "step": 3842,
+ "text_loss": 0.2714047133922577
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 18.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0007553860372889914,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 6200841.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.019968654960393906,
+ "skip_count": 4.0,
+ "step": 3844,
+ "text_loss": 0.23680976033210754
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 18.05635456413267,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 0.0007551198920196452,
+ "loss": 0.0079,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 6203797.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013615630567073822,
+ "skip_count": 2.0,
+ "step": 3846,
+ "text_loss": 0.25839608907699585
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0546875,
+ "learning_rate": 0.000754853648987875,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6206790.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002420815173536539,
+ "skip_count": 1.0,
+ "step": 3848,
+ "text_loss": 0.5358025431632996
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 18.07513941884356,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0007545873082957057,
+ "loss": 0.0072,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 6209791.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.018236197531223297,
+ "skip_count": 3.0,
+ "step": 3850,
+ "text_loss": 0.1463700383901596
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 18.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0007543208700451998,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6212792.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006242573726922274,
+ "skip_count": 3.0,
+ "step": 3852,
+ "text_loss": 0.9441591501235962
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.093924273554446,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0007540543343384565,
+ "loss": 0.0062,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6215747.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01451140083372593,
+ "skip_count": 1.0,
+ "step": 3854,
+ "text_loss": 0.41610902547836304
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0007537877012776132,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6218593.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00037674361374229193,
+ "skip_count": 0.0,
+ "step": 3856,
+ "text_loss": 0.6048852205276489
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.0007535209709648439,
+ "loss": 0.0045,
+ "macro_f1": 1.0,
+ "num_tokens": 6221315.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005776284262537956,
+ "skip_count": 3.0,
+ "step": 3858,
+ "text_loss": 0.35627537965774536
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.0007532541435023605,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6225012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009280376834794879,
+ "skip_count": 0.0,
+ "step": 3860,
+ "text_loss": 0.6440183520317078
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0007529872189924114,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6227650.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009876530384644866,
+ "skip_count": 0.0,
+ "step": 3862,
+ "text_loss": 0.35507893562316895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.14088641033167,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0007527201975372827,
+ "loss": 0.0045,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 6230557.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013780162669718266,
+ "skip_count": 1.0,
+ "step": 3864,
+ "text_loss": 0.38958442211151123
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 18.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0007524530792392977,
+ "loss": 0.011,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6233371.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004849869292229414,
+ "skip_count": 3.0,
+ "step": 3866,
+ "text_loss": 0.3826720714569092
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.0007521858642008163,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6236770.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008618295192718506,
+ "skip_count": 1.0,
+ "step": 3868,
+ "text_loss": 0.3596078157424927
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0007519185525242363,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6239661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013421972980722785,
+ "skip_count": 0.0,
+ "step": 3870,
+ "text_loss": 0.5585550665855408
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0007516511443119916,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6242459.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038009448908269405,
+ "skip_count": 1.0,
+ "step": 3872,
+ "text_loss": 0.4418395757675171
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.187848547108892,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0007513836396665534,
+ "loss": 0.0061,
+ "macro_f1": 1.0,
+ "num_tokens": 6245489.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002785376040264964,
+ "skip_count": 2.0,
+ "step": 3874,
+ "text_loss": 0.551510751247406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.19724097446434,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0007511160386904305,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6249014.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021424589212983847,
+ "skip_count": 1.0,
+ "step": 3876,
+ "text_loss": 1.0502676963806152
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0007508483414861679,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6252357.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0085759861394763,
+ "skip_count": 1.0,
+ "step": 3878,
+ "text_loss": 0.49212515354156494
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.216025829175226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0007505805481563477,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6254975.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010723904706537724,
+ "skip_count": 0.0,
+ "step": 3880,
+ "text_loss": 0.7022985816001892
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.0007503126588035887,
+ "loss": 0.0081,
+ "macro_f1": 1.0,
+ "num_tokens": 6258001.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012809890322387218,
+ "skip_count": 2.0,
+ "step": 3882,
+ "text_loss": 0.1829151213169098
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.0007500446735305466,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6261795.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026790346018970013,
+ "skip_count": 1.0,
+ "step": 3884,
+ "text_loss": 0.20436066389083862
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.000749776592439914,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 6265585.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005243788007646799,
+ "skip_count": 2.0,
+ "step": 3886,
+ "text_loss": 0.4479229748249054
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.00074950841563442,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6269039.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007998534478247166,
+ "skip_count": 1.0,
+ "step": 3888,
+ "text_loss": 0.2154676914215088
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0007492401432168303,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6272315.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004648822825402021,
+ "skip_count": 1.0,
+ "step": 3890,
+ "text_loss": 0.3375042676925659
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.272380393307895,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0007489717752899477,
+ "loss": 0.0094,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6275342.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012154200114309788,
+ "skip_count": 1.0,
+ "step": 3892,
+ "text_loss": 0.1964082419872284
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.000748703311956611,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 6278700.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004610476549714804,
+ "skip_count": 2.0,
+ "step": 3894,
+ "text_loss": 0.26545581221580505
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 0.0007484347533196961,
+ "loss": 0.0105,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6281864.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0075586591847240925,
+ "skip_count": 2.0,
+ "step": 3896,
+ "text_loss": 0.3106999397277832
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.0007481660994821151,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6284676.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007845268584787846,
+ "skip_count": 1.0,
+ "step": 3898,
+ "text_loss": 0.4094304144382477
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.309950102729672,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0007478973505468165,
+ "loss": 0.0081,
+ "macro_f1": 1.0,
+ "num_tokens": 6287470.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011116391979157925,
+ "skip_count": 2.0,
+ "step": 3900,
+ "text_loss": 0.1838909536600113
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0007476285066167857,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6290432.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004599364474415779,
+ "skip_count": 0.0,
+ "step": 3902,
+ "text_loss": 0.25872838497161865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.0007473595677950439,
+ "loss": 0.0109,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6293557.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016367282951250672,
+ "skip_count": 1.0,
+ "step": 3904,
+ "text_loss": 0.5272360444068909
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0007470905341846492,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6295979.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004760588926728815,
+ "skip_count": 0.0,
+ "step": 3906,
+ "text_loss": 0.666959822177887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007468214058886956,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6299215.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000524883100297302,
+ "skip_count": 0.0,
+ "step": 3908,
+ "text_loss": 0.5144801139831543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0007465521830103137,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6302320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016085522947832942,
+ "skip_count": 0.0,
+ "step": 3910,
+ "text_loss": 0.14342890679836273
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0007462828656526702,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6305212.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002720315707847476,
+ "skip_count": 2.0,
+ "step": 3912,
+ "text_loss": 0.31109121441841125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.375697094217788,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.0007460134539189681,
+ "loss": 0.0114,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6308964.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010418406454846263,
+ "skip_count": 1.0,
+ "step": 3914,
+ "text_loss": 0.5662030577659607
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.38508952157323,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.0007457439479124459,
+ "loss": 0.0134,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6313195.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020303844939917326,
+ "skip_count": 0.0,
+ "step": 3916,
+ "text_loss": 0.6358339190483093
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.394481948928675,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0007454743477363797,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6315949.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006592223653569818,
+ "skip_count": 0.0,
+ "step": 3918,
+ "text_loss": 0.35648423433303833
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.403874376284122,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0007452046534940803,
+ "loss": 0.0075,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 6319024.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.024555351585149765,
+ "skip_count": 1.0,
+ "step": 3920,
+ "text_loss": 0.21955153346061707
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0007449348652888952,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6321633.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003606822807341814,
+ "skip_count": 1.0,
+ "step": 3922,
+ "text_loss": 0.6079489588737488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0007446649832242075,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6325209.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035831446293741465,
+ "skip_count": 1.0,
+ "step": 3924,
+ "text_loss": 0.2774808406829834
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.432051658350456,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0007443950074034368,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6327822.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006809544749557972,
+ "skip_count": 2.0,
+ "step": 3926,
+ "text_loss": 0.48236769437789917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.4414440857059,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0007441249379300381,
+ "loss": 0.007,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 6331662.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023832591250538826,
+ "skip_count": 2.0,
+ "step": 3928,
+ "text_loss": 0.7287537455558777
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0007438547749075028,
+ "loss": 0.0061,
+ "macro_f1": 1.0,
+ "num_tokens": 6335801.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011755098588764668,
+ "skip_count": 3.0,
+ "step": 3930,
+ "text_loss": 0.17253030836582184
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0007435845184393577,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6338747.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005972472485154867,
+ "skip_count": 0.0,
+ "step": 3932,
+ "text_loss": 0.6400216817855835
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0007433141686291657,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6342772.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030393085908144712,
+ "skip_count": 1.0,
+ "step": 3934,
+ "text_loss": 0.6865074038505554
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.0007430437255805252,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6345957.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006984061910770833,
+ "skip_count": 0.0,
+ "step": 3936,
+ "text_loss": 0.40398702025413513
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.488406222483125,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.0007427731893970706,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6349162.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005219762213528156,
+ "skip_count": 0.0,
+ "step": 3938,
+ "text_loss": 0.5951031446456909
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 18.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0007425025601824717,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6352655.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015575960278511047,
+ "skip_count": 3.0,
+ "step": 3940,
+ "text_loss": 0.26689088344573975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0007422318380404346,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6355890.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012208883417770267,
+ "skip_count": 0.0,
+ "step": 3942,
+ "text_loss": 0.570725679397583
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0007419610230746999,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6358891.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0029412026051431894,
+ "skip_count": 0.0,
+ "step": 3944,
+ "text_loss": 0.5521301031112671
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0007416901153890448,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6361586.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010283910669386387,
+ "skip_count": 0.0,
+ "step": 3946,
+ "text_loss": 0.4046417772769928
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0007414191150872818,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6364954.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008222512900829315,
+ "skip_count": 2.0,
+ "step": 3948,
+ "text_loss": 0.2803446352481842
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0007411480222732583,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6367660.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001304348581470549,
+ "skip_count": 0.0,
+ "step": 3950,
+ "text_loss": 0.45553359389305115
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0007408768370508576,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6371585.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016345062758773565,
+ "skip_count": 0.0,
+ "step": 3952,
+ "text_loss": 0.25424402952194214
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0007406055595239986,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6374365.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005097290268167853,
+ "skip_count": 0.0,
+ "step": 3954,
+ "text_loss": 0.5856026411056519
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.060546875,
+ "learning_rate": 0.0007403341897966356,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6377335.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002482263371348381,
+ "skip_count": 1.0,
+ "step": 3956,
+ "text_loss": 0.5145615339279175
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.0007400627279727574,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6380799.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011743451468646526,
+ "skip_count": 0.0,
+ "step": 3958,
+ "text_loss": 0.31868961453437805
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.0007397911741563892,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6383963.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009861881844699383,
+ "skip_count": 0.0,
+ "step": 3960,
+ "text_loss": 0.21192194521427155
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.601115350748458,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0007395195284515905,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6387410.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004189098719507456,
+ "skip_count": 0.0,
+ "step": 3962,
+ "text_loss": 0.5809708833694458
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.610507778103905,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0007392477909624567,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6390670.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001853612600825727,
+ "skip_count": 0.0,
+ "step": 3964,
+ "text_loss": 0.48985618352890015
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.61990020545935,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.0007389759617931182,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6393609.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003303771372884512,
+ "skip_count": 0.0,
+ "step": 3966,
+ "text_loss": 0.28729453682899475
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 18.629292632814792,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0007387040410477404,
+ "loss": 0.0058,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 6396608.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01791577786207199,
+ "skip_count": 4.0,
+ "step": 3968,
+ "text_loss": 0.30386820435523987
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0007384320288305235,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6399793.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005771282012574375,
+ "skip_count": 0.0,
+ "step": 3970,
+ "text_loss": 0.47285011410713196
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0007381599252457037,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6403365.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003010645741596818,
+ "skip_count": 0.0,
+ "step": 3972,
+ "text_loss": 0.5313063859939575
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.000737887730397551,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6406205.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006457438692450523,
+ "skip_count": 0.0,
+ "step": 3974,
+ "text_loss": 0.2323843240737915
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.666862342236573,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0007376154443903713,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6409552.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010693981312215328,
+ "skip_count": 0.0,
+ "step": 3976,
+ "text_loss": 0.6304101943969727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.676254769592017,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0007373430673285051,
+ "loss": 0.008,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6412386.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03116440214216709,
+ "skip_count": 0.0,
+ "step": 3978,
+ "text_loss": 0.23448467254638672
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.68564719694746,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10009765625,
+ "learning_rate": 0.0007370705993163278,
+ "loss": 0.0111,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6416054.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011973714455962181,
+ "skip_count": 0.0,
+ "step": 3980,
+ "text_loss": 0.6371755599975586
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0007367980404582497,
+ "loss": 0.0105,
+ "macro_f1": 1.0,
+ "num_tokens": 6419238.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005117347463965416,
+ "skip_count": 2.0,
+ "step": 3982,
+ "text_loss": 0.19822923839092255
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.0007365253908587158,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6422122.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010648667812347412,
+ "skip_count": 0.0,
+ "step": 3984,
+ "text_loss": 0.566700279712677
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.0007362526506222058,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6425313.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005726494826376438,
+ "skip_count": 0.0,
+ "step": 3986,
+ "text_loss": 0.6568437814712524
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0007359798198532343,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6428422.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004504100419580936,
+ "skip_count": 0.0,
+ "step": 3988,
+ "text_loss": 0.598754346370697
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0007357068986563509,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6431512.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019837068393826485,
+ "skip_count": 1.0,
+ "step": 3990,
+ "text_loss": 0.7152895927429199
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0007354338871361393,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6434358.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026031541638076305,
+ "skip_count": 1.0,
+ "step": 3992,
+ "text_loss": 0.4986513555049896
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.751394188435572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.000735160785397218,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6438175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024831905029714108,
+ "skip_count": 2.0,
+ "step": 3994,
+ "text_loss": 0.4406205713748932
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007348875935442401,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6441228.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008635876583866775,
+ "skip_count": 0.0,
+ "step": 3996,
+ "text_loss": 0.48884135484695435
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0007346143116818932,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6444318.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004007008858025074,
+ "skip_count": 0.0,
+ "step": 3998,
+ "text_loss": 0.6669428944587708
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.0007343409399148994,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6448317.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031380734872072935,
+ "skip_count": 0.0,
+ "step": 4000,
+ "text_loss": 0.6468493938446045
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0007340674783480154,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6451673.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004996029660105705,
+ "skip_count": 0.0,
+ "step": 4002,
+ "text_loss": 0.28135430812835693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.798356325212797,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0007337939270860323,
+ "loss": 0.009,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6456372.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03784399852156639,
+ "skip_count": 0.0,
+ "step": 4004,
+ "text_loss": 0.41668644547462463
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0007335202862337753,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6459047.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011750755365937948,
+ "skip_count": 0.0,
+ "step": 4006,
+ "text_loss": 0.6853910684585571
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 18.817141179923688,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.000733246555896104,
+ "loss": 0.0062,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 6462390.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01630394533276558,
+ "skip_count": 4.0,
+ "step": 4008,
+ "text_loss": 0.7110592126846313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0007329727361779124,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6466057.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0052404399029910564,
+ "skip_count": 2.0,
+ "step": 4010,
+ "text_loss": 0.13856995105743408
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.000732698827184129,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6468878.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002138581359758973,
+ "skip_count": 0.0,
+ "step": 4012,
+ "text_loss": 0.3999565839767456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.000732424829019716,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6472364.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037466560024768114,
+ "skip_count": 0.0,
+ "step": 4014,
+ "text_loss": 0.28161346912384033
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0007321507417896699,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6475379.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010469373082742095,
+ "skip_count": 0.0,
+ "step": 4016,
+ "text_loss": 1.0490952730178833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06591796875,
+ "learning_rate": 0.0007318765655990218,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6478585.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009968385100364685,
+ "skip_count": 2.0,
+ "step": 4018,
+ "text_loss": 0.31696680188179016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0007316023005528362,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6484153.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002349073765799403,
+ "skip_count": 1.0,
+ "step": 4020,
+ "text_loss": 0.30981555581092834
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 18.8828881714118,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0007313279467562124,
+ "loss": 0.0053,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 6487029.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011854278855025768,
+ "skip_count": 4.0,
+ "step": 4022,
+ "text_loss": 0.9689550399780273
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0007310535043142829,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 6490315.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00908346101641655,
+ "skip_count": 3.0,
+ "step": 4024,
+ "text_loss": 0.1705625057220459
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0007307789733322146,
+ "loss": 0.0094,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6493921.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007360641611739993,
+ "skip_count": 0.0,
+ "step": 4026,
+ "text_loss": 0.6252996325492859
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.087890625,
+ "learning_rate": 0.0007305043539152083,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6496689.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017757206223905087,
+ "skip_count": 0.0,
+ "step": 4028,
+ "text_loss": 0.40533265471458435
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.000730229646168499,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6500090.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022657213266938925,
+ "skip_count": 0.0,
+ "step": 4030,
+ "text_loss": 0.25954708456993103
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0007299548501973548,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6503023.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021747269202023745,
+ "skip_count": 0.0,
+ "step": 4032,
+ "text_loss": 0.6223418712615967
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 18.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0007296799661070782,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6506382.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006400502752512693,
+ "skip_count": 4.0,
+ "step": 4034,
+ "text_loss": 0.6873653531074524
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.94863516289991,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0007294049940030055,
+ "loss": 0.0065,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6509194.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0197185929864645,
+ "skip_count": 1.0,
+ "step": 4036,
+ "text_loss": 0.16156800091266632
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0007291299339905059,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6512271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009541353792883456,
+ "skip_count": 0.0,
+ "step": 4038,
+ "text_loss": 0.5038442015647888
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.967420017610802,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0007288547861749838,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6516403.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008226391859352589,
+ "skip_count": 2.0,
+ "step": 4040,
+ "text_loss": 0.3706657588481903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.976812444966246,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0007285795506618758,
+ "loss": 0.0063,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6519310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017001887783408165,
+ "skip_count": 1.0,
+ "step": 4042,
+ "text_loss": 0.24296723306179047
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0007283042275566528,
+ "loss": 0.0125,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6521979.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01666323095560074,
+ "skip_count": 2.0,
+ "step": 4044,
+ "text_loss": 0.36904850602149963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.995597299677137,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 0.0007280288169648192,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6524976.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007593175978399813,
+ "skip_count": 0.0,
+ "step": 4046,
+ "text_loss": 0.7312731146812439
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 19.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0007277533189919127,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 6528638.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005652119871228933,
+ "skip_count": 1.0,
+ "step": 4048,
+ "text_loss": 0.23326151072978973
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.014088641033165,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.0007274777337435046,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6532193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010509157553315163,
+ "skip_count": 2.0,
+ "step": 4050,
+ "text_loss": 0.23918013274669647
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0007272020613251999,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6534994.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002153293928131461,
+ "skip_count": 0.0,
+ "step": 4052,
+ "text_loss": 0.5890526175498962
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0007269263018426367,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 6537469.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0018494052346795797,
+ "skip_count": 2.0,
+ "step": 4054,
+ "text_loss": 0.36058738827705383
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0007266504554014866,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6541271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007579320226795971,
+ "skip_count": 0.0,
+ "step": 4056,
+ "text_loss": 0.4089007079601288
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.051658350454947,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0007263745221074545,
+ "loss": 0.0086,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 6544293.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06202420964837074,
+ "skip_count": 2.0,
+ "step": 4058,
+ "text_loss": 0.2226305454969406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 19.06105077781039,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.0007260985020662784,
+ "loss": 0.0049,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 6547640.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.044639844447374344,
+ "skip_count": 3.0,
+ "step": 4060,
+ "text_loss": 0.23004353046417236
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 19.070443205165834,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.0007258223953837298,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6550840.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004215611144900322,
+ "skip_count": 0.0,
+ "step": 4062,
+ "text_loss": 0.2891770601272583
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0007255462021656132,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6554122.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011056234361603856,
+ "skip_count": 0.0,
+ "step": 4064,
+ "text_loss": 0.7485370635986328
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007252699225177666,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6557138.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008258933201432228,
+ "skip_count": 2.0,
+ "step": 4066,
+ "text_loss": 0.25219282507896423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.098620487232168,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0007249935565460606,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6560654.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005102175287902355,
+ "skip_count": 0.0,
+ "step": 4068,
+ "text_loss": 0.5553314089775085
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.108012914587615,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0007247171043563994,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6563814.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01283820066601038,
+ "skip_count": 2.0,
+ "step": 4070,
+ "text_loss": 0.15729956328868866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.11740534194306,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.0007244405660547199,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6567060.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009684927063062787,
+ "skip_count": 0.0,
+ "step": 4072,
+ "text_loss": 0.3725031912326813
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.126797769298502,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 0.000724163941746992,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6571608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007890827837400138,
+ "skip_count": 0.0,
+ "step": 4074,
+ "text_loss": 0.8438301682472229
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 19.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0007238872315392189,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 6575214.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0040600355714559555,
+ "skip_count": 1.0,
+ "step": 4076,
+ "text_loss": 0.5923112034797668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0007236104355374363,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6578383.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024899677373468876,
+ "skip_count": 2.0,
+ "step": 4078,
+ "text_loss": 0.20302526652812958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05517578125,
+ "learning_rate": 0.000723333553847713,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6582175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006120906211435795,
+ "skip_count": 2.0,
+ "step": 4080,
+ "text_loss": 0.5400223731994629
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06787109375,
+ "learning_rate": 0.0007230565865761504,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6585516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029941233806312084,
+ "skip_count": 0.0,
+ "step": 4082,
+ "text_loss": 0.19460804760456085
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.0007227795338288831,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6588266.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009357884526252747,
+ "skip_count": 2.0,
+ "step": 4084,
+ "text_loss": 0.35237613320350647
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0007225023957120782,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6591009.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023083325941115618,
+ "skip_count": 2.0,
+ "step": 4086,
+ "text_loss": 0.4336731433868408
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.192544760786618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.0007222251723319356,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6594472.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008416616474278271,
+ "skip_count": 0.0,
+ "step": 4088,
+ "text_loss": 0.6390535831451416
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 0.0007219478637946877,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6597477.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004390760324895382,
+ "skip_count": 1.0,
+ "step": 4090,
+ "text_loss": 0.525839626789093
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0007216704702065997,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6600431.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010311100631952286,
+ "skip_count": 0.0,
+ "step": 4092,
+ "text_loss": 0.5310423374176025
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.220722042852948,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0007213929916739695,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6603899.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032497600186616182,
+ "skip_count": 1.0,
+ "step": 4094,
+ "text_loss": 0.2775326073169708
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.000721115428303127,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 6606544.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004692315589636564,
+ "skip_count": 3.0,
+ "step": 4096,
+ "text_loss": 0.6667124032974243
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0007208377802004353,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6610097.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007263485458679497,
+ "skip_count": 0.0,
+ "step": 4098,
+ "text_loss": 0.6916406750679016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.248899324919282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0007205600474722897,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6613836.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017989488551393151,
+ "skip_count": 0.0,
+ "step": 4100,
+ "text_loss": 0.5257929563522339
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.000720282230225118,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6616780.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011308686807751656,
+ "skip_count": 1.0,
+ "step": 4102,
+ "text_loss": 0.4410906732082367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0007200043285653799,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6620110.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002058265497907996,
+ "skip_count": 2.0,
+ "step": 4104,
+ "text_loss": 0.8581191897392273
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 19.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0007197263425995681,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6622585.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017528717871755362,
+ "skip_count": 0.0,
+ "step": 4106,
+ "text_loss": 0.5000449419021606
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.286469034341064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.0007194482724342075,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6626356.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021995846182107925,
+ "skip_count": 0.0,
+ "step": 4108,
+ "text_loss": 0.401346892118454
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0007191701181758547,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6629738.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014869922306388617,
+ "skip_count": 0.0,
+ "step": 4110,
+ "text_loss": 0.9598422050476074
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.0007188918799310993,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6632807.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012853415682911873,
+ "skip_count": 0.0,
+ "step": 4112,
+ "text_loss": 0.3996548354625702
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.314646316407398,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0007186135578065627,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6636227.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009887361666187644,
+ "skip_count": 0.0,
+ "step": 4114,
+ "text_loss": 0.4127283990383148
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0007183351519088982,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6639443.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006282114889472723,
+ "skip_count": 1.0,
+ "step": 4116,
+ "text_loss": 0.20028606057167053
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.333431171118285,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.061767578125,
+ "learning_rate": 0.0007180566623447917,
+ "loss": 0.0114,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 6642127.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008101986721158028,
+ "skip_count": 0.0,
+ "step": 4118,
+ "text_loss": 0.763931155204773
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.0007177780892209607,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6645376.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001953610684722662,
+ "skip_count": 0.0,
+ "step": 4120,
+ "text_loss": 0.42317715287208557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0007174994326441551,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6648150.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003279355587437749,
+ "skip_count": 0.0,
+ "step": 4122,
+ "text_loss": 0.19656142592430115
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0007172206927211567,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6650935.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032076311763375998,
+ "skip_count": 0.0,
+ "step": 4124,
+ "text_loss": 0.13608409464359283
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0007169418695587791,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6654464.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004065621178597212,
+ "skip_count": 2.0,
+ "step": 4126,
+ "text_loss": 0.4882086217403412
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0007166629632638678,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6657749.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009243001695722342,
+ "skip_count": 0.0,
+ "step": 4128,
+ "text_loss": 0.31632331013679504
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0007163839739433003,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6660997.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018459554994478822,
+ "skip_count": 0.0,
+ "step": 4130,
+ "text_loss": 0.6123947501182556
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.399178162606397,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0007161049017039857,
+ "loss": 0.0073,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 6663542.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.030032536014914513,
+ "skip_count": 2.0,
+ "step": 4132,
+ "text_loss": 0.6985659003257751
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 0.0007158257466528652,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6666178.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013813833938911557,
+ "skip_count": 0.0,
+ "step": 4134,
+ "text_loss": 0.38380664587020874
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 19.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.0007155465088969114,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6668852.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00513424864038825,
+ "skip_count": 3.0,
+ "step": 4136,
+ "text_loss": 0.49724283814430237
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0007152671885431288,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6671430.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005165594047866762,
+ "skip_count": 0.0,
+ "step": 4138,
+ "text_loss": 0.666959822177887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 0.0007149877856985535,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6675215.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001685218419879675,
+ "skip_count": 0.0,
+ "step": 4140,
+ "text_loss": 0.3127259612083435
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.000714708300470253,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6678505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004025314934551716,
+ "skip_count": 0.0,
+ "step": 4142,
+ "text_loss": 0.3179470896720886
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 19.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0007144287329653269,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6681127.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005965690594166517,
+ "skip_count": 0.0,
+ "step": 4144,
+ "text_loss": 0.3862907886505127
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.464925154094512,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0007141490832909058,
+ "loss": 0.0071,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6683968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012896374799311161,
+ "skip_count": 1.0,
+ "step": 4146,
+ "text_loss": 0.48156118392944336
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0007138693515541519,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6687196.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006367767928168178,
+ "skip_count": 1.0,
+ "step": 4148,
+ "text_loss": 0.676702082157135
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 19.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.0007135895378622592,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6689972.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004532640799880028,
+ "skip_count": 3.0,
+ "step": 4150,
+ "text_loss": 0.5865558981895447
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.493102436160846,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0007133096423224526,
+ "loss": 0.0081,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6693568.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0377078577876091,
+ "skip_count": 0.0,
+ "step": 4152,
+ "text_loss": 0.2790502607822418
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056640625,
+ "learning_rate": 0.0007130296650419885,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6696468.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004455826710909605,
+ "skip_count": 1.0,
+ "step": 4154,
+ "text_loss": 0.5869500041007996
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0007127496061281551,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6699307.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001998464809730649,
+ "skip_count": 0.0,
+ "step": 4156,
+ "text_loss": 0.6931945085525513
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 19.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007124694656882713,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6702647.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.004117495380342007,
+ "skip_count": 0.0,
+ "step": 4158,
+ "text_loss": 0.4325876832008362
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.0007121892438296874,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6705964.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014713290147483349,
+ "skip_count": 0.0,
+ "step": 4160,
+ "text_loss": 0.3672060966491699
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0007119089406597849,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6710182.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037311650812625885,
+ "skip_count": 1.0,
+ "step": 4162,
+ "text_loss": 0.6643805503845215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0007116285562859767,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6713410.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006017287727445364,
+ "skip_count": 0.0,
+ "step": 4164,
+ "text_loss": 0.4606415927410126
+ },
+ {
+ "acc_repeat": 0.3333333432674408,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 19.55884942764896,
+ "f1_execute": 0.9545454382896423,
+ "f1_repeat": 0.5,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0007113480908157065,
+ "loss": 0.0108,
+ "macro_f1": 0.8181818723678589,
+ "num_tokens": 6716056.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.08640352636575699,
+ "skip_count": 4.0,
+ "step": 4166,
+ "text_loss": 0.3139408528804779
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0007110675443564491,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6719497.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012731150491163135,
+ "skip_count": 0.0,
+ "step": 4168,
+ "text_loss": 0.7283861637115479
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.57763428235985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0007107869170157108,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6722297.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021509863436222076,
+ "skip_count": 2.0,
+ "step": 4170,
+ "text_loss": 0.5767703056335449
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.000710506208901028,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6725762.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00257494836114347,
+ "skip_count": 1.0,
+ "step": 4172,
+ "text_loss": 0.33571913838386536
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.000710225420119969,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 6728436.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00943201594054699,
+ "skip_count": 3.0,
+ "step": 4174,
+ "text_loss": 0.6849368810653687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0007099445507801323,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6731427.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01046718005090952,
+ "skip_count": 2.0,
+ "step": 4176,
+ "text_loss": 0.3346157670021057
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0007096636009891477,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6734800.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007813365664333105,
+ "skip_count": 0.0,
+ "step": 4178,
+ "text_loss": 0.49989959597587585
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.000709382570854676,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6738244.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002825600327923894,
+ "skip_count": 0.0,
+ "step": 4180,
+ "text_loss": 0.15744923055171967
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0007091014604844078,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6741695.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017124463338404894,
+ "skip_count": 0.0,
+ "step": 4182,
+ "text_loss": 0.3752405643463135
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.0007088202699860655,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 6744882.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005134924780577421,
+ "skip_count": 3.0,
+ "step": 4184,
+ "text_loss": 0.18534569442272186
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.000708538999467402,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6747811.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002371585462242365,
+ "skip_count": 1.0,
+ "step": 4186,
+ "text_loss": 0.6251029968261719
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.662166128558848,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0007082576490362004,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6750765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002088436856865883,
+ "skip_count": 0.0,
+ "step": 4188,
+ "text_loss": 0.35471436381340027
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.000707976218800275,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6754021.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012272283202037215,
+ "skip_count": 0.0,
+ "step": 4190,
+ "text_loss": 0.5737302899360657
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07763671875,
+ "learning_rate": 0.0007076947088674701,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6756793.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026050808373838663,
+ "skip_count": 0.0,
+ "step": 4192,
+ "text_loss": 0.526336669921875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.054931640625,
+ "learning_rate": 0.000707413119345661,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6760221.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013151296880096197,
+ "skip_count": 0.0,
+ "step": 4194,
+ "text_loss": 0.5678895711898804
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0007071314503427532,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6763721.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001528652966953814,
+ "skip_count": 0.0,
+ "step": 4196,
+ "text_loss": 0.7640175223350525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0007068497019666829,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6768581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019202446565032005,
+ "skip_count": 0.0,
+ "step": 4198,
+ "text_loss": 0.41878414154052734
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0007065678743254167,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6772758.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004667408298701048,
+ "skip_count": 1.0,
+ "step": 4200,
+ "text_loss": 0.3550313413143158
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 19.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.0007062859675269513,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6776671.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.00568761583417654,
+ "skip_count": 0.0,
+ "step": 4202,
+ "text_loss": 0.1707649976015091
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0007060039816793141,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6780284.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030401297844946384,
+ "skip_count": 0.0,
+ "step": 4204,
+ "text_loss": 0.2686377167701721
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 19.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0007057219168905625,
+ "loss": 0.0068,
+ "macro_f1": 1.0,
+ "num_tokens": 6783525.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003353122156113386,
+ "skip_count": 5.0,
+ "step": 4206,
+ "text_loss": 0.5235374569892883
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.000705439773268784,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6787691.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016532237641513348,
+ "skip_count": 1.0,
+ "step": 4208,
+ "text_loss": 0.5002681612968445
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0007051575509220972,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6790833.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011808308772742748,
+ "skip_count": 0.0,
+ "step": 4210,
+ "text_loss": 0.7251001596450806
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0007048752499586497,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6794260.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006246297620236874,
+ "skip_count": 2.0,
+ "step": 4212,
+ "text_loss": 0.2430499643087387
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.784267684179632,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.00070459287048662,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6797413.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012964420020580292,
+ "skip_count": 0.0,
+ "step": 4214,
+ "text_loss": 0.48889362812042236
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0007043104126142163,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6800815.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018109704833477736,
+ "skip_count": 0.0,
+ "step": 4216,
+ "text_loss": 0.5617026686668396
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 19.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.0007040278764496771,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 6803937.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0028699536342173815,
+ "skip_count": 1.0,
+ "step": 4218,
+ "text_loss": 0.548405647277832
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.812444966245963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0007037452621012708,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6806946.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007951617590151727,
+ "skip_count": 0.0,
+ "step": 4220,
+ "text_loss": 0.5702725648880005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0007034625696772958,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6810083.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003436052706092596,
+ "skip_count": 2.0,
+ "step": 4222,
+ "text_loss": 0.3898725211620331
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.00070317979928608,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6812845.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005070401239208877,
+ "skip_count": 0.0,
+ "step": 4224,
+ "text_loss": 0.5244157910346985
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.840622248312297,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.000702896951035982,
+ "loss": 0.0101,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6815801.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01560303382575512,
+ "skip_count": 1.0,
+ "step": 4226,
+ "text_loss": 0.26503118872642517
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0007026140250353896,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6819464.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009310240857303143,
+ "skip_count": 2.0,
+ "step": 4228,
+ "text_loss": 0.15597499907016754
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.0007023310213927208,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6822657.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005309136584401131,
+ "skip_count": 0.0,
+ "step": 4230,
+ "text_loss": 0.5271651148796082
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046875,
+ "learning_rate": 0.0007020479402164226,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6825661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005936166271567345,
+ "skip_count": 2.0,
+ "step": 4232,
+ "text_loss": 0.6105108857154846
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.878191957734078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0007017647816149727,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6828688.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001653556595556438,
+ "skip_count": 0.0,
+ "step": 4234,
+ "text_loss": 0.6966437101364136
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.000701481545696878,
+ "loss": 0.009,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6831850.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013501866487786174,
+ "skip_count": 0.0,
+ "step": 4236,
+ "text_loss": 1.259678840637207
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.059814453125,
+ "learning_rate": 0.0007011982325706747,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6834862.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008970130234956741,
+ "skip_count": 1.0,
+ "step": 4238,
+ "text_loss": 0.24906545877456665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.906369239800412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0007009148423449292,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6838148.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026013399474322796,
+ "skip_count": 0.0,
+ "step": 4240,
+ "text_loss": 0.291467547416687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.915761667155856,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0007006313751282371,
+ "loss": 0.0094,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6841142.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021415632218122482,
+ "skip_count": 1.0,
+ "step": 4242,
+ "text_loss": 0.507606029510498
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0007003478310292236,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6844042.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023636550176888704,
+ "skip_count": 0.0,
+ "step": 4244,
+ "text_loss": 0.11626995354890823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.934546521866746,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0007000642101565433,
+ "loss": 0.008,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6847359.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.025154776871204376,
+ "skip_count": 0.0,
+ "step": 4246,
+ "text_loss": 0.42898693680763245
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0006997805126188803,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6850443.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00540317315608263,
+ "skip_count": 0.0,
+ "step": 4248,
+ "text_loss": 0.18085283041000366
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.000699496738524948,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6853495.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014433214673772454,
+ "skip_count": 0.0,
+ "step": 4250,
+ "text_loss": 0.5524004697799683
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0006992128879834891,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 6856774.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013381492346525192,
+ "skip_count": 3.0,
+ "step": 4252,
+ "text_loss": 0.19605717062950134
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.0006989289611032758,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6860313.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007140172645449638,
+ "skip_count": 1.0,
+ "step": 4254,
+ "text_loss": 0.3182447552680969
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0006986449579931091,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6863683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006486213766038418,
+ "skip_count": 1.0,
+ "step": 4256,
+ "text_loss": 0.19250160455703735
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.0006983608787618201,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6867609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001465818495489657,
+ "skip_count": 0.0,
+ "step": 4258,
+ "text_loss": 0.5912898182868958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.000698076723518268,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6870040.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031106441747397184,
+ "skip_count": 0.0,
+ "step": 4260,
+ "text_loss": 0.13542121648788452
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0006977924923713418,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6873441.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005377951893024147,
+ "skip_count": 0.0,
+ "step": 4262,
+ "text_loss": 0.352464497089386
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0006975081854299594,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6876637.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007052485831081867,
+ "skip_count": 0.0,
+ "step": 4264,
+ "text_loss": 0.5023844242095947
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0006972238028030678,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6879928.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013608322478830814,
+ "skip_count": 0.0,
+ "step": 4266,
+ "text_loss": 0.8664718270301819
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.037569709421778,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0006969393445996429,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6883425.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007607188890688121,
+ "skip_count": 0.0,
+ "step": 4268,
+ "text_loss": 0.5131992101669312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0006966548109286897,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6886790.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00035804163780994713,
+ "skip_count": 0.0,
+ "step": 4270,
+ "text_loss": 0.5352054834365845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.000696370201899242,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6889747.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004451376851648092,
+ "skip_count": 1.0,
+ "step": 4272,
+ "text_loss": 0.47865036129951477
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0006960855176203623,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6892604.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015342880506068468,
+ "skip_count": 0.0,
+ "step": 4274,
+ "text_loss": 0.36278650164604187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.07513941884356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0006958007582011425,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6895563.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022974940948188305,
+ "skip_count": 2.0,
+ "step": 4276,
+ "text_loss": 0.6695618629455566
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0006955159237507027,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6898591.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00859096460044384,
+ "skip_count": 1.0,
+ "step": 4278,
+ "text_loss": 0.44284722208976746
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0006952310143781921,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 6903119.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007919861935079098,
+ "skip_count": 3.0,
+ "step": 4280,
+ "text_loss": 0.5006136298179626
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0006949460301927886,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6906394.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008476210059598088,
+ "skip_count": 0.0,
+ "step": 4282,
+ "text_loss": 0.8153555989265442
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 0.0006946609713036985,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6909136.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006711610127240419,
+ "skip_count": 2.0,
+ "step": 4284,
+ "text_loss": 0.43136683106422424
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0185546875,
+ "learning_rate": 0.0006943758378201571,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6912734.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038677838165313005,
+ "skip_count": 0.0,
+ "step": 4286,
+ "text_loss": 0.2693749964237213
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0006940906298514278,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6915838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012188015971332788,
+ "skip_count": 0.0,
+ "step": 4288,
+ "text_loss": 0.5809219479560852
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0006938053475068031,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6919225.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001955829095095396,
+ "skip_count": 0.0,
+ "step": 4290,
+ "text_loss": 0.5116089582443237
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 20.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11279296875,
+ "learning_rate": 0.0006935199908956037,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6922495.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0035709093790501356,
+ "skip_count": 0.0,
+ "step": 4292,
+ "text_loss": 0.2745901644229889
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.0006932345601271786,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6925317.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005745319649577141,
+ "skip_count": 0.0,
+ "step": 4294,
+ "text_loss": 0.6039219498634338
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 20.169063692398005,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0006929490553109056,
+ "loss": 0.0107,
+ "macro_f1": 0.9247862696647644,
+ "num_tokens": 6928054.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.061689916998147964,
+ "skip_count": 6.0,
+ "step": 4296,
+ "text_loss": 0.3904837667942047
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0006926634765561907,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6931348.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002007248578593135,
+ "skip_count": 0.0,
+ "step": 4298,
+ "text_loss": 0.5170742273330688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.187848547108892,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.000692377823972468,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6934411.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005786226247437298,
+ "skip_count": 0.0,
+ "step": 4300,
+ "text_loss": 0.8032443523406982
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.19724097446434,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0006920920976692004,
+ "loss": 0.0071,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6938153.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.024602646008133888,
+ "skip_count": 0.0,
+ "step": 4302,
+ "text_loss": 0.446534663438797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0006918062977558784,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6940731.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005759815219789743,
+ "skip_count": 2.0,
+ "step": 4304,
+ "text_loss": 0.15479247272014618
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.216025829175226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0006915204243420214,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6943246.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005315347574651241,
+ "skip_count": 1.0,
+ "step": 4306,
+ "text_loss": 0.22127842903137207
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0006912344775371765,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6947197.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012061651796102524,
+ "skip_count": 0.0,
+ "step": 4308,
+ "text_loss": 0.7058854103088379
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0006909484574509191,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6951817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029203309677541256,
+ "skip_count": 0.0,
+ "step": 4310,
+ "text_loss": 0.6014000773429871
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.0006906623641928525,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6955094.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005703397560864687,
+ "skip_count": 2.0,
+ "step": 4312,
+ "text_loss": 0.5923848152160645
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08154296875,
+ "learning_rate": 0.0006903761978726084,
+ "loss": 0.0073,
+ "macro_f1": 1.0,
+ "num_tokens": 6958127.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004489895887672901,
+ "skip_count": 2.0,
+ "step": 4314,
+ "text_loss": 0.36911651492118835
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.000690089958599846,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6960871.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003871412482112646,
+ "skip_count": 2.0,
+ "step": 4316,
+ "text_loss": 0.442545086145401
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.000689803646484253,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 6963980.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008667866699397564,
+ "skip_count": 2.0,
+ "step": 4318,
+ "text_loss": 0.1987489014863968
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 20.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0006895172616355446,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6967132.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00843339879065752,
+ "skip_count": 0.0,
+ "step": 4320,
+ "text_loss": 0.48267918825149536
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0006892308041634639,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6969971.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004312851815484464,
+ "skip_count": 0.0,
+ "step": 4322,
+ "text_loss": 0.3662732243537903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 20.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0006889442741777822,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6973114.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004588035400956869,
+ "skip_count": 3.0,
+ "step": 4324,
+ "text_loss": 0.6707104444503784
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.309950102729672,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0006886576717882982,
+ "loss": 0.0057,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 6976013.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0687296912074089,
+ "skip_count": 3.0,
+ "step": 4326,
+ "text_loss": 0.1662217676639557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0006883709971048384,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6979200.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002950174268335104,
+ "skip_count": 0.0,
+ "step": 4328,
+ "text_loss": 0.21168152987957
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0006880842502372572,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6982640.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032158740796148777,
+ "skip_count": 0.0,
+ "step": 4330,
+ "text_loss": 0.26790961623191833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0006877974312954365,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6985917.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005083635332994163,
+ "skip_count": 0.0,
+ "step": 4332,
+ "text_loss": 0.9736502170562744
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.347519812151454,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.000687510540389286,
+ "loss": 0.0053,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 6988388.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03473830223083496,
+ "skip_count": 2.0,
+ "step": 4334,
+ "text_loss": 0.21662230789661407
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0006872235776287425,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6991360.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002206524135544896,
+ "skip_count": 0.0,
+ "step": 4336,
+ "text_loss": 0.6026972532272339
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 20.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0006869365431237711,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6995080.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.000969731598161161,
+ "skip_count": 0.0,
+ "step": 4338,
+ "text_loss": 0.5833017230033875
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.375697094217788,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0006866494369843635,
+ "loss": 0.0054,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 6998526.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.013962293043732643,
+ "skip_count": 2.0,
+ "step": 4340,
+ "text_loss": 0.41465985774993896
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 20.38508952157323,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.0006863622593205397,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7001494.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0064964210614562035,
+ "skip_count": 3.0,
+ "step": 4342,
+ "text_loss": 0.3774271011352539
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 20.394481948928675,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0006860750102423464,
+ "loss": 0.0062,
+ "macro_f1": 0.6589147448539734,
+ "num_tokens": 7005544.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023250726982951164,
+ "skip_count": 6.0,
+ "step": 4344,
+ "text_loss": 0.2732464373111725
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.403874376284122,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.0006857876898598582,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7008847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038170060142874718,
+ "skip_count": 2.0,
+ "step": 4346,
+ "text_loss": 0.29610875248908997
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0006855002982831769,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7012577.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012856025714427233,
+ "skip_count": 0.0,
+ "step": 4348,
+ "text_loss": 0.6098502278327942
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061767578125,
+ "learning_rate": 0.0006852128356224314,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7015650.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008162742480635643,
+ "skip_count": 1.0,
+ "step": 4350,
+ "text_loss": 0.20868146419525146
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.432051658350456,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.0006849253019877778,
+ "loss": 0.0074,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 7019925.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.023544032126665115,
+ "skip_count": 3.0,
+ "step": 4352,
+ "text_loss": 0.628226101398468
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0006846376974893996,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7023130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004982319660484791,
+ "skip_count": 2.0,
+ "step": 4354,
+ "text_loss": 0.7037544250488281
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 20.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0006843500222375074,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7026422.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004015266429632902,
+ "skip_count": 0.0,
+ "step": 4356,
+ "text_loss": 0.22352729737758636
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 27.0,
+ "epoch": 20.46022894041679,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0006840622763423391,
+ "loss": 0.0071,
+ "macro_f1": 0.9449735879898071,
+ "num_tokens": 7029077.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.021162014454603195,
+ "skip_count": 4.0,
+ "step": 4358,
+ "text_loss": 0.2431403249502182
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0006837744599141591,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7032582.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007044129306450486,
+ "skip_count": 0.0,
+ "step": 4360,
+ "text_loss": 0.26667487621307373
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0006834865730632594,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7035642.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0067853196524083614,
+ "skip_count": 1.0,
+ "step": 4362,
+ "text_loss": 0.20965275168418884
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.488406222483125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0006831986158999588,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7038601.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00899333506822586,
+ "skip_count": 2.0,
+ "step": 4364,
+ "text_loss": 0.26860126852989197
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.000682910588534603,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7042274.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019194348715245724,
+ "skip_count": 0.0,
+ "step": 4366,
+ "text_loss": 0.14046810567378998
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0006826224910775647,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 7045268.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006915684789419174,
+ "skip_count": 3.0,
+ "step": 4368,
+ "text_loss": 0.5900366306304932
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0006823343236392432,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7049407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001678116386756301,
+ "skip_count": 0.0,
+ "step": 4370,
+ "text_loss": 0.7868026494979858
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.000682046086330065,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7052783.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003459530707914382,
+ "skip_count": 0.0,
+ "step": 4372,
+ "text_loss": 0.6349637508392334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.0006817577792604831,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7055757.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011729507241398096,
+ "skip_count": 0.0,
+ "step": 4374,
+ "text_loss": 0.43258991837501526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0006814694025409773,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7058684.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006664610700681806,
+ "skip_count": 0.0,
+ "step": 4376,
+ "text_loss": 0.5307940244674683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.091796875,
+ "learning_rate": 0.0006811809562820542,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7061902.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004595907870680094,
+ "skip_count": 2.0,
+ "step": 4378,
+ "text_loss": 0.5830042362213135
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0006808924405942467,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7065100.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032026609405875206,
+ "skip_count": 0.0,
+ "step": 4380,
+ "text_loss": 0.20797798037528992
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 20.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 0.0006806038555881148,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7068556.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024626904632896185,
+ "skip_count": 0.0,
+ "step": 4382,
+ "text_loss": 0.5791074633598328
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0006803152013742448,
+ "loss": 0.0075,
+ "macro_f1": 1.0,
+ "num_tokens": 7071284.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010723610408604145,
+ "skip_count": 2.0,
+ "step": 4384,
+ "text_loss": 0.13227243721485138
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 20.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0006800264780632495,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7074428.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0011231007520109415,
+ "skip_count": 0.0,
+ "step": 4386,
+ "text_loss": 0.4360627233982086
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 20.601115350748458,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.0006797376857657681,
+ "loss": 0.0081,
+ "macro_f1": 1.0,
+ "num_tokens": 7078313.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.008419238030910492,
+ "skip_count": 1.0,
+ "step": 4388,
+ "text_loss": 0.5183924436569214
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.610507778103905,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.0006794488245924664,
+ "loss": 0.0084,
+ "macro_f1": 1.0,
+ "num_tokens": 7081258.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006582668516784906,
+ "skip_count": 3.0,
+ "step": 4390,
+ "text_loss": 0.2797473669052124
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.61990020545935,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 0.0006791598946540368,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7084527.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00557357631623745,
+ "skip_count": 2.0,
+ "step": 4392,
+ "text_loss": 0.39495575428009033
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.629292632814792,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06005859375,
+ "learning_rate": 0.0006788708960611975,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7087675.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007155992556363344,
+ "skip_count": 0.0,
+ "step": 4394,
+ "text_loss": 0.3785299062728882
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01806640625,
+ "learning_rate": 0.0006785818289246934,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7090171.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009265039698220789,
+ "skip_count": 0.0,
+ "step": 4396,
+ "text_loss": 0.42634522914886475
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 20.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.0006782926933552955,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 7092529.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008679097518324852,
+ "skip_count": 7.0,
+ "step": 4398,
+ "text_loss": 0.4283660054206848
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0006780034894638014,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7095141.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002363949315622449,
+ "skip_count": 0.0,
+ "step": 4400,
+ "text_loss": 0.481539249420166
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 20.666862342236573,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.000677714217361034,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7098208.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004005146212875843,
+ "skip_count": 3.0,
+ "step": 4402,
+ "text_loss": 0.6443291902542114
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0006774248771578435,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7101681.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026864963583648205,
+ "skip_count": 0.0,
+ "step": 4404,
+ "text_loss": 0.16315312683582306
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 27.0,
+ "epoch": 20.68564719694746,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0006771354689651054,
+ "loss": 0.005,
+ "macro_f1": 0.9449735879898071,
+ "num_tokens": 7104719.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.02719845622777939,
+ "skip_count": 4.0,
+ "step": 4406,
+ "text_loss": 0.37855592370033264
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.0006768459928937213,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7108697.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010488593950867653,
+ "skip_count": 0.0,
+ "step": 4408,
+ "text_loss": 0.23133711516857147
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 20.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0006765564490546193,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7111426.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013637891970574856,
+ "skip_count": 0.0,
+ "step": 4410,
+ "text_loss": 0.41399383544921875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0732421875,
+ "learning_rate": 0.0006762668375587528,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7114241.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000900395680218935,
+ "skip_count": 0.0,
+ "step": 4412,
+ "text_loss": 0.6460412740707397
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.0006759771585171016,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7117031.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024001260753721,
+ "skip_count": 0.0,
+ "step": 4414,
+ "text_loss": 0.7645824551582336
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0006756874120406714,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 7120766.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.005034091416746378,
+ "skip_count": 4.0,
+ "step": 4416,
+ "text_loss": 0.31753066182136536
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0006753975982404934,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7125243.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002483269665390253,
+ "skip_count": 0.0,
+ "step": 4418,
+ "text_loss": 0.5304268002510071
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.751394188435572,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0006751077172276249,
+ "loss": 0.0052,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7127795.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02676006779074669,
+ "skip_count": 1.0,
+ "step": 4420,
+ "text_loss": 0.22011354565620422
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 0.000674817769113149,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7130837.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003267093561589718,
+ "skip_count": 2.0,
+ "step": 4422,
+ "text_loss": 0.2906076908111572
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 20.770179043146463,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.000674527754008174,
+ "loss": 0.0045,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 7135090.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022510390728712082,
+ "skip_count": 3.0,
+ "step": 4424,
+ "text_loss": 0.2544902563095093
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0006742376720238345,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7138751.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011178571730852127,
+ "skip_count": 0.0,
+ "step": 4426,
+ "text_loss": 0.6811438798904419
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 20.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0006739475232712904,
+ "loss": 0.0036,
+ "macro_f1": 1.0,
+ "num_tokens": 7141762.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005595206283032894,
+ "skip_count": 1.0,
+ "step": 4428,
+ "text_loss": 0.38743990659713745
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0006736573078617272,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7145235.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002793942578136921,
+ "skip_count": 2.0,
+ "step": 4430,
+ "text_loss": 0.21894219517707825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 20.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0006733670259063561,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7149042.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006146818865090609,
+ "skip_count": 3.0,
+ "step": 4432,
+ "text_loss": 0.17822015285491943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 20.817141179923688,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0006730766775164136,
+ "loss": 0.0061,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 7152166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026045087724924088,
+ "skip_count": 2.0,
+ "step": 4434,
+ "text_loss": 0.2910420000553131
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 20.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0006727862628031618,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7155506.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0022973387967795134,
+ "skip_count": 0.0,
+ "step": 4436,
+ "text_loss": 0.3502544164657593
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.0006724957818778882,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7158739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002357073128223419,
+ "skip_count": 1.0,
+ "step": 4438,
+ "text_loss": 0.26200664043426514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0006722052348519054,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7161776.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005521026905626059,
+ "skip_count": 0.0,
+ "step": 4440,
+ "text_loss": 0.3922915458679199
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.000671914621836552,
+ "loss": 0.0106,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7164763.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007691344246268272,
+ "skip_count": 2.0,
+ "step": 4442,
+ "text_loss": 0.6021351218223572
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.000671623942943191,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7167924.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032181134447455406,
+ "skip_count": 0.0,
+ "step": 4444,
+ "text_loss": 0.23639555275440216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.873495744056356,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.0006713331982832113,
+ "loss": 0.0071,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7170743.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.024979131296277046,
+ "skip_count": 0.0,
+ "step": 4446,
+ "text_loss": 0.4957772493362427
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0006710423879680271,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7174660.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002571308286860585,
+ "skip_count": 0.0,
+ "step": 4448,
+ "text_loss": 0.47968071699142456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.000670751512109077,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7177965.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00212799571454525,
+ "skip_count": 0.0,
+ "step": 4450,
+ "text_loss": 0.6550716161727905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0006704605708178252,
+ "loss": 0.0107,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7181512.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004176430404186249,
+ "skip_count": 1.0,
+ "step": 4452,
+ "text_loss": 0.36959558725357056
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0006701695642057613,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7184555.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010968588758260012,
+ "skip_count": 0.0,
+ "step": 4454,
+ "text_loss": 0.6686749458312988
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0006698784923843993,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7187474.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014241471653804183,
+ "skip_count": 0.0,
+ "step": 4456,
+ "text_loss": 0.6147221922874451
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0006695873554652784,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7190649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008801907300949097,
+ "skip_count": 0.0,
+ "step": 4458,
+ "text_loss": 0.26381927728652954
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0006692961535599634,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7193961.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009027508087456226,
+ "skip_count": 1.0,
+ "step": 4460,
+ "text_loss": 0.1926470547914505
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0006690048867800427,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7197456.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022697453387081623,
+ "skip_count": 0.0,
+ "step": 4462,
+ "text_loss": 0.6736721992492676
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0006687135552371305,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7200290.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006747903767973185,
+ "skip_count": 1.0,
+ "step": 4464,
+ "text_loss": 0.2026437371969223
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.967420017610802,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0006684221590428657,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7203320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011565096210688353,
+ "skip_count": 0.0,
+ "step": 4466,
+ "text_loss": 0.7587730288505554
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.976812444966246,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0006681306983089121,
+ "loss": 0.0083,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 7206411.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.023645581677556038,
+ "skip_count": 2.0,
+ "step": 4468,
+ "text_loss": 0.8981561660766602
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0006678391731469575,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7209421.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035848666448146105,
+ "skip_count": 0.0,
+ "step": 4470,
+ "text_loss": 0.1522839516401291
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 20.995597299677137,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0006675475836687152,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 7212267.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005046425387263298,
+ "skip_count": 1.0,
+ "step": 4472,
+ "text_loss": 0.46007999777793884
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0006672559299859228,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7215195.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019333874806761742,
+ "skip_count": 0.0,
+ "step": 4474,
+ "text_loss": 1.0859547853469849
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.014088641033165,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0006669642122103423,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7217941.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005401032394729555,
+ "skip_count": 0.0,
+ "step": 4476,
+ "text_loss": 0.9754356145858765
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.023481068388612,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.0006666724304537611,
+ "loss": 0.0053,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7222494.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015569722279906273,
+ "skip_count": 0.0,
+ "step": 4478,
+ "text_loss": 0.2896423637866974
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0006663805848279898,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7225292.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020135147497057915,
+ "skip_count": 0.0,
+ "step": 4480,
+ "text_loss": 0.8492724299430847
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 21.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.0006660886754448648,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7229184.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002355351345613599,
+ "skip_count": 0.0,
+ "step": 4482,
+ "text_loss": 0.189764603972435
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.051658350454947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.0006657967024162459,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7232906.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003044391982257366,
+ "skip_count": 0.0,
+ "step": 4484,
+ "text_loss": 0.4239847660064697
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0006655046658540179,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7235996.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00602696230635047,
+ "skip_count": 2.0,
+ "step": 4486,
+ "text_loss": 0.217103973031044
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.070443205165834,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0169677734375,
+ "learning_rate": 0.0006652125658700896,
+ "loss": 0.0031,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7238882.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001470155781134963,
+ "skip_count": 1.0,
+ "step": 4488,
+ "text_loss": 0.6090770363807678
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0006649204025763945,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 7241815.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008737480267882347,
+ "skip_count": 2.0,
+ "step": 4490,
+ "text_loss": 0.48314425349235535
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0177001953125,
+ "learning_rate": 0.0006646281760848902,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7244848.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008257135050371289,
+ "skip_count": 0.0,
+ "step": 4492,
+ "text_loss": 0.5884748101234436
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.098620487232168,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0006643358865075581,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7247930.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016262239078059793,
+ "skip_count": 0.0,
+ "step": 4494,
+ "text_loss": 0.21444730460643768
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.108012914587615,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0006640435339564042,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7251776.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001315156347118318,
+ "skip_count": 0.0,
+ "step": 4496,
+ "text_loss": 0.6890370845794678
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.11740534194306,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0006637511185434588,
+ "loss": 0.0091,
+ "macro_f1": 1.0,
+ "num_tokens": 7255070.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007614497095346451,
+ "skip_count": 3.0,
+ "step": 4498,
+ "text_loss": 0.516417920589447
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 21.126797769298502,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0006634586403807758,
+ "loss": 0.0041,
+ "macro_f1": 1.0,
+ "num_tokens": 7258115.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.004906686954200268,
+ "skip_count": 2.0,
+ "step": 4500,
+ "text_loss": 0.577463686466217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.13619019665395,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.0006631660995804334,
+ "loss": 0.0067,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 7260769.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013337121345102787,
+ "skip_count": 2.0,
+ "step": 4502,
+ "text_loss": 0.37124839425086975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0006628734962545339,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7263908.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023418180644512177,
+ "skip_count": 0.0,
+ "step": 4504,
+ "text_loss": 0.17937727272510529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0006625808305152033,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7267391.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006556165171787143,
+ "skip_count": 0.0,
+ "step": 4506,
+ "text_loss": 0.45344987511634827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0006622881024745919,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7271402.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021988123189657927,
+ "skip_count": 0.0,
+ "step": 4508,
+ "text_loss": 0.5842905640602112
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0006619953122448734,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7274354.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00774174090474844,
+ "skip_count": 2.0,
+ "step": 4510,
+ "text_loss": 0.27159228920936584
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0006617024599382456,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7277378.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006942499312572181,
+ "skip_count": 0.0,
+ "step": 4512,
+ "text_loss": 0.4464176297187805
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.192544760786618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0006614095456669302,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7280526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003003394464030862,
+ "skip_count": 0.0,
+ "step": 4514,
+ "text_loss": 0.31188079714775085
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0006611165695431725,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7283916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006948060472495854,
+ "skip_count": 0.0,
+ "step": 4516,
+ "text_loss": 0.5266574025154114
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0006608235316792413,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7286843.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014080886030569673,
+ "skip_count": 0.0,
+ "step": 4518,
+ "text_loss": 0.5880120396614075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.220722042852948,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0006605304321874295,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7289940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016894340515136719,
+ "skip_count": 0.0,
+ "step": 4520,
+ "text_loss": 0.6623797416687012
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0006602372711800531,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7292869.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003522444050759077,
+ "skip_count": 0.0,
+ "step": 4522,
+ "text_loss": 0.5488807559013367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0006599440487694521,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7296618.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011981099378317595,
+ "skip_count": 0.0,
+ "step": 4524,
+ "text_loss": 0.4128517210483551
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 21.248899324919282,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00065965076506799,
+ "loss": 0.0047,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 7300481.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.010548194870352745,
+ "skip_count": 2.0,
+ "step": 4526,
+ "text_loss": 0.26450902223587036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0006593574201880536,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7303272.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005642973352223635,
+ "skip_count": 1.0,
+ "step": 4528,
+ "text_loss": 0.35269856452941895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.000659064014242053,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7306615.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004171932581812143,
+ "skip_count": 1.0,
+ "step": 4530,
+ "text_loss": 0.18814080953598022
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0006587705473424223,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7310368.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002289367141202092,
+ "skip_count": 2.0,
+ "step": 4532,
+ "text_loss": 0.7363705635070801
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.286469034341064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.000658477019601618,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7313788.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004440625663846731,
+ "skip_count": 1.0,
+ "step": 4534,
+ "text_loss": 0.8126176595687866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0006581834311321211,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7317864.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013160990783944726,
+ "skip_count": 2.0,
+ "step": 4536,
+ "text_loss": 0.7015916109085083
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 0.000657889782046435,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7320693.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032275544945150614,
+ "skip_count": 2.0,
+ "step": 4538,
+ "text_loss": 0.6481677293777466
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.314646316407398,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.0006575960724570865,
+ "loss": 0.0054,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7324335.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009769129566848278,
+ "skip_count": 1.0,
+ "step": 4540,
+ "text_loss": 0.22194676101207733
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0006573023024766258,
+ "loss": 0.0061,
+ "macro_f1": 1.0,
+ "num_tokens": 7327431.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0036973082460463047,
+ "skip_count": 4.0,
+ "step": 4542,
+ "text_loss": 0.475127637386322
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.000657008472217626,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7330262.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007046440150588751,
+ "skip_count": 0.0,
+ "step": 4544,
+ "text_loss": 0.2649917006492615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0006567145817926836,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7333110.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026714997366070747,
+ "skip_count": 0.0,
+ "step": 4546,
+ "text_loss": 0.5490524768829346
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0006564206313144175,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7336101.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006552211008965969,
+ "skip_count": 0.0,
+ "step": 4548,
+ "text_loss": 0.14098678529262543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0006561266208954707,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7339435.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035560601390898228,
+ "skip_count": 2.0,
+ "step": 4550,
+ "text_loss": 0.20412275195121765
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0006558325506485081,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7342609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020106974989175797,
+ "skip_count": 1.0,
+ "step": 4552,
+ "text_loss": 0.6184256076812744
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.0006555384206862183,
+ "loss": 0.009,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7345614.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014235252747312188,
+ "skip_count": 0.0,
+ "step": 4554,
+ "text_loss": 1.0108838081359863
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.389785735250953,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0006552442311213121,
+ "loss": 0.0041,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7348957.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01703745685517788,
+ "skip_count": 0.0,
+ "step": 4556,
+ "text_loss": 0.21315747499465942
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 21.399178162606397,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0006549499820665237,
+ "loss": 0.0077,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 7352724.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013315381482243538,
+ "skip_count": 3.0,
+ "step": 4558,
+ "text_loss": 0.34369465708732605
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.00065465567363461,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7356592.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017354936571791768,
+ "skip_count": 0.0,
+ "step": 4560,
+ "text_loss": 0.6267461180686951
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0006543613059383503,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7359774.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011646085418760777,
+ "skip_count": 2.0,
+ "step": 4562,
+ "text_loss": 0.4400193989276886
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0006540668790905471,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7362765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019345436012372375,
+ "skip_count": 0.0,
+ "step": 4564,
+ "text_loss": 0.49204275012016296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0006537723932040251,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7366337.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00562885170802474,
+ "skip_count": 1.0,
+ "step": 4566,
+ "text_loss": 0.22566382586956024
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 21.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0006534778483916319,
+ "loss": 0.0084,
+ "macro_f1": 1.0,
+ "num_tokens": 7369851.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005508176051080227,
+ "skip_count": 2.0,
+ "step": 4568,
+ "text_loss": 0.8057850003242493
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0006531832447662377,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7373918.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006460923235863447,
+ "skip_count": 2.0,
+ "step": 4570,
+ "text_loss": 0.5141497254371643
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.464925154094512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0006528885824407351,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7376674.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032120654359459877,
+ "skip_count": 0.0,
+ "step": 4572,
+ "text_loss": 0.1281338930130005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 0.0006525938615280394,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7379791.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00443810923025012,
+ "skip_count": 0.0,
+ "step": 4574,
+ "text_loss": 0.268352210521698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.000652299082141088,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7382886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008284369483590126,
+ "skip_count": 2.0,
+ "step": 4576,
+ "text_loss": 0.30193832516670227
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 21.493102436160846,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0006520042443928411,
+ "loss": 0.0068,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 7386036.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03383317217230797,
+ "skip_count": 1.0,
+ "step": 4578,
+ "text_loss": 0.23106542229652405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.000651709348396281,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7388908.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017075951909646392,
+ "skip_count": 1.0,
+ "step": 4580,
+ "text_loss": 0.386099249124527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0006514143942644124,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7392004.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009516917169094086,
+ "skip_count": 1.0,
+ "step": 4582,
+ "text_loss": 0.3162059485912323
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0006511193821102623,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7395538.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031392278615385294,
+ "skip_count": 0.0,
+ "step": 4584,
+ "text_loss": 0.5536221861839294
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0006508243120468799,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7398461.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014138511614874005,
+ "skip_count": 0.0,
+ "step": 4586,
+ "text_loss": 0.7934318780899048
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0006505291841873367,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7401611.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005265916115604341,
+ "skip_count": 0.0,
+ "step": 4588,
+ "text_loss": 0.4569905698299408
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.000650233998644726,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7404641.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024988956283777952,
+ "skip_count": 0.0,
+ "step": 4590,
+ "text_loss": 0.49998772144317627
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0006499387555321636,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7407574.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004110113717615604,
+ "skip_count": 1.0,
+ "step": 4592,
+ "text_loss": 0.5679413676261902
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0006496434549627874,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7410806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032845588866621256,
+ "skip_count": 0.0,
+ "step": 4594,
+ "text_loss": 0.35515281558036804
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.57763428235985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0006493480970497568,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7413402.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010577172972261906,
+ "skip_count": 1.0,
+ "step": 4596,
+ "text_loss": 0.26111698150634766
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0006490526819062537,
+ "loss": 0.0091,
+ "macro_f1": 1.0,
+ "num_tokens": 7417236.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002054794691503048,
+ "skip_count": 2.0,
+ "step": 4598,
+ "text_loss": 0.6480993628501892
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07958984375,
+ "learning_rate": 0.0006487572096454818,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7420278.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017989084590226412,
+ "skip_count": 0.0,
+ "step": 4600,
+ "text_loss": 0.4935401678085327
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0006484616803806665,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7423866.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006671485956758261,
+ "skip_count": 1.0,
+ "step": 4602,
+ "text_loss": 0.15030258893966675
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 21.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0006481660942250552,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7426884.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008334980346262455,
+ "skip_count": 3.0,
+ "step": 4604,
+ "text_loss": 0.29933279752731323
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 21.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0006478704512919173,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7431017.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011923984624445438,
+ "skip_count": 3.0,
+ "step": 4606,
+ "text_loss": 0.35141825675964355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 21.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.0006475747516945432,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7434406.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031092462595552206,
+ "skip_count": 3.0,
+ "step": 4608,
+ "text_loss": 0.21021464467048645
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 21.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.000647278995546246,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7437204.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0006713552866131067,
+ "skip_count": 0.0,
+ "step": 4610,
+ "text_loss": 0.4052635431289673
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0006469831829603598,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7439741.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022583482787013054,
+ "skip_count": 2.0,
+ "step": 4612,
+ "text_loss": 0.5443860292434692
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.662166128558848,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0006466873140502407,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7443619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004187075886875391,
+ "skip_count": 2.0,
+ "step": 4614,
+ "text_loss": 0.30709847807884216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.0006463913889292661,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7446696.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008314833045005798,
+ "skip_count": 0.0,
+ "step": 4616,
+ "text_loss": 0.22949637472629547
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0006460954077108353,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7450377.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001277514616958797,
+ "skip_count": 0.0,
+ "step": 4618,
+ "text_loss": 0.37715134024620056
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0006457993705083684,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7453271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022756033577024937,
+ "skip_count": 2.0,
+ "step": 4620,
+ "text_loss": 0.7373883128166199
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.0006455032774353078,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7456492.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0039057908579707146,
+ "skip_count": 2.0,
+ "step": 4622,
+ "text_loss": 0.5058769583702087
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 0.0006452071286051169,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7459619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019458672031760216,
+ "skip_count": 0.0,
+ "step": 4624,
+ "text_loss": 0.5110082030296326
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0006449109241312802,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7462552.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002716891176532954,
+ "skip_count": 1.0,
+ "step": 4626,
+ "text_loss": 0.6197522878646851
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0006446146641273042,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7466769.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037578947376459837,
+ "skip_count": 2.0,
+ "step": 4628,
+ "text_loss": 0.1653924286365509
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.000644318348706716,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7470216.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012791058979928493,
+ "skip_count": 0.0,
+ "step": 4630,
+ "text_loss": 0.7114694118499756
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0006440219779830643,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7472975.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00736592011526227,
+ "skip_count": 2.0,
+ "step": 4632,
+ "text_loss": 0.26601463556289673
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.000643725552069919,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7475672.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00045455715735442936,
+ "skip_count": 0.0,
+ "step": 4634,
+ "text_loss": 0.5028402805328369
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.0006434290710808711,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7478850.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004247233271598816,
+ "skip_count": 2.0,
+ "step": 4636,
+ "text_loss": 0.12746070325374603
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 21.774875256824185,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0006431325351295324,
+ "loss": 0.0083,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 7481747.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.047564394772052765,
+ "skip_count": 2.0,
+ "step": 4638,
+ "text_loss": 0.24056802690029144
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.784267684179632,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0006428359443295362,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7484885.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011175100225955248,
+ "skip_count": 0.0,
+ "step": 4640,
+ "text_loss": 0.6265338063240051
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 21.793660111535075,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0006425392987945369,
+ "loss": 0.0086,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 7487973.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016879938542842865,
+ "skip_count": 2.0,
+ "step": 4642,
+ "text_loss": 0.2523447275161743
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 21.80305253889052,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0006422425986382093,
+ "loss": 0.0055,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 7491024.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018616504967212677,
+ "skip_count": 3.0,
+ "step": 4644,
+ "text_loss": 0.38890624046325684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.812444966245963,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0006419458439742496,
+ "loss": 0.0056,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7494199.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023129139095544815,
+ "skip_count": 1.0,
+ "step": 4646,
+ "text_loss": 0.4060848355293274
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0006416490349163747,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7497287.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018601802876219153,
+ "skip_count": 0.0,
+ "step": 4648,
+ "text_loss": 0.3387545943260193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0006413521715783225,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7500598.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017482215771451592,
+ "skip_count": 0.0,
+ "step": 4650,
+ "text_loss": 0.4290996193885803
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.840622248312297,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0006410552540738514,
+ "loss": 0.007,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7503252.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0420118011534214,
+ "skip_count": 0.0,
+ "step": 4652,
+ "text_loss": 0.439496248960495
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 21.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.000640758282516741,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 7506382.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017782216891646385,
+ "skip_count": 1.0,
+ "step": 4654,
+ "text_loss": 0.8513308167457581
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 21.859407103023187,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.0006404612570207911,
+ "loss": 0.0102,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7510423.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010385853238403797,
+ "skip_count": 0.0,
+ "step": 4656,
+ "text_loss": 0.7159742712974548
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0006401641776998223,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7513394.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011917101219296455,
+ "skip_count": 0.0,
+ "step": 4658,
+ "text_loss": 0.6165401339530945
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.878191957734078,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0006398670446676766,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 7516828.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.008860073052346706,
+ "skip_count": 4.0,
+ "step": 4660,
+ "text_loss": 0.923275887966156
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0006395698580382153,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7519764.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000505418807733804,
+ "skip_count": 0.0,
+ "step": 4662,
+ "text_loss": 0.6143050789833069
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.0006392726179253212,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7522390.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004020806401968002,
+ "skip_count": 1.0,
+ "step": 4664,
+ "text_loss": 0.6935067176818848
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.906369239800412,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.0006389753244428972,
+ "loss": 0.0079,
+ "macro_f1": 1.0,
+ "num_tokens": 7525821.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00957963801920414,
+ "skip_count": 2.0,
+ "step": 4666,
+ "text_loss": 0.3350338637828827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.915761667155856,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0006386779777048666,
+ "loss": 0.0063,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 7529513.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.020673364400863647,
+ "skip_count": 2.0,
+ "step": 4668,
+ "text_loss": 0.47800472378730774
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.0006383805778251735,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7533450.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007217096630483866,
+ "skip_count": 1.0,
+ "step": 4670,
+ "text_loss": 0.4506106972694397
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 21.934546521866746,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.0006380831249177817,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7536287.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007001714315265417,
+ "skip_count": 0.0,
+ "step": 4672,
+ "text_loss": 0.4081715941429138
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0006377856190966762,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7539442.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015112817054614425,
+ "skip_count": 0.0,
+ "step": 4674,
+ "text_loss": 0.21451139450073242
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0006374880604758615,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7542594.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007311929017305374,
+ "skip_count": 2.0,
+ "step": 4676,
+ "text_loss": 0.14785248041152954
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0006371904491693626,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7545780.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007489737123250961,
+ "skip_count": 1.0,
+ "step": 4678,
+ "text_loss": 0.2248108983039856
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 21.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0006368927852912247,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 7548287.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009772555902600288,
+ "skip_count": 1.0,
+ "step": 4680,
+ "text_loss": 0.1566995233297348
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0006365950689555133,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7551424.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002134992741048336,
+ "skip_count": 0.0,
+ "step": 4682,
+ "text_loss": 0.7322417497634888
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 21.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0006362973002763139,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 7554182.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008511497639119625,
+ "skip_count": 4.0,
+ "step": 4684,
+ "text_loss": 0.24387991428375244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.0006359994793677319,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7557044.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004151526838541031,
+ "skip_count": 2.0,
+ "step": 4686,
+ "text_loss": 0.6139411330223083
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0006357016063438928,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7560231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009724601986818016,
+ "skip_count": 0.0,
+ "step": 4688,
+ "text_loss": 0.7875718474388123
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.0006354036813189421,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7562953.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008926765876822174,
+ "skip_count": 0.0,
+ "step": 4690,
+ "text_loss": 0.5195512771606445
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0006351057044070455,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7566137.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031294538639485836,
+ "skip_count": 0.0,
+ "step": 4692,
+ "text_loss": 0.7288873195648193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.037569709421778,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0006348076757223877,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7569073.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015065820189192891,
+ "skip_count": 2.0,
+ "step": 4694,
+ "text_loss": 0.7242236137390137
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0006345095953791746,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7573025.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005603441968560219,
+ "skip_count": 0.0,
+ "step": 4696,
+ "text_loss": 0.34443899989128113
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.0006342114634916307,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7576546.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011047758162021637,
+ "skip_count": 0.0,
+ "step": 4698,
+ "text_loss": 0.4892682731151581
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.0006339132801740008,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7580711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019803126342594624,
+ "skip_count": 2.0,
+ "step": 4700,
+ "text_loss": 0.4479489028453827
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 22.07513941884356,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 0.0006336150455405494,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7583385.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0005326359532773495,
+ "skip_count": 0.0,
+ "step": 4702,
+ "text_loss": 0.627504825592041
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.0006333167597055604,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7586584.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005587987834587693,
+ "skip_count": 0.0,
+ "step": 4704,
+ "text_loss": 0.43891432881355286
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0006330184227833376,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7590408.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007053783163428307,
+ "skip_count": 2.0,
+ "step": 4706,
+ "text_loss": 0.19946859776973724
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 22.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0006327200348882043,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7593857.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0009479080326855183,
+ "skip_count": 0.0,
+ "step": 4708,
+ "text_loss": 0.7973214387893677
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1259765625,
+ "learning_rate": 0.0006324215961345032,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7596429.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012403312139213085,
+ "skip_count": 0.0,
+ "step": 4710,
+ "text_loss": 0.48477989435195923
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0006321231066365966,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7599618.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005520360427908599,
+ "skip_count": 0.0,
+ "step": 4712,
+ "text_loss": 0.44222453236579895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0006318245665088665,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7603180.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015553623670712113,
+ "skip_count": 0.0,
+ "step": 4714,
+ "text_loss": 0.5132410526275635
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0006315259758657138,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7606457.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004210884217172861,
+ "skip_count": 1.0,
+ "step": 4716,
+ "text_loss": 0.39850690960884094
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 22.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0006312273348215589,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7609317.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001220117206685245,
+ "skip_count": 0.0,
+ "step": 4718,
+ "text_loss": 0.3509018123149872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0006309286434908419,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7613076.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007768960203975439,
+ "skip_count": 2.0,
+ "step": 4720,
+ "text_loss": 0.33361560106277466
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0006306299019880217,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7616242.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006226699333637953,
+ "skip_count": 0.0,
+ "step": 4722,
+ "text_loss": 0.23661087453365326
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 22.17845611975345,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0006303311104275766,
+ "loss": 0.0073,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 7619069.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015590761788189411,
+ "skip_count": 1.0,
+ "step": 4724,
+ "text_loss": 0.23373056948184967
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.187848547108892,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0006300322689240041,
+ "loss": 0.0076,
+ "macro_f1": 1.0,
+ "num_tokens": 7622581.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006862971931695938,
+ "skip_count": 2.0,
+ "step": 4726,
+ "text_loss": 0.8301828503608704
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 22.19724097446434,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0006297333775918209,
+ "loss": 0.0086,
+ "macro_f1": 1.0,
+ "num_tokens": 7625566.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006256614346057177,
+ "skip_count": 1.0,
+ "step": 4728,
+ "text_loss": 0.3756707012653351
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.0006294344365455626,
+ "loss": 0.0079,
+ "macro_f1": 1.0,
+ "num_tokens": 7629047.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009151885285973549,
+ "skip_count": 2.0,
+ "step": 4730,
+ "text_loss": 0.33362850546836853
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.216025829175226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0006291354458997841,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7631847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009307434665970504,
+ "skip_count": 0.0,
+ "step": 4732,
+ "text_loss": 0.4572524130344391
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0006288364057690591,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7635181.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00041220212006010115,
+ "skip_count": 0.0,
+ "step": 4734,
+ "text_loss": 0.40211325883865356
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0006285373162679804,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7637752.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006696670898236334,
+ "skip_count": 2.0,
+ "step": 4736,
+ "text_loss": 0.7588053345680237
+ },
+ {
+ "acc_repeat": 0.75,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 22.24420311124156,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.8571428656578064,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0006282381775111597,
+ "loss": 0.0081,
+ "macro_f1": 0.9449735879898071,
+ "num_tokens": 7640719.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.016283133998513222,
+ "skip_count": 2.0,
+ "step": 4738,
+ "text_loss": 0.5697863101959229
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 22.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0006279389896132274,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7643524.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00763951288536191,
+ "skip_count": 3.0,
+ "step": 4740,
+ "text_loss": 0.548592209815979
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 22.26298796595245,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0006276397526888329,
+ "loss": 0.0094,
+ "macro_f1": 0.925203263759613,
+ "num_tokens": 7646919.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.038590483367443085,
+ "skip_count": 5.0,
+ "step": 4742,
+ "text_loss": 0.27226054668426514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0006273404668526443,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7650404.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012555639259517193,
+ "skip_count": 0.0,
+ "step": 4744,
+ "text_loss": 0.47892290353775024
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 22.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0006270411322193488,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7652942.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015356402145698667,
+ "skip_count": 0.0,
+ "step": 4746,
+ "text_loss": 0.5515767931938171
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0006267417489036517,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7656269.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005182140972465277,
+ "skip_count": 0.0,
+ "step": 4748,
+ "text_loss": 0.3496028184890747
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0006264423170202773,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7658664.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004144361708313227,
+ "skip_count": 0.0,
+ "step": 4750,
+ "text_loss": 0.2786032557487488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.309950102729672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0006261428366839685,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7661471.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00035335420398041606,
+ "skip_count": 0.0,
+ "step": 4752,
+ "text_loss": 0.4838487505912781
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0006258433080094868,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7664593.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0103341368958354,
+ "skip_count": 2.0,
+ "step": 4754,
+ "text_loss": 0.24325360357761383
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0006255437311116119,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7667573.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014633853919804096,
+ "skip_count": 2.0,
+ "step": 4756,
+ "text_loss": 0.21569855511188507
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.0006252441061051426,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7671171.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004900569561868906,
+ "skip_count": 0.0,
+ "step": 4758,
+ "text_loss": 0.12832018733024597
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0006249444331048955,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7673932.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020371589343994856,
+ "skip_count": 0.0,
+ "step": 4760,
+ "text_loss": 0.38652482628822327
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.000624644712225706,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7677396.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028059002943336964,
+ "skip_count": 2.0,
+ "step": 4762,
+ "text_loss": 0.7937633395195007
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.0006243449435824276,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7680392.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007225095760077238,
+ "skip_count": 0.0,
+ "step": 4764,
+ "text_loss": 0.5690395832061768
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.375697094217788,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0006240451272899321,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7684121.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002052050782367587,
+ "skip_count": 1.0,
+ "step": 4766,
+ "text_loss": 0.5321336984634399
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 22.38508952157323,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0006237452634631099,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7687236.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0039039517287164927,
+ "skip_count": 0.0,
+ "step": 4768,
+ "text_loss": 0.30823320150375366
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 22.394481948928675,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0006234453522168694,
+ "loss": 0.0084,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 7690355.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014570238068699837,
+ "skip_count": 2.0,
+ "step": 4770,
+ "text_loss": 0.21501587331295013
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 25.0,
+ "epoch": 22.403874376284122,
+ "f1_execute": 0.949999988079071,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.000623145393666137,
+ "loss": 0.0069,
+ "macro_f1": 0.886363685131073,
+ "num_tokens": 7693559.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.061707716435194016,
+ "skip_count": 6.0,
+ "step": 4772,
+ "text_loss": 0.24371100962162018
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0006228453879258576,
+ "loss": 0.0037,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7696422.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005053870379924774,
+ "skip_count": 2.0,
+ "step": 4774,
+ "text_loss": 0.237778440117836
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060302734375,
+ "learning_rate": 0.0006225453351109934,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7700460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017990898340940475,
+ "skip_count": 0.0,
+ "step": 4776,
+ "text_loss": 0.612456738948822
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.432051658350456,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.000622245235336526,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7703330.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004507021512836218,
+ "skip_count": 2.0,
+ "step": 4778,
+ "text_loss": 0.36898812651634216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0006219450887174537,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7707243.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006295828148722649,
+ "skip_count": 1.0,
+ "step": 4780,
+ "text_loss": 0.14474599063396454
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0006216448953687932,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7711121.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005049831233918667,
+ "skip_count": 0.0,
+ "step": 4782,
+ "text_loss": 0.4696790277957916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0006213446554055795,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7714889.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006010758224874735,
+ "skip_count": 0.0,
+ "step": 4784,
+ "text_loss": 0.46253830194473267
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 22.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0006210443689428649,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 7718420.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.006691234186291695,
+ "skip_count": 1.0,
+ "step": 4786,
+ "text_loss": 0.579987645149231
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.00062074403609572,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7721720.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001864895923063159,
+ "skip_count": 0.0,
+ "step": 4788,
+ "text_loss": 0.325242817401886
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.488406222483125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0006204436569792324,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7724916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00202955212444067,
+ "skip_count": 0.0,
+ "step": 4790,
+ "text_loss": 0.49637556076049805
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 22.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0006201432317085083,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7728081.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037843603640794754,
+ "skip_count": 0.0,
+ "step": 4792,
+ "text_loss": 0.38812628388404846
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 22.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.0006198427603986711,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7731457.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012036679312586784,
+ "skip_count": 3.0,
+ "step": 4794,
+ "text_loss": 0.2996312379837036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0006195422431648623,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7734595.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008874868508428335,
+ "skip_count": 1.0,
+ "step": 4796,
+ "text_loss": 0.3203189969062805
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 22.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.0006192416801222403,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 7737565.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0032894534524530172,
+ "skip_count": 1.0,
+ "step": 4798,
+ "text_loss": 0.3283322751522064
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0006189410713859815,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7740439.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009667043574154377,
+ "skip_count": 2.0,
+ "step": 4800,
+ "text_loss": 0.25219282507896423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 22.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0006186404170712797,
+ "loss": 0.0093,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7743813.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012643060646951199,
+ "skip_count": 4.0,
+ "step": 4802,
+ "text_loss": 0.22567439079284668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0006183397172933462,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7747182.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002678517485037446,
+ "skip_count": 0.0,
+ "step": 4804,
+ "text_loss": 0.19188879430294037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0006180389721674101,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7750735.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013385121710598469,
+ "skip_count": 0.0,
+ "step": 4806,
+ "text_loss": 0.5860441327095032
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.000617738181808717,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7753843.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034869094379246235,
+ "skip_count": 1.0,
+ "step": 4808,
+ "text_loss": 0.4366260766983032
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 0.0006174373463325306,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7757039.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013648992171511054,
+ "skip_count": 0.0,
+ "step": 4810,
+ "text_loss": 0.5217258334159851
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0006171364658541314,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 7760016.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038017008919268847,
+ "skip_count": 2.0,
+ "step": 4812,
+ "text_loss": 0.8130963444709778
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.601115350748458,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0006168355404888177,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7762961.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006867518648505211,
+ "skip_count": 2.0,
+ "step": 4814,
+ "text_loss": 0.17822521924972534
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.610507778103905,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0006165345703519043,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7766399.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004653502255678177,
+ "skip_count": 0.0,
+ "step": 4816,
+ "text_loss": 0.5316070914268494
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 22.61990020545935,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0006162335555587238,
+ "loss": 0.008,
+ "macro_f1": 1.0,
+ "num_tokens": 7769039.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016906452365219593,
+ "skip_count": 1.0,
+ "step": 4818,
+ "text_loss": 0.5680997967720032
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.629292632814792,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0006159324962246257,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7772768.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002541248919442296,
+ "skip_count": 0.0,
+ "step": 4820,
+ "text_loss": 0.6169226169586182
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0006156313924649762,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7775545.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008644679561257362,
+ "skip_count": 2.0,
+ "step": 4822,
+ "text_loss": 0.2211475968360901
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0006153302443951589,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7778837.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041346061043441296,
+ "skip_count": 2.0,
+ "step": 4824,
+ "text_loss": 0.5369775891304016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.0006150290521305746,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7782309.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012756052892655134,
+ "skip_count": 0.0,
+ "step": 4826,
+ "text_loss": 0.5294989943504333
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.666862342236573,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0006147278157866403,
+ "loss": 0.0046,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7785565.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.029718991369009018,
+ "skip_count": 1.0,
+ "step": 4828,
+ "text_loss": 0.6920449733734131
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0006144265354787906,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7788218.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004829924553632736,
+ "skip_count": 0.0,
+ "step": 4830,
+ "text_loss": 0.17072243988513947
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0006141252113224767,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7790788.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00254037044942379,
+ "skip_count": 0.0,
+ "step": 4832,
+ "text_loss": 0.20075996220111847
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01519775390625,
+ "learning_rate": 0.0006138238434331666,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7793913.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004426188243087381,
+ "skip_count": 0.0,
+ "step": 4834,
+ "text_loss": 0.695742130279541
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 0.000613522431926345,
+ "loss": 0.0036,
+ "macro_f1": 1.0,
+ "num_tokens": 7796932.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005176798906177282,
+ "skip_count": 3.0,
+ "step": 4836,
+ "text_loss": 0.4910822808742523
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0006132209769175132,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7800686.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004120545461773872,
+ "skip_count": 0.0,
+ "step": 4838,
+ "text_loss": 0.3701378405094147
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0006129194785221894,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7804765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0043835826218128204,
+ "skip_count": 0.0,
+ "step": 4840,
+ "text_loss": 0.343635618686676
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0006126179368559086,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7807498.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001394893741235137,
+ "skip_count": 1.0,
+ "step": 4842,
+ "text_loss": 0.47756674885749817
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.000612316352034222,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7810784.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031262130942195654,
+ "skip_count": 2.0,
+ "step": 4844,
+ "text_loss": 0.13077901303768158
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 22.751394188435572,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0006120147241726972,
+ "loss": 0.0081,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 7814754.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.016139274463057518,
+ "skip_count": 1.0,
+ "step": 4846,
+ "text_loss": 0.18850074708461761
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0006117130533869189,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7818245.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009124451316893101,
+ "skip_count": 0.0,
+ "step": 4848,
+ "text_loss": 0.42503559589385986
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0006114113397924878,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7822214.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015132242115214467,
+ "skip_count": 0.0,
+ "step": 4850,
+ "text_loss": 0.16767354309558868
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 22.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0006111095835050212,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 7825019.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006253300234675407,
+ "skip_count": 2.0,
+ "step": 4852,
+ "text_loss": 0.44826745986938477
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0006108077846401524,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7828113.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024391328915953636,
+ "skip_count": 0.0,
+ "step": 4854,
+ "text_loss": 0.2009880244731903
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 22.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0006105059433135317,
+ "loss": 0.0078,
+ "macro_f1": 1.0,
+ "num_tokens": 7831177.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0020866121631115675,
+ "skip_count": 1.0,
+ "step": 4856,
+ "text_loss": 0.7082528471946716
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.0006102040596408251,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7834485.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004373365081846714,
+ "skip_count": 1.0,
+ "step": 4858,
+ "text_loss": 0.2541539669036865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.817141179923688,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0006099021337377148,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7837749.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004309024661779404,
+ "skip_count": 0.0,
+ "step": 4860,
+ "text_loss": 0.3163885176181793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 22.82653360727913,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0006096001657198995,
+ "loss": 0.0065,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 7840979.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023044804111123085,
+ "skip_count": 4.0,
+ "step": 4862,
+ "text_loss": 0.49609798192977905
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.0006092981557030941,
+ "loss": 0.0056,
+ "macro_f1": 1.0,
+ "num_tokens": 7844905.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010683654807507992,
+ "skip_count": 3.0,
+ "step": 4864,
+ "text_loss": 0.16866883635520935
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0006089961038030291,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7847800.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011224723421037197,
+ "skip_count": 0.0,
+ "step": 4866,
+ "text_loss": 0.5093055367469788
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0006086940101354515,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7850983.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003944621421396732,
+ "skip_count": 1.0,
+ "step": 4868,
+ "text_loss": 0.5753747224807739
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 22.86410331670091,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0006083918748161244,
+ "loss": 0.0069,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 7855041.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02532145567238331,
+ "skip_count": 2.0,
+ "step": 4870,
+ "text_loss": 0.8082366585731506
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0006080896979608262,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7858058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007558314246125519,
+ "skip_count": 0.0,
+ "step": 4872,
+ "text_loss": 0.6476574540138245
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.000607787479685352,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7861223.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009224560926668346,
+ "skip_count": 0.0,
+ "step": 4874,
+ "text_loss": 0.5012133717536926
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0006074852201055121,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7864180.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028308273758739233,
+ "skip_count": 0.0,
+ "step": 4876,
+ "text_loss": 0.7447214722633362
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052734375,
+ "learning_rate": 0.0006071829193371331,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7866726.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021505290642380714,
+ "skip_count": 0.0,
+ "step": 4878,
+ "text_loss": 0.5444929599761963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11376953125,
+ "learning_rate": 0.0006068805774960573,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7870166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021109723020344973,
+ "skip_count": 0.0,
+ "step": 4880,
+ "text_loss": 0.3577263355255127
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.0006065781946981425,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7873028.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027144821360707283,
+ "skip_count": 0.0,
+ "step": 4882,
+ "text_loss": 0.28464797139167786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0006062757710592624,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7876747.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004638207610696554,
+ "skip_count": 0.0,
+ "step": 4884,
+ "text_loss": 0.381534606218338
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0006059733066953066,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 7879524.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002225410658866167,
+ "skip_count": 2.0,
+ "step": 4886,
+ "text_loss": 0.5167883634567261
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0006056708017221796,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7882809.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00419368501752615,
+ "skip_count": 1.0,
+ "step": 4888,
+ "text_loss": 0.22688335180282593
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.000605368256255802,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7886310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017340193735435605,
+ "skip_count": 1.0,
+ "step": 4890,
+ "text_loss": 1.0128135681152344
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.967420017610802,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0712890625,
+ "learning_rate": 0.0006050656704121098,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7889483.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016647159354761243,
+ "skip_count": 0.0,
+ "step": 4892,
+ "text_loss": 0.2213262915611267
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 22.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0006047630443070547,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7892615.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038971947506070137,
+ "skip_count": 3.0,
+ "step": 4894,
+ "text_loss": 0.45751357078552246
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 22.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0006044603780566032,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 7895747.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0036852145567536354,
+ "skip_count": 1.0,
+ "step": 4896,
+ "text_loss": 0.13489919900894165
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.995597299677137,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0006041576717767379,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7899155.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007661987561732531,
+ "skip_count": 1.0,
+ "step": 4898,
+ "text_loss": 0.281853586435318
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 23.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0006038549255834563,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 7901667.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01836695335805416,
+ "skip_count": 5.0,
+ "step": 4900,
+ "text_loss": 0.24879895150661469
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.014088641033165,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.000603552139592771,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7904506.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011829182039946318,
+ "skip_count": 0.0,
+ "step": 4902,
+ "text_loss": 0.7550268769264221
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 23.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0006032493139207106,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7907316.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0022891140542924404,
+ "skip_count": 0.0,
+ "step": 4904,
+ "text_loss": 0.37596020102500916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0006029464486833186,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7911283.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001990227960050106,
+ "skip_count": 0.0,
+ "step": 4906,
+ "text_loss": 0.5879577994346619
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.0006026435439966531,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7913907.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026039890944957733,
+ "skip_count": 1.0,
+ "step": 4908,
+ "text_loss": 0.41484713554382324
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.051658350454947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0006023405999767879,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7916772.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009183229878544807,
+ "skip_count": 1.0,
+ "step": 4910,
+ "text_loss": 0.20732562243938446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0006020376167398116,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7919346.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005508727394044399,
+ "skip_count": 1.0,
+ "step": 4912,
+ "text_loss": 0.41416165232658386
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 23.070443205165834,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0006017345944018284,
+ "loss": 0.0051,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7922404.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008651934564113617,
+ "skip_count": 0.0,
+ "step": 4914,
+ "text_loss": 0.4290519952774048
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0006014315330789563,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7925165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003601635340601206,
+ "skip_count": 1.0,
+ "step": 4916,
+ "text_loss": 0.8447931408882141
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0006011284328873296,
+ "loss": 0.0041,
+ "macro_f1": 1.0,
+ "num_tokens": 7928146.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0049415635876357555,
+ "skip_count": 2.0,
+ "step": 4918,
+ "text_loss": 0.32237401604652405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.098620487232168,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.0006008252939430967,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7931163.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024150956887751818,
+ "skip_count": 0.0,
+ "step": 4920,
+ "text_loss": 0.2251713126897812
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.108012914587615,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0006005221163624209,
+ "loss": 0.0057,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7934084.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03181030973792076,
+ "skip_count": 0.0,
+ "step": 4922,
+ "text_loss": 0.4962928593158722
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.11740534194306,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.054931640625,
+ "learning_rate": 0.0006002189002614806,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7937021.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00227518193423748,
+ "skip_count": 2.0,
+ "step": 4924,
+ "text_loss": 0.34440335631370544
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.126797769298502,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0005999156457564685,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7940205.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004331593867391348,
+ "skip_count": 1.0,
+ "step": 4926,
+ "text_loss": 0.14114083349704742
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0005996123529635925,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7945174.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000612895586527884,
+ "skip_count": 0.0,
+ "step": 4928,
+ "text_loss": 0.3895469009876251
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.145582624009393,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.000599309021999075,
+ "loss": 0.006,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7948716.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02319233864545822,
+ "skip_count": 1.0,
+ "step": 4930,
+ "text_loss": 0.38103172183036804
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0005990056529791528,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7952497.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003423231653869152,
+ "skip_count": 0.0,
+ "step": 4932,
+ "text_loss": 0.30447322130203247
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017822265625,
+ "learning_rate": 0.0005987022460200778,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7955578.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007005351362749934,
+ "skip_count": 0.0,
+ "step": 4934,
+ "text_loss": 0.49621838331222534
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 23.173759906075727,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0005983988012381159,
+ "loss": 0.0061,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 7958741.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03962617367506027,
+ "skip_count": 1.0,
+ "step": 4936,
+ "text_loss": 0.1920493096113205
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 23.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 0.0005980953187495476,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7962236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026006060652434826,
+ "skip_count": 3.0,
+ "step": 4938,
+ "text_loss": 0.5286803841590881
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.192544760786618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0005977917986706681,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7965631.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005010952707380056,
+ "skip_count": 0.0,
+ "step": 4940,
+ "text_loss": 0.3507745563983917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.0005974882411177871,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7968516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023964287247508764,
+ "skip_count": 0.0,
+ "step": 4942,
+ "text_loss": 0.9110504388809204
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.000597184646207228,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7971310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026230409275740385,
+ "skip_count": 1.0,
+ "step": 4944,
+ "text_loss": 0.4131232798099518
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.220722042852948,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0005968810140553292,
+ "loss": 0.0102,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7974809.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007397596491500735,
+ "skip_count": 0.0,
+ "step": 4946,
+ "text_loss": 0.5130466222763062
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0005965773447784431,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7977800.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009955473942682147,
+ "skip_count": 0.0,
+ "step": 4948,
+ "text_loss": 0.5366153717041016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01373291015625,
+ "learning_rate": 0.0005962736384929362,
+ "loss": 0.0026,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7981027.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0049227322451770306,
+ "skip_count": 0.0,
+ "step": 4950,
+ "text_loss": 0.17266370356082916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.248899324919282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 0.0005959698953151895,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7983580.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009975163266062737,
+ "skip_count": 0.0,
+ "step": 4952,
+ "text_loss": 0.2474549114704132
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.0005956661153615979,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7986711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006475782720372081,
+ "skip_count": 0.0,
+ "step": 4954,
+ "text_loss": 0.5748327970504761
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0005953622987485703,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7990194.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001449751085601747,
+ "skip_count": 0.0,
+ "step": 4956,
+ "text_loss": 0.5163559317588806
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0005950584455925301,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7993050.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017087773885577917,
+ "skip_count": 0.0,
+ "step": 4958,
+ "text_loss": 0.15892620384693146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.286469034341064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0005947545560099142,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7996383.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0044417232275009155,
+ "skip_count": 0.0,
+ "step": 4960,
+ "text_loss": 0.48022928833961487
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 23.295861461696507,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0005944506301171734,
+ "loss": 0.0066,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 7999843.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010093312710523605,
+ "skip_count": 2.0,
+ "step": 4962,
+ "text_loss": 0.5050316452980042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0005941466680307732,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8003504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009699694812297821,
+ "skip_count": 0.0,
+ "step": 4964,
+ "text_loss": 0.30474427342414856
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 23.314646316407398,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0005938426698671922,
+ "loss": 0.0097,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8007427.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016759657301008701,
+ "skip_count": 0.0,
+ "step": 4966,
+ "text_loss": 0.25060293078422546
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0005935386357429232,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 8010265.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006916914135217667,
+ "skip_count": 3.0,
+ "step": 4968,
+ "text_loss": 0.49084481596946716
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 23.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0005932345657744723,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 8013733.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017182426527142525,
+ "skip_count": 5.0,
+ "step": 4970,
+ "text_loss": 0.2705717980861664
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00059293046007836,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8017068.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008485594764351845,
+ "skip_count": 2.0,
+ "step": 4972,
+ "text_loss": 0.18570218980312347
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0005926263187711201,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8020185.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021750847809016705,
+ "skip_count": 2.0,
+ "step": 4974,
+ "text_loss": 0.4457069933414459
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0005923221419693001,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8023038.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020193420350551605,
+ "skip_count": 0.0,
+ "step": 4976,
+ "text_loss": 0.7394505143165588
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.054931640625,
+ "learning_rate": 0.0005920179297894613,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8026236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001450369250960648,
+ "skip_count": 1.0,
+ "step": 4978,
+ "text_loss": 0.5914503335952759
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.000591713682348178,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8028765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017808573320508003,
+ "skip_count": 0.0,
+ "step": 4980,
+ "text_loss": 0.19231407344341278
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0005914093997620388,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8032043.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018225493840873241,
+ "skip_count": 0.0,
+ "step": 4982,
+ "text_loss": 0.3567875325679779
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.399178162606397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0005911050821476449,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8035086.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016285666497424245,
+ "skip_count": 0.0,
+ "step": 4984,
+ "text_loss": 0.34609633684158325
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.0005908007296216119,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8038193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014699801104143262,
+ "skip_count": 0.0,
+ "step": 4986,
+ "text_loss": 0.4492359757423401
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.000590496342300568,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8041099.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002442725468426943,
+ "skip_count": 0.0,
+ "step": 4988,
+ "text_loss": 0.5162975788116455
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0005901919203011548,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8044350.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008624207228422165,
+ "skip_count": 2.0,
+ "step": 4990,
+ "text_loss": 0.2533033490180969
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.0005898874637400279,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8047467.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015421364223584533,
+ "skip_count": 0.0,
+ "step": 4992,
+ "text_loss": 0.4890289306640625
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.0005895829727338552,
+ "loss": 0.0065,
+ "macro_f1": 1.0,
+ "num_tokens": 8050626.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024516626726835966,
+ "skip_count": 2.0,
+ "step": 4994,
+ "text_loss": 0.50797039270401
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0005892784473993184,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8053386.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018553845584392548,
+ "skip_count": 2.0,
+ "step": 4996,
+ "text_loss": 0.628828763961792
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.464925154094512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.000588973887853112,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8055941.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004258487373590469,
+ "skip_count": 0.0,
+ "step": 4998,
+ "text_loss": 0.2643229067325592
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.474317581449956,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0005886692942119441,
+ "loss": 0.0062,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 8058638.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.019064312800765038,
+ "skip_count": 2.0,
+ "step": 5000,
+ "text_loss": 0.4925006031990051
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0005883646665925353,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8062097.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007969749276526272,
+ "skip_count": 0.0,
+ "step": 5002,
+ "text_loss": 0.49412909150123596
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0005880600051116196,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8065202.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005813780706375837,
+ "skip_count": 2.0,
+ "step": 5004,
+ "text_loss": 0.5681346654891968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0005877553098859439,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8068574.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005012941546738148,
+ "skip_count": 0.0,
+ "step": 5006,
+ "text_loss": 0.2682424485683441
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 23.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0005874505810322678,
+ "loss": 0.0102,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8071834.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005859757773578167,
+ "skip_count": 3.0,
+ "step": 5008,
+ "text_loss": 0.6460036039352417
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.000587145818667364,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8074687.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002868571551516652,
+ "skip_count": 2.0,
+ "step": 5010,
+ "text_loss": 0.2405751347541809
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0005868410229080181,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8077617.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021759893279522657,
+ "skip_count": 1.0,
+ "step": 5012,
+ "text_loss": 0.7455595135688782
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0005865361938710286,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8080734.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008311949786730111,
+ "skip_count": 0.0,
+ "step": 5014,
+ "text_loss": 0.44876906275749207
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 25.0,
+ "epoch": 23.549457000293515,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0005862313316732063,
+ "loss": 0.0054,
+ "macro_f1": 0.9615669250488281,
+ "num_tokens": 8085092.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.012511664070189,
+ "skip_count": 6.0,
+ "step": 5016,
+ "text_loss": 0.26010942459106445
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.000585926436431375,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8088333.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035441694781184196,
+ "skip_count": 0.0,
+ "step": 5018,
+ "text_loss": 0.28225192427635193
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 23.568241855004402,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0005856215082623711,
+ "loss": 0.0093,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 8091298.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023543989285826683,
+ "skip_count": 2.0,
+ "step": 5020,
+ "text_loss": 0.5757577419281006
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.57763428235985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.0005853165472830439,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8094361.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003124240320175886,
+ "skip_count": 0.0,
+ "step": 5022,
+ "text_loss": 0.4021305739879608
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0005850115536102546,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8097514.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008170558139681816,
+ "skip_count": 1.0,
+ "step": 5024,
+ "text_loss": 0.18926584720611572
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 23.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0005847065273608777,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 8100525.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02127663604915142,
+ "skip_count": 5.0,
+ "step": 5026,
+ "text_loss": 0.18827557563781738
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.0005844014686517998,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8104016.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00272122910246253,
+ "skip_count": 0.0,
+ "step": 5028,
+ "text_loss": 0.15534701943397522
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 23.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0005840963775999199,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 8106697.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.008979840204119682,
+ "skip_count": 4.0,
+ "step": 5030,
+ "text_loss": 0.8123718500137329
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0005837912543221493,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8110986.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005006929859519005,
+ "skip_count": 0.0,
+ "step": 5032,
+ "text_loss": 0.26128846406936646
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.0005834860989354121,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8114010.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005531277856789529,
+ "skip_count": 0.0,
+ "step": 5034,
+ "text_loss": 0.5100266933441162
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.64338127384796,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0556640625,
+ "learning_rate": 0.0005831809115566442,
+ "loss": 0.0073,
+ "macro_f1": 0.6538461446762085,
+ "num_tokens": 8117168.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04978533461689949,
+ "skip_count": 1.0,
+ "step": 5036,
+ "text_loss": 0.41049885749816895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0005828756923027941,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8119900.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006322385743260384,
+ "skip_count": 0.0,
+ "step": 5038,
+ "text_loss": 0.5584380626678467
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.662166128558848,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0005825704412908225,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8123928.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001000594231300056,
+ "skip_count": 0.0,
+ "step": 5040,
+ "text_loss": 0.6460791230201721
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0005822651586377019,
+ "loss": 0.0108,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8127926.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011595834977924824,
+ "skip_count": 2.0,
+ "step": 5042,
+ "text_loss": 0.3131820261478424
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 23.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.0005819598444604173,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8131092.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004449303261935711,
+ "skip_count": 3.0,
+ "step": 5044,
+ "text_loss": 0.2774372696876526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0005816544988759658,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8134051.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007877505850046873,
+ "skip_count": 0.0,
+ "step": 5046,
+ "text_loss": 0.39496293663978577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.0005813491220013563,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8138725.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002868623472750187,
+ "skip_count": 0.0,
+ "step": 5048,
+ "text_loss": 0.3779948651790619
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0005810437139536098,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 8141913.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006244937423616648,
+ "skip_count": 4.0,
+ "step": 5050,
+ "text_loss": 0.4512978494167328
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0005807382748497592,
+ "loss": 0.0112,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8146193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011013929033651948,
+ "skip_count": 0.0,
+ "step": 5052,
+ "text_loss": 0.6194499731063843
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0005804328048068493,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8149701.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005505079869180918,
+ "skip_count": 1.0,
+ "step": 5054,
+ "text_loss": 0.2932305335998535
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 23.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0005801273039419368,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 8152861.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0057641929015517235,
+ "skip_count": 1.0,
+ "step": 5056,
+ "text_loss": 0.2631317973136902
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 23.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0005798217723720904,
+ "loss": 0.005,
+ "macro_f1": 1.0,
+ "num_tokens": 8155843.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0021671492140740156,
+ "skip_count": 5.0,
+ "step": 5058,
+ "text_loss": 0.2889988422393799
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0005795162102143902,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8158812.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004476628266274929,
+ "skip_count": 1.0,
+ "step": 5060,
+ "text_loss": 0.48028868436813354
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0005792106175859283,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 8162719.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038497636560350657,
+ "skip_count": 3.0,
+ "step": 5062,
+ "text_loss": 0.4559471607208252
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.0005789049946038083,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8165692.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004451582673937082,
+ "skip_count": 0.0,
+ "step": 5064,
+ "text_loss": 0.3782602548599243
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.784267684179632,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.0005785993413851456,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8168900.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002951978938654065,
+ "skip_count": 0.0,
+ "step": 5066,
+ "text_loss": 0.32392629981040955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.000578293658047067,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8171661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011171254329383373,
+ "skip_count": 2.0,
+ "step": 5068,
+ "text_loss": 0.24492619931697845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0005779879447067109,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8175075.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016067599644884467,
+ "skip_count": 0.0,
+ "step": 5070,
+ "text_loss": 0.7738823294639587
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.812444966245963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.000577682201481227,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8178515.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009113503620028496,
+ "skip_count": 1.0,
+ "step": 5072,
+ "text_loss": 0.2082248032093048
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 23.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0005773764284877774,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 8181790.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007332196459174156,
+ "skip_count": 1.0,
+ "step": 5074,
+ "text_loss": 0.4557662904262543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0537109375,
+ "learning_rate": 0.0005770706258435342,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8184854.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016252279747277498,
+ "skip_count": 0.0,
+ "step": 5076,
+ "text_loss": 0.2888098657131195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.840622248312297,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0005767647936656818,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8187860.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003406575648114085,
+ "skip_count": 0.0,
+ "step": 5078,
+ "text_loss": 0.6533790230751038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0005764589320714158,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8191683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006520140450447798,
+ "skip_count": 0.0,
+ "step": 5080,
+ "text_loss": 0.6903796195983887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0005761530411779426,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8195109.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01188349537551403,
+ "skip_count": 1.0,
+ "step": 5082,
+ "text_loss": 0.20460398495197296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 23.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.083984375,
+ "learning_rate": 0.0005758471211024804,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8198340.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004826809279620647,
+ "skip_count": 3.0,
+ "step": 5084,
+ "text_loss": 0.2203969657421112
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.878191957734078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 0.0005755411719622584,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8200882.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019170823507010937,
+ "skip_count": 0.0,
+ "step": 5086,
+ "text_loss": 0.6744595170021057
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.0005752351938745167,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8203777.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002110893838107586,
+ "skip_count": 1.0,
+ "step": 5088,
+ "text_loss": 0.4137859046459198
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.000574929186956507,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8207627.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018580821342766285,
+ "skip_count": 1.0,
+ "step": 5090,
+ "text_loss": 0.4830456078052521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.906369239800412,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0005746231513254912,
+ "loss": 0.0066,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 8210263.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0194723978638649,
+ "skip_count": 0.0,
+ "step": 5092,
+ "text_loss": 0.17383277416229248
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0005743170870987433,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8214166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006944256369024515,
+ "skip_count": 2.0,
+ "step": 5094,
+ "text_loss": 0.20003484189510345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0005740109943935472,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8217545.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002044794149696827,
+ "skip_count": 1.0,
+ "step": 5096,
+ "text_loss": 0.5117167830467224
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.934546521866746,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 0.0005737048733271986,
+ "loss": 0.0076,
+ "macro_f1": 1.0,
+ "num_tokens": 8220673.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009966124780476093,
+ "skip_count": 2.0,
+ "step": 5098,
+ "text_loss": 0.2705996036529541
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0005733987240170035,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8223796.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009675708715803921,
+ "skip_count": 0.0,
+ "step": 5100,
+ "text_loss": 0.7016357183456421
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.0005730925465802788,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8227048.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009548200177960098,
+ "skip_count": 0.0,
+ "step": 5102,
+ "text_loss": 0.30823078751564026
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0005727863411343526,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8229971.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005767418188042939,
+ "skip_count": 0.0,
+ "step": 5104,
+ "text_loss": 0.6897505521774292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 23.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0005724801077965629,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8232758.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009297889657318592,
+ "skip_count": 3.0,
+ "step": 5106,
+ "text_loss": 0.21293514966964722
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.981508658643968,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0005721738466842592,
+ "loss": 0.0079,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 8238154.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013964693062007427,
+ "skip_count": 0.0,
+ "step": 5108,
+ "text_loss": 0.7273620367050171
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 23.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10888671875,
+ "learning_rate": 0.0005718675579148014,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 8240818.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.007218098267912865,
+ "skip_count": 1.0,
+ "step": 5110,
+ "text_loss": 0.5607150793075562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.0005715612416055598,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8244048.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007558444049209356,
+ "skip_count": 2.0,
+ "step": 5112,
+ "text_loss": 0.23694385588169098
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 24.009392427355444,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0005712548978739154,
+ "loss": 0.0072,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 8247240.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015726923942565918,
+ "skip_count": 1.0,
+ "step": 5114,
+ "text_loss": 0.6032099723815918
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 24.01878485471089,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 0.0005709485268372598,
+ "loss": 0.0046,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 8250585.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.011148860678076744,
+ "skip_count": 2.0,
+ "step": 5116,
+ "text_loss": 0.6825997233390808
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0005706421286129948,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8254240.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006977916229516268,
+ "skip_count": 0.0,
+ "step": 5118,
+ "text_loss": 0.2532844543457031
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.037569709421778,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0005703357033185328,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8257133.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006415650714188814,
+ "skip_count": 2.0,
+ "step": 5120,
+ "text_loss": 0.6132124066352844
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 24.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.0005700292510712967,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 8261076.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0044475216418504715,
+ "skip_count": 1.0,
+ "step": 5122,
+ "text_loss": 0.4277699887752533
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.0005697227719887194,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8264607.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005743155721575022,
+ "skip_count": 2.0,
+ "step": 5124,
+ "text_loss": 0.2570968270301819
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0005694162661882444,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8267992.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007581565878354013,
+ "skip_count": 0.0,
+ "step": 5126,
+ "text_loss": 0.5850184559822083
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.07513941884356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0005691097337873252,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8271010.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036611228715628386,
+ "skip_count": 0.0,
+ "step": 5128,
+ "text_loss": 0.660999059677124
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0005688031749034258,
+ "loss": 0.0032,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8273638.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0039906189776957035,
+ "skip_count": 0.0,
+ "step": 5130,
+ "text_loss": 0.5839648246765137
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0005684965896540198,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 8276504.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007539632264524698,
+ "skip_count": 3.0,
+ "step": 5132,
+ "text_loss": 0.27675092220306396
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 24.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0005681899781565915,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8279977.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0026953567285090685,
+ "skip_count": 0.0,
+ "step": 5134,
+ "text_loss": 0.532974123954773
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.000567883340528635,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8282781.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005754240322858095,
+ "skip_count": 1.0,
+ "step": 5136,
+ "text_loss": 0.31100207567214966
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0005675766768876542,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8286533.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0051517849788069725,
+ "skip_count": 0.0,
+ "step": 5138,
+ "text_loss": 0.5734741687774658
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0005672699873511635,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8289858.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025852699764072895,
+ "skip_count": 2.0,
+ "step": 5140,
+ "text_loss": 0.37045374512672424
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0005669632720366868,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8293038.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038520018570125103,
+ "skip_count": 0.0,
+ "step": 5142,
+ "text_loss": 0.25952374935150146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0005666565310617577,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8295717.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00026914477348327637,
+ "skip_count": 0.0,
+ "step": 5144,
+ "text_loss": 0.32531213760375977
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.0005663497645439203,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8299750.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0055860537104308605,
+ "skip_count": 2.0,
+ "step": 5146,
+ "text_loss": 0.2520618438720703
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0005660429726007279,
+ "loss": 0.0092,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8303075.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004446739796549082,
+ "skip_count": 1.0,
+ "step": 5148,
+ "text_loss": 0.43672287464141846
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.17845611975345,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07080078125,
+ "learning_rate": 0.000565736155349744,
+ "loss": 0.0076,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 8306268.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.046915046870708466,
+ "skip_count": 4.0,
+ "step": 5150,
+ "text_loss": 0.35405927896499634
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 24.187848547108892,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0005654293129085412,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8310480.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010549088008701801,
+ "skip_count": 4.0,
+ "step": 5152,
+ "text_loss": 0.3523249626159668
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 24.19724097446434,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0005651224453947023,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8313367.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002893900265917182,
+ "skip_count": 0.0,
+ "step": 5154,
+ "text_loss": 0.4503810703754425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0005648155529258195,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8318006.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018450213829055429,
+ "skip_count": 0.0,
+ "step": 5156,
+ "text_loss": 0.5687127113342285
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.216025829175226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 0.0005645086356194943,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8320646.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026727779768407345,
+ "skip_count": 0.0,
+ "step": 5158,
+ "text_loss": 0.38920050859451294
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0005642016935933385,
+ "loss": 0.0035,
+ "macro_f1": 1.0,
+ "num_tokens": 8323915.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00611621281132102,
+ "skip_count": 2.0,
+ "step": 5160,
+ "text_loss": 0.3003547787666321
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 24.0,
+ "epoch": 24.234810683886117,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.0005638947269649726,
+ "loss": 0.0063,
+ "macro_f1": 0.9619450569152832,
+ "num_tokens": 8327073.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.028447439894080162,
+ "skip_count": 6.0,
+ "step": 5162,
+ "text_loss": 0.24053414165973663
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0005635877358520268,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8330388.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013072624569758773,
+ "skip_count": 0.0,
+ "step": 5164,
+ "text_loss": 0.43772217631340027
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0005632807203721406,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8333241.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009456822881475091,
+ "skip_count": 0.0,
+ "step": 5166,
+ "text_loss": 0.5217573046684265
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 0.000562973680642963,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8337257.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023840824142098427,
+ "skip_count": 0.0,
+ "step": 5168,
+ "text_loss": 0.31814974546432495
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 0.0005626666167821521,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8340143.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020231492817401886,
+ "skip_count": 3.0,
+ "step": 5170,
+ "text_loss": 0.5478505492210388
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0162353515625,
+ "learning_rate": 0.0005623595289073755,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 8343566.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01070715207606554,
+ "skip_count": 2.0,
+ "step": 5172,
+ "text_loss": 0.23213914036750793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.0005620524171363099,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8346836.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003720001084730029,
+ "skip_count": 3.0,
+ "step": 5174,
+ "text_loss": 0.5114789009094238
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 24.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0005617452815866409,
+ "loss": 0.0061,
+ "macro_f1": 1.0,
+ "num_tokens": 8349726.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003322509117424488,
+ "skip_count": 1.0,
+ "step": 5176,
+ "text_loss": 0.4894506335258484
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.309950102729672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0005614381223760635,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8352478.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00028752797516062856,
+ "skip_count": 0.0,
+ "step": 5178,
+ "text_loss": 0.6418307423591614
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.0005611309396222817,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8355766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028724796138703823,
+ "skip_count": 0.0,
+ "step": 5180,
+ "text_loss": 0.23635952174663544
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.328734957440563,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0005608237334430085,
+ "loss": 0.0068,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 8358888.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.058520980179309845,
+ "skip_count": 2.0,
+ "step": 5182,
+ "text_loss": 0.23434793949127197
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1015625,
+ "learning_rate": 0.000560516503955966,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8361761.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021356395445764065,
+ "skip_count": 1.0,
+ "step": 5184,
+ "text_loss": 0.40855672955513
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.000560209251278885,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8364376.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016185789136216044,
+ "skip_count": 0.0,
+ "step": 5186,
+ "text_loss": 0.6265131831169128
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0005599019755295053,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8367769.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031490204855799675,
+ "skip_count": 2.0,
+ "step": 5188,
+ "text_loss": 0.4716353118419647
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 24.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0005595946768255756,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8370705.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003500689286738634,
+ "skip_count": 0.0,
+ "step": 5190,
+ "text_loss": 0.5467679500579834
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.375697094217788,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0005592873552848532,
+ "loss": 0.0045,
+ "macro_f1": 1.0,
+ "num_tokens": 8374217.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010764475911855698,
+ "skip_count": 3.0,
+ "step": 5192,
+ "text_loss": 0.4345340132713318
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 24.38508952157323,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0005589800110251045,
+ "loss": 0.0087,
+ "macro_f1": 1.0,
+ "num_tokens": 8378182.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0010365343187004328,
+ "skip_count": 1.0,
+ "step": 5194,
+ "text_loss": 0.46722909808158875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.394481948928675,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0005586726441641044,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8381227.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006349093746393919,
+ "skip_count": 2.0,
+ "step": 5196,
+ "text_loss": 0.35410359501838684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.403874376284122,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.0005583652548196362,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8384886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00038166221929714084,
+ "skip_count": 0.0,
+ "step": 5198,
+ "text_loss": 0.5950250625610352
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0005580578431094924,
+ "loss": 0.0092,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8388939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023578559048473835,
+ "skip_count": 2.0,
+ "step": 5200,
+ "text_loss": 0.6553771495819092
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0005577504091514735,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8391629.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010771085508167744,
+ "skip_count": 0.0,
+ "step": 5202,
+ "text_loss": 0.4441985785961151
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.432051658350456,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.000557442953063389,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8394440.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005844325292855501,
+ "skip_count": 3.0,
+ "step": 5204,
+ "text_loss": 0.5807011723518372
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.0005571354749630564,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8397731.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006837233901023865,
+ "skip_count": 1.0,
+ "step": 5206,
+ "text_loss": 0.27780941128730774
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.000556827974968302,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8400859.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007656649220734835,
+ "skip_count": 3.0,
+ "step": 5208,
+ "text_loss": 0.4746324121952057
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0005565204531969606,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8404164.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028129038400948048,
+ "skip_count": 1.0,
+ "step": 5210,
+ "text_loss": 0.8513513803482056
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0005562129097668746,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8407196.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00492360582575202,
+ "skip_count": 1.0,
+ "step": 5212,
+ "text_loss": 0.12255420535802841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0005559053447958958,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8410633.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020713545382022858,
+ "skip_count": 0.0,
+ "step": 5214,
+ "text_loss": 0.6878522634506226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.488406222483125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0005555977584018833,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8413414.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007216963567771018,
+ "skip_count": 0.0,
+ "step": 5216,
+ "text_loss": 0.845878541469574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.057861328125,
+ "learning_rate": 0.0005552901507027048,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8416817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002400130731984973,
+ "skip_count": 1.0,
+ "step": 5218,
+ "text_loss": 0.16753672063350677
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 0.0005549825218162365,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8419617.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004563181661069393,
+ "skip_count": 0.0,
+ "step": 5220,
+ "text_loss": 0.26107168197631836
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 24.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.000554674871860362,
+ "loss": 0.0086,
+ "macro_f1": 1.0,
+ "num_tokens": 8422686.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006413881666958332,
+ "skip_count": 1.0,
+ "step": 5222,
+ "text_loss": 0.6333847045898438
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0005543672009529734,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8425571.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0057656955905258656,
+ "skip_count": 3.0,
+ "step": 5224,
+ "text_loss": 0.4552212357521057
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 24.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0005540595092119709,
+ "loss": 0.0082,
+ "macro_f1": 1.0,
+ "num_tokens": 8429038.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.011755156330764294,
+ "skip_count": 2.0,
+ "step": 5226,
+ "text_loss": 0.16597330570220947
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0005537517967552626,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8432117.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007519085193052888,
+ "skip_count": 0.0,
+ "step": 5228,
+ "text_loss": 0.6283590197563171
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.000553444063700764,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8435176.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003066456411033869,
+ "skip_count": 0.0,
+ "step": 5230,
+ "text_loss": 0.2360922247171402
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 0.0005531363101663998,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8438515.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002865589689463377,
+ "skip_count": 0.0,
+ "step": 5232,
+ "text_loss": 0.8075396418571472
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0005528285362701011,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8441731.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012521179160103202,
+ "skip_count": 0.0,
+ "step": 5234,
+ "text_loss": 0.584335446357727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0005525207421298077,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8444535.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005398475099354982,
+ "skip_count": 3.0,
+ "step": 5236,
+ "text_loss": 0.22711622714996338
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0966796875,
+ "learning_rate": 0.0005522129278634669,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8448337.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002957914723083377,
+ "skip_count": 1.0,
+ "step": 5238,
+ "text_loss": 0.3157515823841095
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.601115350748458,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.0005519050935890335,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8451530.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007757039275020361,
+ "skip_count": 3.0,
+ "step": 5240,
+ "text_loss": 0.2815830111503601
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 24.610507778103905,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0005515972394244704,
+ "loss": 0.0063,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 8454171.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.021602008491754532,
+ "skip_count": 1.0,
+ "step": 5242,
+ "text_loss": 0.6024490594863892
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.61990020545935,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0005512893654877478,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8457544.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006062488537281752,
+ "skip_count": 0.0,
+ "step": 5244,
+ "text_loss": 0.550110936164856
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.629292632814792,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0005509814718968435,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8460135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002793943975120783,
+ "skip_count": 0.0,
+ "step": 5246,
+ "text_loss": 0.4361286163330078
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.0005506735587697433,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8463516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016669550677761436,
+ "skip_count": 0.0,
+ "step": 5248,
+ "text_loss": 0.4642958641052246
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.0005503656262244395,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8466406.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006051387754268944,
+ "skip_count": 0.0,
+ "step": 5250,
+ "text_loss": 0.3445641100406647
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 24.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0005500576743789329,
+ "loss": 0.0037,
+ "macro_f1": 1.0,
+ "num_tokens": 8468838.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00654293829575181,
+ "skip_count": 1.0,
+ "step": 5252,
+ "text_loss": 0.2842808663845062
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.666862342236573,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0005497497033512309,
+ "loss": 0.0077,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 8471815.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03845973685383797,
+ "skip_count": 3.0,
+ "step": 5254,
+ "text_loss": 0.2597215175628662
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 24.676254769592017,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0005494417132593487,
+ "loss": 0.0047,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 8475202.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02252381667494774,
+ "skip_count": 4.0,
+ "step": 5256,
+ "text_loss": 0.32269927859306335
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.0005491337042213088,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8478650.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01232751365751028,
+ "skip_count": 2.0,
+ "step": 5258,
+ "text_loss": 0.6523372530937195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0005488256763551408,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8481724.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028322834987193346,
+ "skip_count": 0.0,
+ "step": 5260,
+ "text_loss": 0.4212580621242523
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0177001953125,
+ "learning_rate": 0.0005485176297788814,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8485833.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002623105887323618,
+ "skip_count": 2.0,
+ "step": 5262,
+ "text_loss": 0.16906329989433289
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 24.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0005482095646105748,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8489089.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0007179114618338645,
+ "skip_count": 0.0,
+ "step": 5264,
+ "text_loss": 0.4523872137069702
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0005479014809682721,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8492905.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005234059412032366,
+ "skip_count": 0.0,
+ "step": 5266,
+ "text_loss": 0.207139790058136
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.0005475933789700314,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8495480.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023258263245224953,
+ "skip_count": 0.0,
+ "step": 5268,
+ "text_loss": 0.18060965836048126
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.0005472852587339183,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8499070.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013497259933501482,
+ "skip_count": 0.0,
+ "step": 5270,
+ "text_loss": 0.7460769414901733
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.751394188435572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056640625,
+ "learning_rate": 0.0005469771203780048,
+ "loss": 0.0099,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8502886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003589815751183778,
+ "skip_count": 0.0,
+ "step": 5272,
+ "text_loss": 0.48119160532951355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0005466689640203701,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8506646.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006619705818593502,
+ "skip_count": 1.0,
+ "step": 5274,
+ "text_loss": 0.15656520426273346
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0005463607897791005,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8509450.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002992175053805113,
+ "skip_count": 1.0,
+ "step": 5276,
+ "text_loss": 0.486930251121521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0005460525977722886,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8512851.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027784097474068403,
+ "skip_count": 0.0,
+ "step": 5278,
+ "text_loss": 0.19654682278633118
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0005457443881180345,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8516858.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017648129723966122,
+ "skip_count": 0.0,
+ "step": 5280,
+ "text_loss": 0.580982506275177
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0005454361609344444,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 8519912.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010817649774253368,
+ "skip_count": 3.0,
+ "step": 5282,
+ "text_loss": 0.2644204795360565
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.000545127916339632,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8522396.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001453282660804689,
+ "skip_count": 0.0,
+ "step": 5284,
+ "text_loss": 0.5014839172363281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.817141179923688,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0005448196544517168,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8525326.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006645771209150553,
+ "skip_count": 2.0,
+ "step": 5286,
+ "text_loss": 0.2983154058456421
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0005445113753888254,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8528611.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005447337171062827,
+ "skip_count": 0.0,
+ "step": 5288,
+ "text_loss": 0.43598243594169617
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.000544203079269091,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8531571.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026976624503731728,
+ "skip_count": 0.0,
+ "step": 5290,
+ "text_loss": 0.6454944610595703
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0005438947662106533,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8534565.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002217630622908473,
+ "skip_count": 0.0,
+ "step": 5292,
+ "text_loss": 0.742935836315155
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 29.0,
+ "epoch": 24.854710889345466,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0005435864363316584,
+ "loss": 0.0073,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 8537581.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.030740609392523766,
+ "skip_count": 2.0,
+ "step": 5294,
+ "text_loss": 0.48913639783859253
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0005432780897502588,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8541271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005306888837367296,
+ "skip_count": 1.0,
+ "step": 5296,
+ "text_loss": 0.5820846557617188
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 24.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.0005429697265846137,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8545052.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002255369909107685,
+ "skip_count": 0.0,
+ "step": 5298,
+ "text_loss": 0.565483808517456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0005426613469528881,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8548605.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010787079809233546,
+ "skip_count": 0.0,
+ "step": 5300,
+ "text_loss": 0.40154510736465454
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.000542352950973254,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8552581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017972089117392898,
+ "skip_count": 0.0,
+ "step": 5302,
+ "text_loss": 0.5430748462677002
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 0.0005420445387638891,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8556360.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016180560924112797,
+ "skip_count": 2.0,
+ "step": 5304,
+ "text_loss": 0.544040322303772
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.0005417361104429777,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 8559264.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012688961811363697,
+ "skip_count": 2.0,
+ "step": 5306,
+ "text_loss": 0.2018517404794693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0005414276661287101,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8562169.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012141643092036247,
+ "skip_count": 0.0,
+ "step": 5308,
+ "text_loss": 0.5685747265815735
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.059326171875,
+ "learning_rate": 0.0005411192059392826,
+ "loss": 0.0098,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8565231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015626107342541218,
+ "skip_count": 0.0,
+ "step": 5310,
+ "text_loss": 0.8073471784591675
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0005408107299928979,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8568122.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004773529712110758,
+ "skip_count": 0.0,
+ "step": 5312,
+ "text_loss": 0.22583355009555817
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.0005405022384077644,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8571056.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025621228851377964,
+ "skip_count": 1.0,
+ "step": 5314,
+ "text_loss": 0.25274428725242615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0005401937313020967,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8574300.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009726752527058125,
+ "skip_count": 2.0,
+ "step": 5316,
+ "text_loss": 0.3283393979072571
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 24.967420017610802,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0005398852087941155,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8577424.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012483839876949787,
+ "skip_count": 4.0,
+ "step": 5318,
+ "text_loss": 0.1876130849123001
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.000539576671002047,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8580309.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009830677881836891,
+ "skip_count": 0.0,
+ "step": 5320,
+ "text_loss": 0.6955490708351135
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046875,
+ "learning_rate": 0.0005392681180441235,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8583399.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010819481685757637,
+ "skip_count": 0.0,
+ "step": 5322,
+ "text_loss": 0.4708341956138611
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.995597299677137,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.000538959550038583,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8586259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005763369146734476,
+ "skip_count": 0.0,
+ "step": 5324,
+ "text_loss": 0.20463642477989197
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0005386509671036695,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8589067.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006229027640074492,
+ "skip_count": 0.0,
+ "step": 5326,
+ "text_loss": 0.6819888353347778
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 24.0,
+ "epoch": 25.014088641033165,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0005383423693576325,
+ "loss": 0.0087,
+ "macro_f1": 0.9619450569152832,
+ "num_tokens": 8592837.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.030066559091210365,
+ "skip_count": 6.0,
+ "step": 5328,
+ "text_loss": 0.24606549739837646
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0005380337569187272,
+ "loss": 0.0092,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8596293.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007445990107953548,
+ "skip_count": 0.0,
+ "step": 5330,
+ "text_loss": 0.16730253398418427
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 25.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.0005377251299052145,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 8599360.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004563331138342619,
+ "skip_count": 1.0,
+ "step": 5332,
+ "text_loss": 0.6856988668441772
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0005374164884353608,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8602376.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015491938684135675,
+ "skip_count": 0.0,
+ "step": 5334,
+ "text_loss": 1.3248854875564575
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.051658350454947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.0005371078326274382,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8605400.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016098044579848647,
+ "skip_count": 0.0,
+ "step": 5336,
+ "text_loss": 0.747150182723999
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 25.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.0005367991625997243,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8608100.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034471298567950726,
+ "skip_count": 3.0,
+ "step": 5338,
+ "text_loss": 0.6443291902542114
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.070443205165834,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0005364904784705015,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8611768.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007947597652673721,
+ "skip_count": 1.0,
+ "step": 5340,
+ "text_loss": 0.7768037915229797
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 25.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0005361817803580588,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 8614424.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009964234195649624,
+ "skip_count": 2.0,
+ "step": 5342,
+ "text_loss": 0.22826914489269257
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0005358730683806896,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8617826.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014116480015218258,
+ "skip_count": 0.0,
+ "step": 5344,
+ "text_loss": 0.49022090435028076
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 25.098620487232168,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0005355643426566929,
+ "loss": 0.0061,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 8621220.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013940622098743916,
+ "skip_count": 2.0,
+ "step": 5346,
+ "text_loss": 0.26819515228271484
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.108012914587615,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.000535255603304373,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8623957.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032230091746896505,
+ "skip_count": 2.0,
+ "step": 5348,
+ "text_loss": 0.46905452013015747
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.11740534194306,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.0005349468504420395,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8626760.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002631337149068713,
+ "skip_count": 1.0,
+ "step": 5350,
+ "text_loss": 0.5312309861183167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.126797769298502,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.0005346380841880068,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8630207.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004526057746261358,
+ "skip_count": 2.0,
+ "step": 5352,
+ "text_loss": 0.5810666084289551
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.0005343293046605949,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8633241.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023941127583384514,
+ "skip_count": 0.0,
+ "step": 5354,
+ "text_loss": 0.18468725681304932
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0005340205119781288,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8636215.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017020340310409665,
+ "skip_count": 0.0,
+ "step": 5356,
+ "text_loss": 0.6665788888931274
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0005337117062589383,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8639326.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004964717663824558,
+ "skip_count": 2.0,
+ "step": 5358,
+ "text_loss": 0.19770404696464539
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.0005334028876213585,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8642157.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006587155628949404,
+ "skip_count": 0.0,
+ "step": 5360,
+ "text_loss": 0.2295130044221878
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0005330940561837291,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8645355.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006586945964954793,
+ "skip_count": 0.0,
+ "step": 5362,
+ "text_loss": 0.2701159417629242
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0005327852120643947,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8648911.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0014281768817454576,
+ "skip_count": 0.0,
+ "step": 5364,
+ "text_loss": 0.8957229852676392
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.192544760786618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0005324763553817053,
+ "loss": 0.0027,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8652037.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005899337120354176,
+ "skip_count": 0.0,
+ "step": 5366,
+ "text_loss": 0.38642236590385437
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 25.20193718814206,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0005321674862540154,
+ "loss": 0.0058,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 8655381.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.024511313065886497,
+ "skip_count": 1.0,
+ "step": 5368,
+ "text_loss": 0.6439879536628723
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.000531858604799684,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8658476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012558114249259233,
+ "skip_count": 0.0,
+ "step": 5370,
+ "text_loss": 0.3227672874927521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.220722042852948,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06005859375,
+ "learning_rate": 0.0005315497111370752,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8661982.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013541636290028691,
+ "skip_count": 0.0,
+ "step": 5372,
+ "text_loss": 0.6375321745872498
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 25.230114470208395,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0005312408053845575,
+ "loss": 0.0052,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 8665071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010432626120746136,
+ "skip_count": 2.0,
+ "step": 5374,
+ "text_loss": 0.536924421787262
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.0005309318876605042,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8668411.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004450209904462099,
+ "skip_count": 1.0,
+ "step": 5376,
+ "text_loss": 0.2643466889858246
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.248899324919282,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0005306229580832933,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 8672088.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011189920827746391,
+ "skip_count": 3.0,
+ "step": 5378,
+ "text_loss": 0.8259533047676086
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.000530314016771307,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8675206.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020095291547477245,
+ "skip_count": 0.0,
+ "step": 5380,
+ "text_loss": 0.31364113092422485
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.267684179630173,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0005300050638429324,
+ "loss": 0.0078,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 8678289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010738557204604149,
+ "skip_count": 1.0,
+ "step": 5382,
+ "text_loss": 0.19013966619968414
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.0005296960994165607,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8681555.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018534278497099876,
+ "skip_count": 1.0,
+ "step": 5384,
+ "text_loss": 0.762248694896698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.286469034341064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0005293871236105877,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8684413.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009143726900219917,
+ "skip_count": 2.0,
+ "step": 5386,
+ "text_loss": 0.19994212687015533
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 25.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0005290781365434134,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8687450.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.002034468576312065,
+ "skip_count": 0.0,
+ "step": 5388,
+ "text_loss": 0.5519160628318787
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0005287691383334425,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8690651.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006834167055785656,
+ "skip_count": 0.0,
+ "step": 5390,
+ "text_loss": 0.5439304709434509
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.314646316407398,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060791015625,
+ "learning_rate": 0.0005284601290990832,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8693929.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0022327799815684557,
+ "skip_count": 0.0,
+ "step": 5392,
+ "text_loss": 0.24108269810676575
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0005281511089587491,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8696727.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002669565612450242,
+ "skip_count": 0.0,
+ "step": 5394,
+ "text_loss": 0.8659077286720276
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0005278420780308568,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8700934.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007252473384141922,
+ "skip_count": 0.0,
+ "step": 5396,
+ "text_loss": 0.5592793226242065
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 0.0005275330364338276,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8704449.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001793015981093049,
+ "skip_count": 0.0,
+ "step": 5398,
+ "text_loss": 0.5211784243583679
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 25.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 0.0005272239842860868,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 8707384.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.00963665172457695,
+ "skip_count": 4.0,
+ "step": 5400,
+ "text_loss": 0.6092788577079773
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 25.36160845318462,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0005269149217060642,
+ "loss": 0.0059,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 8710453.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01758105307817459,
+ "skip_count": 2.0,
+ "step": 5402,
+ "text_loss": 0.3423936069011688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0005266058488121926,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8713514.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025636721402406693,
+ "skip_count": 1.0,
+ "step": 5404,
+ "text_loss": 0.484171986579895
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.38039330789551,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0005262967657229095,
+ "loss": 0.0064,
+ "macro_f1": 0.9255813956260681,
+ "num_tokens": 8717051.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.022406045347452164,
+ "skip_count": 4.0,
+ "step": 5406,
+ "text_loss": 0.23368191719055176
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0005259876725566563,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8719987.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004114408977329731,
+ "skip_count": 2.0,
+ "step": 5408,
+ "text_loss": 0.20237496495246887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.399178162606397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.000525678569431878,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8723258.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006741158664226532,
+ "skip_count": 2.0,
+ "step": 5410,
+ "text_loss": 0.7969435453414917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.0005253694564670233,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8726294.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034468702506273985,
+ "skip_count": 0.0,
+ "step": 5412,
+ "text_loss": 0.5533816814422607
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.000525060333780545,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8729603.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01086533535271883,
+ "skip_count": 2.0,
+ "step": 5414,
+ "text_loss": 0.31856611371040344
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 25.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0005247512014908998,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8733423.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00512756546959281,
+ "skip_count": 6.0,
+ "step": 5416,
+ "text_loss": 0.6710903644561768
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06103515625,
+ "learning_rate": 0.0005244420597165472,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8736457.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026201079599559307,
+ "skip_count": 0.0,
+ "step": 5418,
+ "text_loss": 0.6469964981079102
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.0005241329085759514,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8739617.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004130818881094456,
+ "skip_count": 0.0,
+ "step": 5420,
+ "text_loss": 0.4868837296962738
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0005238237481875795,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8742653.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003171122632920742,
+ "skip_count": 0.0,
+ "step": 5422,
+ "text_loss": 0.12026242166757584
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.464925154094512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0005235145786699021,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8745835.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008553664083592594,
+ "skip_count": 0.0,
+ "step": 5424,
+ "text_loss": 0.601640522480011
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0005232054001413941,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8749006.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006958908052183688,
+ "skip_count": 0.0,
+ "step": 5426,
+ "text_loss": 0.7083519101142883
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 0.0005228962127205329,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8752493.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012221037177368999,
+ "skip_count": 1.0,
+ "step": 5428,
+ "text_loss": 0.3949109613895416
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0005225870165257997,
+ "loss": 0.0079,
+ "macro_f1": 1.0,
+ "num_tokens": 8755294.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003924673888832331,
+ "skip_count": 2.0,
+ "step": 5430,
+ "text_loss": 0.7487186789512634
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0005222778116756793,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8758043.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002388258930295706,
+ "skip_count": 0.0,
+ "step": 5432,
+ "text_loss": 0.4092858135700226
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0005219685982886594,
+ "loss": 0.0037,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8760618.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0045886957086622715,
+ "skip_count": 0.0,
+ "step": 5434,
+ "text_loss": 0.5889580249786377
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 0.0005216593764832311,
+ "loss": 0.0074,
+ "macro_f1": 1.0,
+ "num_tokens": 8764269.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00704155582934618,
+ "skip_count": 2.0,
+ "step": 5436,
+ "text_loss": 0.2634117007255554
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0005213501463778889,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8767142.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00368728069588542,
+ "skip_count": 2.0,
+ "step": 5438,
+ "text_loss": 0.3512301445007324
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05322265625,
+ "learning_rate": 0.0005210409080911304,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8770239.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012925115879625082,
+ "skip_count": 0.0,
+ "step": 5440,
+ "text_loss": 0.9330073595046997
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0005207316617414561,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8772927.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005604506935924292,
+ "skip_count": 0.0,
+ "step": 5442,
+ "text_loss": 0.23477613925933838
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.55884942764896,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 0.0005204224074473701,
+ "loss": 0.0049,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 8776451.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010945434682071209,
+ "skip_count": 2.0,
+ "step": 5444,
+ "text_loss": 0.6184295415878296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.0005201131453273789,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8779481.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024414353538304567,
+ "skip_count": 0.0,
+ "step": 5446,
+ "text_loss": 0.16186967492103577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.57763428235985,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0005198038754999926,
+ "loss": 0.0052,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 8782425.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013872416689991951,
+ "skip_count": 0.0,
+ "step": 5448,
+ "text_loss": 0.42294546961784363
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0005194945980837237,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8785466.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006147907115519047,
+ "skip_count": 0.0,
+ "step": 5450,
+ "text_loss": 0.6285432577133179
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0005191853131970881,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8788461.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010585964191704988,
+ "skip_count": 0.0,
+ "step": 5452,
+ "text_loss": 0.6032317876815796
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0005188760209586044,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8791572.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005267909727990627,
+ "skip_count": 1.0,
+ "step": 5454,
+ "text_loss": 0.3015609681606293
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0005185667214867937,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8794697.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000532392121385783,
+ "skip_count": 0.0,
+ "step": 5456,
+ "text_loss": 0.9596265554428101
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0005182574149001805,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8797880.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007176774088293314,
+ "skip_count": 0.0,
+ "step": 5458,
+ "text_loss": 0.5599364638328552
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0005179481013172912,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8801995.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022756673861294985,
+ "skip_count": 0.0,
+ "step": 5460,
+ "text_loss": 0.47327280044555664
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0005176387808566558,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8805138.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025084633380174637,
+ "skip_count": 0.0,
+ "step": 5462,
+ "text_loss": 0.26674970984458923
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.0005173294536368061,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8808102.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008814680040813982,
+ "skip_count": 0.0,
+ "step": 5464,
+ "text_loss": 0.5981299877166748
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.662166128558848,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0005170201197762773,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8811431.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005443177651613951,
+ "skip_count": 0.0,
+ "step": 5466,
+ "text_loss": 1.037438988685608
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0005167107793936065,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8814256.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000494555220939219,
+ "skip_count": 0.0,
+ "step": 5468,
+ "text_loss": 0.5005733966827393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0005164014326073333,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8817024.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004793747793883085,
+ "skip_count": 2.0,
+ "step": 5470,
+ "text_loss": 0.6999614834785461
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.0005160920795360002,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8819892.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020966180600225925,
+ "skip_count": 0.0,
+ "step": 5472,
+ "text_loss": 0.5536707043647766
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.0005157827202981521,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8822928.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020367507822811604,
+ "skip_count": 0.0,
+ "step": 5474,
+ "text_loss": 0.43655988574028015
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0005154733550123356,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8825842.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020070383325219154,
+ "skip_count": 0.0,
+ "step": 5476,
+ "text_loss": 0.48149657249450684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0005151639837971004,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8829534.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016327418852597475,
+ "skip_count": 0.0,
+ "step": 5478,
+ "text_loss": 0.6693689227104187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.000514854606770998,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8833177.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012691980227828026,
+ "skip_count": 0.0,
+ "step": 5480,
+ "text_loss": 0.44926801323890686
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0005145452240525822,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8836933.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0007724820752628148,
+ "skip_count": 0.0,
+ "step": 5482,
+ "text_loss": 0.5759884119033813
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 25.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0005142358357604092,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 8840093.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008331702090799809,
+ "skip_count": 7.0,
+ "step": 5484,
+ "text_loss": 0.47393685579299927
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0005139264420130368,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8843918.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003124477108940482,
+ "skip_count": 2.0,
+ "step": 5486,
+ "text_loss": 0.5298711061477661
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08447265625,
+ "learning_rate": 0.0005136170429290259,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8846558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034127775579690933,
+ "skip_count": 2.0,
+ "step": 5488,
+ "text_loss": 0.43582668900489807
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.0005133076386269383,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8849724.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0018056259723380208,
+ "skip_count": 0.0,
+ "step": 5490,
+ "text_loss": 0.8116800785064697
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 25.784267684179632,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0005129982292253384,
+ "loss": 0.0063,
+ "macro_f1": 0.6589147448539734,
+ "num_tokens": 8852447.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.021452350541949272,
+ "skip_count": 6.0,
+ "step": 5492,
+ "text_loss": 0.31878748536109924
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0005126888148427927,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8855886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026911941822618246,
+ "skip_count": 0.0,
+ "step": 5494,
+ "text_loss": 0.4021807909011841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 25.80305253889052,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0005123793955978693,
+ "loss": 0.007,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 8859378.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019764510914683342,
+ "skip_count": 2.0,
+ "step": 5496,
+ "text_loss": 0.21608132123947144
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.812444966245963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.0005120699716091379,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8862310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008988190093077719,
+ "skip_count": 0.0,
+ "step": 5498,
+ "text_loss": 0.34666743874549866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0005117605429951707,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8865166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011137975379824638,
+ "skip_count": 2.0,
+ "step": 5500,
+ "text_loss": 0.25385144352912903
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 25.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0005114511098745412,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 8869923.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006476947572082281,
+ "skip_count": 4.0,
+ "step": 5502,
+ "text_loss": 0.4503856301307678
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.840622248312297,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.000511141672365825,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8872451.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022727579344063997,
+ "skip_count": 0.0,
+ "step": 5504,
+ "text_loss": 0.7522464990615845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.0005108322305875987,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8875968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020014268811792135,
+ "skip_count": 0.0,
+ "step": 5506,
+ "text_loss": 0.30184176564216614
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04833984375,
+ "learning_rate": 0.0005105227846584414,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8879705.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001179999322630465,
+ "skip_count": 0.0,
+ "step": 5508,
+ "text_loss": 0.6187804937362671
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0005102133346969329,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8883535.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002946492750197649,
+ "skip_count": 0.0,
+ "step": 5510,
+ "text_loss": 0.5961501002311707
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.878191957734078,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.0005099038808216555,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 8886683.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004532935563474894,
+ "skip_count": 3.0,
+ "step": 5512,
+ "text_loss": 0.38462957739830017
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0005095944231511922,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8891049.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00917842984199524,
+ "skip_count": 2.0,
+ "step": 5514,
+ "text_loss": 0.27541956305503845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0005092849618041279,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8893604.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008756510796956718,
+ "skip_count": 0.0,
+ "step": 5516,
+ "text_loss": 0.681315541267395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.906369239800412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0005089754968990487,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8898072.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008704439387656748,
+ "skip_count": 1.0,
+ "step": 5518,
+ "text_loss": 0.5060005187988281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0005086660285545422,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8901539.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004750201944261789,
+ "skip_count": 1.0,
+ "step": 5520,
+ "text_loss": 0.6008047461509705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.000508356556889197,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8904525.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026552649214863777,
+ "skip_count": 0.0,
+ "step": 5522,
+ "text_loss": 0.4539012908935547
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.934546521866746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0005080470820216037,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8907624.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002621029270812869,
+ "skip_count": 1.0,
+ "step": 5524,
+ "text_loss": 0.20088370144367218
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 25.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0005077376040703533,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8910515.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.0028921898920089006,
+ "skip_count": 0.0,
+ "step": 5526,
+ "text_loss": 0.6575983166694641
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8888888955116272,
+ "avg_layers": 21.0,
+ "epoch": 25.953331376577633,
+ "f1_execute": 0.9729729890823364,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9411765336990356,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0005074281231540384,
+ "loss": 0.0076,
+ "macro_f1": 0.9713832139968872,
+ "num_tokens": 8914419.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.024232301861047745,
+ "skip_count": 9.0,
+ "step": 5528,
+ "text_loss": 0.5435594916343689
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.0005071186393912527,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8917543.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003731841454282403,
+ "skip_count": 2.0,
+ "step": 5530,
+ "text_loss": 0.5152071118354797
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0005068091529005909,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8920728.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005905418191105127,
+ "skip_count": 0.0,
+ "step": 5532,
+ "text_loss": 0.29741042852401733
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.000506499663800649,
+ "loss": 0.0096,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8924112.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0021933517418801785,
+ "skip_count": 0.0,
+ "step": 5534,
+ "text_loss": 0.45704230666160583
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 25.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0005061901722100235,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8927323.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009227502159774303,
+ "skip_count": 4.0,
+ "step": 5536,
+ "text_loss": 0.1968434453010559
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.0,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0005058806782473125,
+ "loss": 0.0053,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 8931052.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02054760232567787,
+ "skip_count": 2.0,
+ "step": 5538,
+ "text_loss": 0.23851273953914642
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.0005055711820311144,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8934215.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008434011251665652,
+ "skip_count": 0.0,
+ "step": 5540,
+ "text_loss": 0.85942542552948
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 26.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0005052616836800288,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8937173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011105241253972054,
+ "skip_count": 4.0,
+ "step": 5542,
+ "text_loss": 0.2614556849002838
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0005049521833126561,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8940553.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006273435428738594,
+ "skip_count": 0.0,
+ "step": 5544,
+ "text_loss": 0.6430498957633972
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.037569709421778,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0005046426810475976,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8943753.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023464353289455175,
+ "skip_count": 1.0,
+ "step": 5546,
+ "text_loss": 0.7015808820724487
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0005043331770034547,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8947149.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016024730866774917,
+ "skip_count": 1.0,
+ "step": 5548,
+ "text_loss": 0.5875257253646851
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0005040236712988304,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8950374.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004096277989447117,
+ "skip_count": 0.0,
+ "step": 5550,
+ "text_loss": 0.1712338626384735
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 26.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0005037141640523275,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8953256.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00441550649702549,
+ "skip_count": 0.0,
+ "step": 5552,
+ "text_loss": 0.16560404002666473
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.07513941884356,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0005034046553825501,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 8956845.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.011712636798620224,
+ "skip_count": 6.0,
+ "step": 5554,
+ "text_loss": 0.24278216063976288
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0005030951454081023,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8961165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00235542468726635,
+ "skip_count": 1.0,
+ "step": 5556,
+ "text_loss": 0.17214511334896088
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.093924273554446,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0005027856342475888,
+ "loss": 0.0037,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 8965262.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0160827673971653,
+ "skip_count": 1.0,
+ "step": 5558,
+ "text_loss": 0.40229740738868713
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 26.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.0005024761220196151,
+ "loss": 0.0091,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8968278.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004786997567862272,
+ "skip_count": 0.0,
+ "step": 5560,
+ "text_loss": 0.24828575551509857
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 26.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0005021666088427868,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8971443.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015378865646198392,
+ "skip_count": 0.0,
+ "step": 5562,
+ "text_loss": 0.7269657254219055
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.0005018570948357099,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8975312.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015218508196994662,
+ "skip_count": 0.0,
+ "step": 5564,
+ "text_loss": 0.5198811292648315
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0005015475801169908,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8977951.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008865317329764366,
+ "skip_count": 1.0,
+ "step": 5566,
+ "text_loss": 0.1541406810283661
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 26.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0005012380648052359,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8981325.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0055318837985396385,
+ "skip_count": 0.0,
+ "step": 5568,
+ "text_loss": 0.510314404964447
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0005009285490190523,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8984661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035060355439782143,
+ "skip_count": 0.0,
+ "step": 5570,
+ "text_loss": 0.29421761631965637
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.000500619032877047,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8987573.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0050126477144658566,
+ "skip_count": 2.0,
+ "step": 5572,
+ "text_loss": 0.1984361708164215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.0005003095164978271,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8991136.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019407360814511776,
+ "skip_count": 0.0,
+ "step": 5574,
+ "text_loss": 0.42751404643058777
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0005,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8994198.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029819176997989416,
+ "skip_count": 2.0,
+ "step": 5576,
+ "text_loss": 0.20589640736579895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.187848547108892,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0004996904835021729,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8997907.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000878945691511035,
+ "skip_count": 1.0,
+ "step": 5578,
+ "text_loss": 0.2801406979560852
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.19724097446434,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.000499380967122953,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9001141.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005223734769970179,
+ "skip_count": 1.0,
+ "step": 5580,
+ "text_loss": 0.20542480051517487
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0004990714509809478,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9004794.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015868612099438906,
+ "skip_count": 0.0,
+ "step": 5582,
+ "text_loss": 0.32094934582710266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 26.216025829175226,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0004987619351947643,
+ "loss": 0.0064,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 9009250.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.031923454254865646,
+ "skip_count": 4.0,
+ "step": 5584,
+ "text_loss": 0.609201967716217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.0004984524198830095,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9013254.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033124545589089394,
+ "skip_count": 0.0,
+ "step": 5586,
+ "text_loss": 0.3698650300502777
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0004981429051642903,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9016598.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017190382350236177,
+ "skip_count": 1.0,
+ "step": 5588,
+ "text_loss": 0.5306026935577393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.24420311124156,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0004978333911572132,
+ "loss": 0.0059,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9019558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02051064372062683,
+ "skip_count": 1.0,
+ "step": 5590,
+ "text_loss": 0.23494470119476318
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.0004975238779803849,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9023024.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010489600244909525,
+ "skip_count": 0.0,
+ "step": 5592,
+ "text_loss": 0.579275906085968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0004972143657524112,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9026161.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012039231369271874,
+ "skip_count": 0.0,
+ "step": 5594,
+ "text_loss": 0.5776295065879822
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0004969048545918978,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9028814.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010212450288236141,
+ "skip_count": 1.0,
+ "step": 5596,
+ "text_loss": 0.6816855669021606
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 26.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00049659534461745,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9032243.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0024297661148011684,
+ "skip_count": 0.0,
+ "step": 5598,
+ "text_loss": 0.743188202381134
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0004962858359476726,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9035493.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002151754219084978,
+ "skip_count": 0.0,
+ "step": 5600,
+ "text_loss": 0.5213983654975891
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0004959763287011698,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9038213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028108188416808844,
+ "skip_count": 2.0,
+ "step": 5602,
+ "text_loss": 0.5128397345542908
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.309950102729672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0004956668229965454,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9041152.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004022551700472832,
+ "skip_count": 2.0,
+ "step": 5604,
+ "text_loss": 0.15361636877059937
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0004953573189524026,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9044503.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010689410846680403,
+ "skip_count": 1.0,
+ "step": 5606,
+ "text_loss": 0.6454885005950928
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0004950478166873439,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9047742.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025760293938219547,
+ "skip_count": 0.0,
+ "step": 5608,
+ "text_loss": 0.7654000520706177
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0004947383163199713,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9050349.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009846165776252747,
+ "skip_count": 0.0,
+ "step": 5610,
+ "text_loss": 0.41533342003822327
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 0.0004944288179688858,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9053667.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017193946987390518,
+ "skip_count": 1.0,
+ "step": 5612,
+ "text_loss": 1.0172475576400757
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0004941193217526875,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9056777.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026750199031084776,
+ "skip_count": 0.0,
+ "step": 5614,
+ "text_loss": 0.17584927380084991
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 26.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0004938098277899765,
+ "loss": 0.0068,
+ "macro_f1": 1.0,
+ "num_tokens": 9060609.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005259076599031687,
+ "skip_count": 1.0,
+ "step": 5616,
+ "text_loss": 0.5522297024726868
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.375697094217788,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0004935003361993511,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9063633.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006837095716036856,
+ "skip_count": 0.0,
+ "step": 5618,
+ "text_loss": 0.5212588310241699
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 26.38508952157323,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0004931908470994091,
+ "loss": 0.0059,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 9067777.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01067375484853983,
+ "skip_count": 1.0,
+ "step": 5620,
+ "text_loss": 0.5515062808990479
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 26.394481948928675,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 0.0004928813606087474,
+ "loss": 0.0043,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 9070938.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016635602340102196,
+ "skip_count": 3.0,
+ "step": 5622,
+ "text_loss": 0.3225076198577881
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.403874376284122,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0004925718768459617,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9074050.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002216119086369872,
+ "skip_count": 0.0,
+ "step": 5624,
+ "text_loss": 0.32438889145851135
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 26.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0004922623959296469,
+ "loss": 0.0082,
+ "macro_f1": 1.0,
+ "num_tokens": 9076785.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012125075794756413,
+ "skip_count": 5.0,
+ "step": 5626,
+ "text_loss": 0.39563658833503723
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0004919529179783965,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9080239.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026486809365451336,
+ "skip_count": 0.0,
+ "step": 5628,
+ "text_loss": 0.5401569604873657
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.432051658350456,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0004916434431108031,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9083935.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011849761940538883,
+ "skip_count": 0.0,
+ "step": 5630,
+ "text_loss": 0.4798774719238281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.000491333971445458,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9087174.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002799210138618946,
+ "skip_count": 0.0,
+ "step": 5632,
+ "text_loss": 0.22488386929035187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0004910245031009515,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9089803.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00139117450453341,
+ "skip_count": 0.0,
+ "step": 5634,
+ "text_loss": 0.6237335205078125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0004907150381958723,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9093075.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006503603886812925,
+ "skip_count": 1.0,
+ "step": 5636,
+ "text_loss": 0.18781614303588867
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.0004904055768488077,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9096355.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009764843271113932,
+ "skip_count": 0.0,
+ "step": 5638,
+ "text_loss": 0.6821450591087341
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0004900961191783445,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 9098994.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00693159457296133,
+ "skip_count": 3.0,
+ "step": 5640,
+ "text_loss": 0.214790940284729
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.488406222483125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0004897866653030671,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9102048.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002469591563567519,
+ "skip_count": 0.0,
+ "step": 5642,
+ "text_loss": 0.1556607335805893
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0004894772153415588,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9105379.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004824921488761902,
+ "skip_count": 0.0,
+ "step": 5644,
+ "text_loss": 0.499972403049469
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0004891677694124013,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9108240.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029356612358242273,
+ "skip_count": 1.0,
+ "step": 5646,
+ "text_loss": 0.5169754028320312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0174560546875,
+ "learning_rate": 0.0004888583276341751,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9111381.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009489183314144611,
+ "skip_count": 1.0,
+ "step": 5648,
+ "text_loss": 0.23630797863006592
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.017822265625,
+ "learning_rate": 0.0004885488901254588,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9114015.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004154495894908905,
+ "skip_count": 1.0,
+ "step": 5650,
+ "text_loss": 0.3345947563648224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0546875,
+ "learning_rate": 0.0004882394570048294,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9117044.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018865863094106317,
+ "skip_count": 0.0,
+ "step": 5652,
+ "text_loss": 0.32814112305641174
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.0004879300283908623,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9120035.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035278978757560253,
+ "skip_count": 1.0,
+ "step": 5654,
+ "text_loss": 0.4081386625766754
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 26.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.00048762060440213096,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9122955.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0053498269990086555,
+ "skip_count": 0.0,
+ "step": 5656,
+ "text_loss": 0.31027838587760925
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0004873111851572075,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9125635.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004556098487228155,
+ "skip_count": 0.0,
+ "step": 5658,
+ "text_loss": 0.25703540444374084
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0004870017707746617,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9128906.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031165245454758406,
+ "skip_count": 2.0,
+ "step": 5660,
+ "text_loss": 0.20663656294345856
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0004866923613730617,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 9132030.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004887583665549755,
+ "skip_count": 2.0,
+ "step": 5662,
+ "text_loss": 0.6062649488449097
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0004863829570709741,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9135274.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021857863757759333,
+ "skip_count": 0.0,
+ "step": 5664,
+ "text_loss": 0.49644309282302856
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 26.601115350748458,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0004860735579869631,
+ "loss": 0.0088,
+ "macro_f1": 0.925203263759613,
+ "num_tokens": 9139735.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.05413912236690521,
+ "skip_count": 5.0,
+ "step": 5666,
+ "text_loss": 0.25161290168762207
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.610507778103905,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00048576416423959097,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9142419.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002229376696050167,
+ "skip_count": 0.0,
+ "step": 5668,
+ "text_loss": 0.5332949161529541
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 26.61990020545935,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0004854547759474179,
+ "loss": 0.0045,
+ "macro_f1": 1.0,
+ "num_tokens": 9145443.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005968933925032616,
+ "skip_count": 4.0,
+ "step": 5670,
+ "text_loss": 0.5282154083251953
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.629292632814792,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060302734375,
+ "learning_rate": 0.0004851453932290021,
+ "loss": 0.0085,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9147754.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04015754163265228,
+ "skip_count": 1.0,
+ "step": 5672,
+ "text_loss": 0.8564629554748535
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.63868506017024,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00048483601620289974,
+ "loss": 0.0058,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 9151714.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.019172413274645805,
+ "skip_count": 2.0,
+ "step": 5674,
+ "text_loss": 0.4149441123008728
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 26.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0004845266449876645,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9154524.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005025535821914673,
+ "skip_count": 0.0,
+ "step": 5676,
+ "text_loss": 0.26525792479515076
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.000484217279701848,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9158546.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012200147612020373,
+ "skip_count": 0.0,
+ "step": 5678,
+ "text_loss": 0.5532271862030029
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.666862342236573,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0004839079204639998,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9161003.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013485675444826484,
+ "skip_count": 1.0,
+ "step": 5680,
+ "text_loss": 0.36826151609420776
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.0004835985673926668,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9164741.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00532014574855566,
+ "skip_count": 2.0,
+ "step": 5682,
+ "text_loss": 0.16154609620571136
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0004832892206063938,
+ "loss": 0.0075,
+ "macro_f1": 1.0,
+ "num_tokens": 9168079.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.007782323285937309,
+ "skip_count": 3.0,
+ "step": 5684,
+ "text_loss": 0.4323575496673584
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.0004829798802237228,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9171352.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024159469176083803,
+ "skip_count": 2.0,
+ "step": 5686,
+ "text_loss": 0.3163119852542877
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.000482670546363194,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9175197.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002455134643241763,
+ "skip_count": 0.0,
+ "step": 5688,
+ "text_loss": 0.59735506772995
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.713824479013795,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0004823612191433443,
+ "loss": 0.0042,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 9177648.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.015524548478424549,
+ "skip_count": 2.0,
+ "step": 5690,
+ "text_loss": 0.759812593460083
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.00048205189868270887,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9180694.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002112736226990819,
+ "skip_count": 2.0,
+ "step": 5692,
+ "text_loss": 0.3516882061958313
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 28.0,
+ "epoch": 26.732609333724685,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.00048174258509981973,
+ "loss": 0.0063,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 9183502.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03100527822971344,
+ "skip_count": 3.0,
+ "step": 5694,
+ "text_loss": 0.3722715973854065
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0004814332785132064,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9186417.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009176591411232948,
+ "skip_count": 2.0,
+ "step": 5696,
+ "text_loss": 0.33363673090934753
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.751394188435572,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0004811239790413958,
+ "loss": 0.0076,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9189478.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023586507886648178,
+ "skip_count": 1.0,
+ "step": 5698,
+ "text_loss": 0.19698107242584229
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00048081468680291194,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9192115.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005083440337330103,
+ "skip_count": 1.0,
+ "step": 5700,
+ "text_loss": 0.3476336896419525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0004805054019162764,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9195176.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007766073569655418,
+ "skip_count": 1.0,
+ "step": 5702,
+ "text_loss": 0.27114811539649963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0004801961245000076,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9199091.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009058842551894486,
+ "skip_count": 0.0,
+ "step": 5704,
+ "text_loss": 0.6249846816062927
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0004798868546726212,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9202003.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005479823332279921,
+ "skip_count": 0.0,
+ "step": 5706,
+ "text_loss": 0.47223609685897827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0166015625,
+ "learning_rate": 0.00047957759255263014,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9205277.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001055705244652927,
+ "skip_count": 0.0,
+ "step": 5708,
+ "text_loss": 0.677215576171875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.00047926833825854377,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9208844.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003291431115940213,
+ "skip_count": 2.0,
+ "step": 5710,
+ "text_loss": 0.12439999729394913
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.817141179923688,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06591796875,
+ "learning_rate": 0.0004789590919088696,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9211619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005120242480188608,
+ "skip_count": 2.0,
+ "step": 5712,
+ "text_loss": 0.5771954655647278
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0004786498536221111,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 9214914.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004877795465290546,
+ "skip_count": 2.0,
+ "step": 5714,
+ "text_loss": 0.6432198882102966
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.00047834062351676893,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9218186.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026507999282330275,
+ "skip_count": 0.0,
+ "step": 5716,
+ "text_loss": 0.23814935982227325
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.00047803140171134075,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9221754.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002605629386380315,
+ "skip_count": 1.0,
+ "step": 5718,
+ "text_loss": 0.2910388708114624
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 26.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0004777221883243208,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9224502.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0048494706861674786,
+ "skip_count": 3.0,
+ "step": 5720,
+ "text_loss": 0.6195104122161865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0004774129834742004,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9227350.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003092368133366108,
+ "skip_count": 0.0,
+ "step": 5722,
+ "text_loss": 0.35447990894317627
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.00047710378727946725,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9230166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012780336663126945,
+ "skip_count": 2.0,
+ "step": 5724,
+ "text_loss": 0.27581867575645447
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00047679459985860604,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9233029.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005429140292108059,
+ "skip_count": 1.0,
+ "step": 5726,
+ "text_loss": 0.2636827826499939
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.00047648542133009794,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9236317.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023909916635602713,
+ "skip_count": 0.0,
+ "step": 5728,
+ "text_loss": 0.4801979064941406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.00047617625181242077,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9239796.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003603481687605381,
+ "skip_count": 0.0,
+ "step": 5730,
+ "text_loss": 0.8374754786491394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0004758670914240488,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9243489.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004478964954614639,
+ "skip_count": 2.0,
+ "step": 5732,
+ "text_loss": 0.3870154917240143
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.000475557940283453,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9246758.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00312575395219028,
+ "skip_count": 1.0,
+ "step": 5734,
+ "text_loss": 0.42341071367263794
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 26.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.00047524879850910026,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9250053.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010855631902813911,
+ "skip_count": 4.0,
+ "step": 5736,
+ "text_loss": 0.25729796290397644
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0004749396662194549,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9253691.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009250419097952545,
+ "skip_count": 0.0,
+ "step": 5738,
+ "text_loss": 0.6151770949363708
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0004746305435329767,
+ "loss": 0.0064,
+ "macro_f1": 1.0,
+ "num_tokens": 9256866.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007521102204918861,
+ "skip_count": 3.0,
+ "step": 5740,
+ "text_loss": 0.3094986379146576
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0004743214305681221,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9259790.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022241887636482716,
+ "skip_count": 1.0,
+ "step": 5742,
+ "text_loss": 0.5418204069137573
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.967420017610802,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.00047401232744334376,
+ "loss": 0.0071,
+ "macro_f1": 1.0,
+ "num_tokens": 9263205.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008611299097537994,
+ "skip_count": 2.0,
+ "step": 5744,
+ "text_loss": 0.35824623703956604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 26.976812444966246,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0004737032342770906,
+ "loss": 0.0062,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 9266126.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010788857005536556,
+ "skip_count": 2.0,
+ "step": 5746,
+ "text_loss": 0.2172674983739853
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0004733941511878074,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9269308.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005309196189045906,
+ "skip_count": 2.0,
+ "step": 5748,
+ "text_loss": 0.1696814000606537
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.995597299677137,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.00047308507829393594,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9272801.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009940510615706444,
+ "skip_count": 2.0,
+ "step": 5750,
+ "text_loss": 0.24295592308044434
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00047277601571391314,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9276197.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000687236781232059,
+ "skip_count": 0.0,
+ "step": 5752,
+ "text_loss": 0.8511804342269897
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.014088641033165,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00047246696356617254,
+ "loss": 0.0059,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 9278965.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009816894307732582,
+ "skip_count": 1.0,
+ "step": 5754,
+ "text_loss": 0.45420053601264954
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.0004721579219691434,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9282076.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015747188590466976,
+ "skip_count": 0.0,
+ "step": 5756,
+ "text_loss": 0.21671754121780396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0004718488910412511,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9285465.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008654040284454823,
+ "skip_count": 2.0,
+ "step": 5758,
+ "text_loss": 0.25920194387435913
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.00047153987090091674,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9288156.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011430777376517653,
+ "skip_count": 0.0,
+ "step": 5760,
+ "text_loss": 0.7655444741249084
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.051658350454947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0004712308616665576,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9291529.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003674200503155589,
+ "skip_count": 2.0,
+ "step": 5762,
+ "text_loss": 0.269486665725708
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0004709218634565866,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9294699.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003249827306717634,
+ "skip_count": 1.0,
+ "step": 5764,
+ "text_loss": 0.5073734521865845
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.070443205165834,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.00047061287638941235,
+ "loss": 0.0068,
+ "macro_f1": 1.0,
+ "num_tokens": 9297863.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002763139782473445,
+ "skip_count": 2.0,
+ "step": 5766,
+ "text_loss": 0.2572014033794403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 27.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.00047030390058343935,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9301124.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007100266870111227,
+ "skip_count": 3.0,
+ "step": 5768,
+ "text_loss": 0.4147387742996216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0186767578125,
+ "learning_rate": 0.0004699949361570676,
+ "loss": 0.0034,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9304330.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005467240232974291,
+ "skip_count": 1.0,
+ "step": 5770,
+ "text_loss": 0.21510964632034302
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.098620487232168,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.000469685983228693,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9306882.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003167890477925539,
+ "skip_count": 0.0,
+ "step": 5772,
+ "text_loss": 0.45717427134513855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.108012914587615,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00046937704191670675,
+ "loss": 0.0057,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 9309767.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.014881107024848461,
+ "skip_count": 2.0,
+ "step": 5774,
+ "text_loss": 0.3464985191822052
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.11740534194306,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0004690681123394959,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9313045.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00379011663608253,
+ "skip_count": 2.0,
+ "step": 5776,
+ "text_loss": 0.33194616436958313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.126797769298502,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00046875919461544265,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9315736.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016733441734686494,
+ "skip_count": 0.0,
+ "step": 5778,
+ "text_loss": 0.5009998679161072
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.00046845028886292493,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9318456.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005318894516676664,
+ "skip_count": 1.0,
+ "step": 5780,
+ "text_loss": 0.17702752351760864
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.145582624009393,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044921875,
+ "learning_rate": 0.00046814139520031615,
+ "loss": 0.006,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 9323152.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01133672520518303,
+ "skip_count": 2.0,
+ "step": 5782,
+ "text_loss": 0.2886650860309601
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.0004678325137459845,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9326318.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002458433620631695,
+ "skip_count": 0.0,
+ "step": 5784,
+ "text_loss": 0.5832745432853699
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0004675236446182946,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9329779.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005402310052886605,
+ "skip_count": 0.0,
+ "step": 5786,
+ "text_loss": 0.5699237585067749
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00046721478793560525,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9333360.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002638917067088187,
+ "skip_count": 0.0,
+ "step": 5788,
+ "text_loss": 0.6555714011192322
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00046690594381627106,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9336498.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003998351749032736,
+ "skip_count": 2.0,
+ "step": 5790,
+ "text_loss": 0.2076750248670578
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.192544760786618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.00046659711237864157,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9339724.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0045847659930586815,
+ "skip_count": 1.0,
+ "step": 5792,
+ "text_loss": 0.22027169167995453
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0186767578125,
+ "learning_rate": 0.00046628829374106167,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9342835.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014064523857086897,
+ "skip_count": 1.0,
+ "step": 5794,
+ "text_loss": 0.5120179057121277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0004659794880218712,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9346757.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011155207175761461,
+ "skip_count": 1.0,
+ "step": 5796,
+ "text_loss": 0.6415372490882874
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.220722042852948,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0004656706953394051,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9349652.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020385095849633217,
+ "skip_count": 0.0,
+ "step": 5798,
+ "text_loss": 0.5410398840904236
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 27.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0004653619158119933,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9354286.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0012847178149968386,
+ "skip_count": 0.0,
+ "step": 5800,
+ "text_loss": 0.4386860728263855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.00046505314955796074,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9357682.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035008061677217484,
+ "skip_count": 2.0,
+ "step": 5802,
+ "text_loss": 0.13655950129032135
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.248899324919282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00046474439669562715,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9361058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020033426117151976,
+ "skip_count": 1.0,
+ "step": 5804,
+ "text_loss": 0.6293444037437439
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00046443565734330714,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9364173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004935986362397671,
+ "skip_count": 0.0,
+ "step": 5806,
+ "text_loss": 0.2923166751861572
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0004641269316193104,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9366980.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001654456602409482,
+ "skip_count": 0.0,
+ "step": 5808,
+ "text_loss": 0.7273373007774353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0004638182196419411,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9370581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017011919990181923,
+ "skip_count": 0.0,
+ "step": 5810,
+ "text_loss": 0.6029995083808899
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 27.286469034341064,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0004635095215294984,
+ "loss": 0.0072,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 9374233.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01361197978258133,
+ "skip_count": 3.0,
+ "step": 5812,
+ "text_loss": 0.14051523804664612
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.00046320083740027584,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9377217.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004597014281898737,
+ "skip_count": 0.0,
+ "step": 5814,
+ "text_loss": 0.2766880691051483
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 27.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00046289216737256184,
+ "loss": 0.0041,
+ "macro_f1": 1.0,
+ "num_tokens": 9380336.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.006628422066569328,
+ "skip_count": 1.0,
+ "step": 5816,
+ "text_loss": 0.8092381954193115
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.314646316407398,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.0004625835115646393,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9382968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002737772185355425,
+ "skip_count": 0.0,
+ "step": 5818,
+ "text_loss": 0.22090643644332886
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 27.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0004622748700947856,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 9386203.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004552177153527737,
+ "skip_count": 1.0,
+ "step": 5820,
+ "text_loss": 0.42869850993156433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0004619662430812729,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9388968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003149240743368864,
+ "skip_count": 2.0,
+ "step": 5822,
+ "text_loss": 0.45137661695480347
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0004616576306423677,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9392487.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008133690571412444,
+ "skip_count": 0.0,
+ "step": 5824,
+ "text_loss": 0.638685941696167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0004613490328963307,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9395665.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00042717234464362264,
+ "skip_count": 0.0,
+ "step": 5826,
+ "text_loss": 0.8134317398071289
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.00046104044996141716,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9398831.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0084775285795331,
+ "skip_count": 2.0,
+ "step": 5828,
+ "text_loss": 0.19263958930969238
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 27.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0004607318819558768,
+ "loss": 0.0087,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9403118.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030239911284297705,
+ "skip_count": 0.0,
+ "step": 5830,
+ "text_loss": 0.45556432008743286
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 27.38039330789551,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 0.00046042332899795313,
+ "loss": 0.0075,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 9406206.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026389889419078827,
+ "skip_count": 2.0,
+ "step": 5832,
+ "text_loss": 0.26458361744880676
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.0004601147912058845,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9409806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013476534513756633,
+ "skip_count": 0.0,
+ "step": 5834,
+ "text_loss": 0.7443689107894897
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.399178162606397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0004598062686979033,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9412737.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004275512881577015,
+ "skip_count": 1.0,
+ "step": 5836,
+ "text_loss": 0.2808683514595032
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00045949776159223563,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9415818.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027225434314459562,
+ "skip_count": 0.0,
+ "step": 5838,
+ "text_loss": 0.6283587217330933
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.0004591892700071022,
+ "loss": 0.0056,
+ "macro_f1": 1.0,
+ "num_tokens": 9419119.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01574302278459072,
+ "skip_count": 2.0,
+ "step": 5840,
+ "text_loss": 0.33239027857780457
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.00045888079406071746,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9422257.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007227854221127927,
+ "skip_count": 0.0,
+ "step": 5842,
+ "text_loss": 0.6658740043640137
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.00045857233387129,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9425071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020696306601166725,
+ "skip_count": 2.0,
+ "step": 5844,
+ "text_loss": 0.5773820877075195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.0004582638895570224,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9427980.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019764541648328304,
+ "skip_count": 0.0,
+ "step": 5846,
+ "text_loss": 0.3388919532299042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.455532726739065,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.000457955461236111,
+ "loss": 0.0058,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9430733.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04235004261136055,
+ "skip_count": 0.0,
+ "step": 5848,
+ "text_loss": 0.44346582889556885
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.464925154094512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0004576470490267462,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9433347.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000801609072368592,
+ "skip_count": 0.0,
+ "step": 5850,
+ "text_loss": 0.5825944542884827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.0004573386530471121,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9436172.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018224078230559826,
+ "skip_count": 2.0,
+ "step": 5852,
+ "text_loss": 0.8111652135848999
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.0004570302734153866,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9439040.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006614950485527515,
+ "skip_count": 2.0,
+ "step": 5854,
+ "text_loss": 0.31270334124565125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05859375,
+ "learning_rate": 0.0004567219102497412,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9442138.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012984242057427764,
+ "skip_count": 0.0,
+ "step": 5856,
+ "text_loss": 0.6126856803894043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0004564135636683416,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9445600.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008388847345486283,
+ "skip_count": 0.0,
+ "step": 5858,
+ "text_loss": 0.8526380658149719
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046875,
+ "learning_rate": 0.0004561052337893467,
+ "loss": 0.0108,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9449609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008125773631036282,
+ "skip_count": 2.0,
+ "step": 5860,
+ "text_loss": 0.2843833863735199
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.000455796920730909,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9452756.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019371749367564917,
+ "skip_count": 0.0,
+ "step": 5862,
+ "text_loss": 0.5293750166893005
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0004554886246111746,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 9455467.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005594742484390736,
+ "skip_count": 2.0,
+ "step": 5864,
+ "text_loss": 0.572329044342041
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 27.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0004551803455482833,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9458953.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005960086826235056,
+ "skip_count": 3.0,
+ "step": 5866,
+ "text_loss": 0.19459208846092224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00045487208366036807,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9462130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034781871363520622,
+ "skip_count": 1.0,
+ "step": 5868,
+ "text_loss": 0.20467053353786469
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.00045456383906555554,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9465590.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012246103724464774,
+ "skip_count": 0.0,
+ "step": 5870,
+ "text_loss": 0.6086251735687256
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00045425561188196565,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9468092.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002874316181987524,
+ "skip_count": 1.0,
+ "step": 5872,
+ "text_loss": 0.3430633544921875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.57763428235985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0004539474022277115,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9471433.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004340244457125664,
+ "skip_count": 2.0,
+ "step": 5874,
+ "text_loss": 0.28219133615493774
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.0004536392102208997,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9474363.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007322742021642625,
+ "skip_count": 0.0,
+ "step": 5876,
+ "text_loss": 0.7305856943130493
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.0004533310359796299,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9478469.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018631393322721124,
+ "skip_count": 0.0,
+ "step": 5878,
+ "text_loss": 0.5821442604064941
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 28.0,
+ "epoch": 27.60581156442618,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0004530228796219952,
+ "loss": 0.0088,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 9481200.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.026109615340828896,
+ "skip_count": 3.0,
+ "step": 5880,
+ "text_loss": 0.3962891101837158
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.00045271474126608167,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9484200.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004716445691883564,
+ "skip_count": 0.0,
+ "step": 5882,
+ "text_loss": 0.31901776790618896
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0004524066210299685,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9488939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003797562967520207,
+ "skip_count": 0.0,
+ "step": 5884,
+ "text_loss": 0.3992912471294403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.0004520985190317279,
+ "loss": 0.0032,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9492010.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005681614391505718,
+ "skip_count": 1.0,
+ "step": 5886,
+ "text_loss": 0.5318995118141174
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0004517904353894253,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9494770.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021422000136226416,
+ "skip_count": 0.0,
+ "step": 5888,
+ "text_loss": 0.435088187456131
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.652773701203404,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0004514823702211187,
+ "loss": 0.0052,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 9497327.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01593884639441967,
+ "skip_count": 2.0,
+ "step": 5890,
+ "text_loss": 0.5068450570106506
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.662166128558848,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.00045117432364485927,
+ "loss": 0.0075,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 9500488.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0729660913348198,
+ "skip_count": 2.0,
+ "step": 5892,
+ "text_loss": 0.42718732357025146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.00045086629577869127,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9503593.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007092897780239582,
+ "skip_count": 2.0,
+ "step": 5894,
+ "text_loss": 0.4264345169067383
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.00045055828674065134,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9507188.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004088073968887329,
+ "skip_count": 2.0,
+ "step": 5896,
+ "text_loss": 0.20932413637638092
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 27.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.00045025029664876926,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9510126.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0026970503386110067,
+ "skip_count": 0.0,
+ "step": 5898,
+ "text_loss": 0.47661110758781433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0164794921875,
+ "learning_rate": 0.0004499423256210673,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9513891.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003428407246246934,
+ "skip_count": 0.0,
+ "step": 5900,
+ "text_loss": 0.18232668936252594
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.00044963437377556066,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9516718.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020270352251827717,
+ "skip_count": 0.0,
+ "step": 5902,
+ "text_loss": 0.16833586990833282
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.000449326441230257,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9520248.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019144838443025947,
+ "skip_count": 0.0,
+ "step": 5904,
+ "text_loss": 0.44434574246406555
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.00044901852810315634,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9523651.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0044578867964446545,
+ "skip_count": 2.0,
+ "step": 5906,
+ "text_loss": 0.1248839721083641
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.0004487106345122522,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9527235.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000827222247608006,
+ "skip_count": 0.0,
+ "step": 5908,
+ "text_loss": 0.6052893996238708
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 27.74669797475785,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0004484027605755296,
+ "loss": 0.0065,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 9530407.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.029739778488874435,
+ "skip_count": 0.0,
+ "step": 5910,
+ "text_loss": 0.7625715732574463
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 0.00044809490641096653,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9533229.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025658784434199333,
+ "skip_count": 0.0,
+ "step": 5912,
+ "text_loss": 0.27842655777931213
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 27.76548282946874,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.00044778707213653324,
+ "loss": 0.0069,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 9537397.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010157953947782516,
+ "skip_count": 3.0,
+ "step": 5914,
+ "text_loss": 0.45196083188056946
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0004474792578701924,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 9540564.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.011994685977697372,
+ "skip_count": 5.0,
+ "step": 5916,
+ "text_loss": 0.22617442905902863
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.784267684179632,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.000447171463729899,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9543602.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022214490454643965,
+ "skip_count": 0.0,
+ "step": 5918,
+ "text_loss": 0.5089073777198792
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0004468636898336003,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 9546829.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009353389963507652,
+ "skip_count": 2.0,
+ "step": 5920,
+ "text_loss": 0.7560386657714844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.057373046875,
+ "learning_rate": 0.00044655593629923596,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9550259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005637963302433491,
+ "skip_count": 0.0,
+ "step": 5922,
+ "text_loss": 0.17084793746471405
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.812444966245963,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.00044624820324473766,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 9554376.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008556432090699673,
+ "skip_count": 2.0,
+ "step": 5924,
+ "text_loss": 0.5906872749328613
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 27.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0004459404907880292,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9558348.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016659445827826858,
+ "skip_count": 0.0,
+ "step": 5926,
+ "text_loss": 0.8197194933891296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 27.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.00044563279904702674,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9561139.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01341368816792965,
+ "skip_count": 3.0,
+ "step": 5928,
+ "text_loss": 0.3264874815940857
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.840622248312297,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 0.000445325128139638,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9564387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005023977253586054,
+ "skip_count": 2.0,
+ "step": 5930,
+ "text_loss": 0.9055862426757812
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0004450174781837635,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9567053.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006051476229913533,
+ "skip_count": 0.0,
+ "step": 5932,
+ "text_loss": 0.6908539533615112
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0004447098492972951,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9570036.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003152312943711877,
+ "skip_count": 0.0,
+ "step": 5934,
+ "text_loss": 0.6321061849594116
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 0.0004444022415981167,
+ "loss": 0.0094,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9574146.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004859412554651499,
+ "skip_count": 1.0,
+ "step": 5936,
+ "text_loss": 0.5905604958534241
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 27.878191957734078,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.00044409465520410426,
+ "loss": 0.0071,
+ "macro_f1": 1.0,
+ "num_tokens": 9577071.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004376287572085857,
+ "skip_count": 1.0,
+ "step": 5938,
+ "text_loss": 0.6928377747535706
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.00044378709023312535,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9580537.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004038849379867315,
+ "skip_count": 1.0,
+ "step": 5940,
+ "text_loss": 0.2686770558357239
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0004434795468030396,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9583225.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005459951236844063,
+ "skip_count": 2.0,
+ "step": 5942,
+ "text_loss": 0.16855180263519287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.906369239800412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.000443172025031698,
+ "loss": 0.0037,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9586018.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032985717989504337,
+ "skip_count": 2.0,
+ "step": 5944,
+ "text_loss": 0.20335732400417328
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 27.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0004428645250369437,
+ "loss": 0.0037,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9589321.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003573323367163539,
+ "skip_count": 0.0,
+ "step": 5946,
+ "text_loss": 0.6318653225898743
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.00044255704693661117,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9592518.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002226749900728464,
+ "skip_count": 0.0,
+ "step": 5948,
+ "text_loss": 0.5320658683776855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.934546521866746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0004422495908485265,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9595664.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007805621717125177,
+ "skip_count": 0.0,
+ "step": 5950,
+ "text_loss": 0.6330106258392334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0004419421568905077,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9598885.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017050127498805523,
+ "skip_count": 0.0,
+ "step": 5952,
+ "text_loss": 0.6098045706748962
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00044163474518036375,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9603021.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025974081363528967,
+ "skip_count": 0.0,
+ "step": 5954,
+ "text_loss": 0.2655932903289795
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.00044132735583589567,
+ "loss": 0.0072,
+ "macro_f1": 1.0,
+ "num_tokens": 9605841.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010364850051701069,
+ "skip_count": 2.0,
+ "step": 5956,
+ "text_loss": 0.3028552532196045
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 27.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.015869140625,
+ "learning_rate": 0.00044101998897489553,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9608810.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015063622267916799,
+ "skip_count": 0.0,
+ "step": 5958,
+ "text_loss": 0.5602094531059265
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 27.981508658643968,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.00044071264471514683,
+ "loss": 0.0051,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 9611995.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011538165621459484,
+ "skip_count": 3.0,
+ "step": 5960,
+ "text_loss": 0.14332173764705658
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00044040532317442455,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9615434.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004693889059126377,
+ "skip_count": 0.0,
+ "step": 5962,
+ "text_loss": 0.334369033575058
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 28.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.00044009802447049474,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 9618056.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0045085870660841465,
+ "skip_count": 1.0,
+ "step": 5964,
+ "text_loss": 0.8163170218467712
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.00043979074872111507,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9621428.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018220023484900594,
+ "skip_count": 0.0,
+ "step": 5966,
+ "text_loss": 0.2513850927352905
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0004394834960440341,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 9625433.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.007051277905702591,
+ "skip_count": 5.0,
+ "step": 5968,
+ "text_loss": 0.6263421177864075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.00043917626655699154,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9629508.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006454752874560654,
+ "skip_count": 0.0,
+ "step": 5970,
+ "text_loss": 0.645618736743927
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.037569709421778,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0004388690603777184,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9632504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004847112577408552,
+ "skip_count": 1.0,
+ "step": 5972,
+ "text_loss": 0.47306978702545166
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.00043856187762393665,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9636685.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006580828921869397,
+ "skip_count": 0.0,
+ "step": 5974,
+ "text_loss": 0.42226532101631165
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0004382547184133593,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9639958.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002188180573284626,
+ "skip_count": 0.0,
+ "step": 5976,
+ "text_loss": 0.4456600248813629
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0004379475828636901,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 9643228.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017135308589786291,
+ "skip_count": 2.0,
+ "step": 5978,
+ "text_loss": 0.6295822262763977
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.07513941884356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0004376404710926244,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9646746.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008841048111207783,
+ "skip_count": 0.0,
+ "step": 5980,
+ "text_loss": 0.5102712512016296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.00043733338321784784,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9649452.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006229099817574024,
+ "skip_count": 0.0,
+ "step": 5982,
+ "text_loss": 0.6944046020507812
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.000437026319357037,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9652700.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005293759983032942,
+ "skip_count": 2.0,
+ "step": 5984,
+ "text_loss": 0.6748214960098267
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00043671927962785946,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9655825.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013537590857595205,
+ "skip_count": 0.0,
+ "step": 5986,
+ "text_loss": 1.000306248664856
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0004364122641479733,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9658713.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004548195283859968,
+ "skip_count": 0.0,
+ "step": 5988,
+ "text_loss": 0.24580086767673492
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 28.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0004361052730350275,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9661535.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011149964295327663,
+ "skip_count": 4.0,
+ "step": 5990,
+ "text_loss": 0.5737863779067993
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 28.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00043579830640666154,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 9664406.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003783488878980279,
+ "skip_count": 1.0,
+ "step": 5992,
+ "text_loss": 0.7836558222770691
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.00043549136438050573,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9669050.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0050374288111925125,
+ "skip_count": 1.0,
+ "step": 5994,
+ "text_loss": 0.13072487711906433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.00043518444707418076,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9672698.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004047670867294073,
+ "skip_count": 2.0,
+ "step": 5996,
+ "text_loss": 0.4748993217945099
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.00043487755460529796,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9676159.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008628991432487965,
+ "skip_count": 2.0,
+ "step": 5998,
+ "text_loss": 0.1921990066766739
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 28.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00043457068709145904,
+ "loss": 0.0072,
+ "macro_f1": 1.0,
+ "num_tokens": 9679528.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.01094671618193388,
+ "skip_count": 3.0,
+ "step": 6000,
+ "text_loss": 0.3651769459247589
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 28.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 0.00043426384465025604,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9682677.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0011284075444564223,
+ "skip_count": 0.0,
+ "step": 6002,
+ "text_loss": 0.28305181860923767
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.187848547108892,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.000433957027399272,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9685310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030473743099719286,
+ "skip_count": 1.0,
+ "step": 6004,
+ "text_loss": 0.3650054931640625
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.19724097446434,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.00043365023545607965,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 9687944.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011621905490756035,
+ "skip_count": 2.0,
+ "step": 6006,
+ "text_loss": 0.5409000515937805
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0004333434689382423,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9690932.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005297541501931846,
+ "skip_count": 0.0,
+ "step": 6008,
+ "text_loss": 0.4311029314994812
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.216025829175226,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.00043303672796331336,
+ "loss": 0.0058,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9693972.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06166421249508858,
+ "skip_count": 0.0,
+ "step": 6010,
+ "text_loss": 0.2658997178077698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.00043273001264883655,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9697712.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018419031985104084,
+ "skip_count": 0.0,
+ "step": 6012,
+ "text_loss": 0.5813497304916382
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0004324233231123458,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9700746.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003635555040091276,
+ "skip_count": 0.0,
+ "step": 6014,
+ "text_loss": 0.24211904406547546
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 28.24420311124156,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0004321166594713651,
+ "loss": 0.0048,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 9704087.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021067705005407333,
+ "skip_count": 2.0,
+ "step": 6016,
+ "text_loss": 0.5908042788505554
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00043181002184340857,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9708695.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008712753187865019,
+ "skip_count": 0.0,
+ "step": 6018,
+ "text_loss": 0.7788549661636353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.26298796595245,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0004315034103459803,
+ "loss": 0.0054,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9711631.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03231092542409897,
+ "skip_count": 0.0,
+ "step": 6020,
+ "text_loss": 0.6127741932868958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.0004311968250965743,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9715526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020149527117609978,
+ "skip_count": 2.0,
+ "step": 6022,
+ "text_loss": 0.49970078468322754
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0004308902662126748,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9718475.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031795913819223642,
+ "skip_count": 0.0,
+ "step": 6024,
+ "text_loss": 0.3254713714122772
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.291165248018785,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.00043058373381175567,
+ "loss": 0.004,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9722194.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0148378387093544,
+ "skip_count": 1.0,
+ "step": 6026,
+ "text_loss": 0.17670343816280365
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0004302772280112806,
+ "loss": 0.0076,
+ "macro_f1": 1.0,
+ "num_tokens": 9725489.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005742347799241543,
+ "skip_count": 2.0,
+ "step": 6028,
+ "text_loss": 0.26184776425361633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.309950102729672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.00042997074892870335,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9729416.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023561837151646614,
+ "skip_count": 0.0,
+ "step": 6030,
+ "text_loss": 0.3026008605957031
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0004296642966814673,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9732559.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010108393616974354,
+ "skip_count": 1.0,
+ "step": 6032,
+ "text_loss": 0.43198078870773315
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 28.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.00042935787138700525,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 9736324.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005443581845611334,
+ "skip_count": 2.0,
+ "step": 6034,
+ "text_loss": 0.24883155524730682
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0004290514731627403,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 9739630.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010645060800015926,
+ "skip_count": 2.0,
+ "step": 6036,
+ "text_loss": 0.24207182228565216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018798828125,
+ "learning_rate": 0.0004287451021260846,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9742221.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008162845042534173,
+ "skip_count": 0.0,
+ "step": 6038,
+ "text_loss": 0.33018553256988525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0004284387583944403,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9744925.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003782407147809863,
+ "skip_count": 1.0,
+ "step": 6040,
+ "text_loss": 0.6600399613380432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0004281324420851987,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9748103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009834285592660308,
+ "skip_count": 0.0,
+ "step": 6042,
+ "text_loss": 0.6402350664138794
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.375697094217788,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0004278261533157409,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9751128.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004100334830582142,
+ "skip_count": 2.0,
+ "step": 6044,
+ "text_loss": 0.1545136719942093
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.38508952157323,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0004275198922034372,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9754140.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017166603356599808,
+ "skip_count": 1.0,
+ "step": 6046,
+ "text_loss": 0.5875935554504395
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.394481948928675,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.00042721365886564766,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 9756945.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00915827602148056,
+ "skip_count": 2.0,
+ "step": 6048,
+ "text_loss": 0.3885214328765869
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.403874376284122,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.00042690745341972134,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9759738.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0057020667009055614,
+ "skip_count": 2.0,
+ "step": 6050,
+ "text_loss": 0.3107164204120636
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.00042660127598299647,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9762987.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004196313209831715,
+ "skip_count": 2.0,
+ "step": 6052,
+ "text_loss": 0.3073577582836151
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00042629512667280135,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9765828.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023119752295315266,
+ "skip_count": 1.0,
+ "step": 6054,
+ "text_loss": 0.8228643536567688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.432051658350456,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0004259890056064527,
+ "loss": 0.009,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9769129.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021007524337619543,
+ "skip_count": 1.0,
+ "step": 6056,
+ "text_loss": 0.8334706425666809
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0004256829129012568,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 9771821.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00671970471739769,
+ "skip_count": 2.0,
+ "step": 6058,
+ "text_loss": 0.17845536768436432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00042537684867450875,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9774566.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014770646812394261,
+ "skip_count": 0.0,
+ "step": 6060,
+ "text_loss": 0.4445459246635437
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 28.46022894041679,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00042507081304349315,
+ "loss": 0.0067,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 9777909.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.014822427183389664,
+ "skip_count": 0.0,
+ "step": 6062,
+ "text_loss": 0.45526158809661865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0004247648061254833,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9781159.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00568385748192668,
+ "skip_count": 1.0,
+ "step": 6064,
+ "text_loss": 0.18535588681697845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.479013795127678,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.00042445882803774173,
+ "loss": 0.0046,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9784960.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0179694052785635,
+ "skip_count": 0.0,
+ "step": 6066,
+ "text_loss": 0.23591181635856628
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.488406222483125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00042415287889751966,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9787941.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019039154285565019,
+ "skip_count": 0.0,
+ "step": 6068,
+ "text_loss": 0.9447930455207825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0004238469588220575,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9791096.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004039563238620758,
+ "skip_count": 0.0,
+ "step": 6070,
+ "text_loss": 0.3134256601333618
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.00042354106792858446,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9794082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018352365586906672,
+ "skip_count": 0.0,
+ "step": 6072,
+ "text_loss": 0.5681536197662354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.00042323520633431833,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9797303.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019325513858348131,
+ "skip_count": 0.0,
+ "step": 6074,
+ "text_loss": 0.2835809290409088
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00042292937415646574,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9800435.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002513401210308075,
+ "skip_count": 0.0,
+ "step": 6076,
+ "text_loss": 0.1931663602590561
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00042262357151222265,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9803873.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004864581860601902,
+ "skip_count": 0.0,
+ "step": 6078,
+ "text_loss": 0.25809767842292786
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 28.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0004223177985187728,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9806438.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004932792857289314,
+ "skip_count": 0.0,
+ "step": 6080,
+ "text_loss": 0.6409249305725098
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00042201205529328925,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9809400.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00590938376262784,
+ "skip_count": 1.0,
+ "step": 6082,
+ "text_loss": 0.31158050894737244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.00042170634195293314,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9813246.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006805860437452793,
+ "skip_count": 0.0,
+ "step": 6084,
+ "text_loss": 0.32945963740348816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0004214006586148545,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9816513.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010186503641307354,
+ "skip_count": 0.0,
+ "step": 6086,
+ "text_loss": 0.48659923672676086
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.0004210950053961917,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9819908.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00402973173186183,
+ "skip_count": 1.0,
+ "step": 6088,
+ "text_loss": 0.6249601244926453
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.00042078938241407174,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9822950.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00236532068811357,
+ "skip_count": 1.0,
+ "step": 6090,
+ "text_loss": 0.26589256525039673
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.601115350748458,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0004204837897856098,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 9826493.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003072192659601569,
+ "skip_count": 2.0,
+ "step": 6092,
+ "text_loss": 0.5216912627220154
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.610507778103905,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.0004201782276279096,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9829698.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027553171385079622,
+ "skip_count": 1.0,
+ "step": 6094,
+ "text_loss": 0.40127676725387573
+ },
+ {
+ "acc_repeat": 0.75,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.61990020545935,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 0.8571428656578064,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00041987269605806325,
+ "loss": 0.0045,
+ "macro_f1": 0.9442509412765503,
+ "num_tokens": 9833719.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.013845407404005527,
+ "skip_count": 4.0,
+ "step": 6096,
+ "text_loss": 0.23114071786403656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.629292632814792,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0004195671951931509,
+ "loss": 0.0116,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9838235.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019887303933501244,
+ "skip_count": 2.0,
+ "step": 6098,
+ "text_loss": 0.7467341423034668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0004192617251502409,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9840867.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007213905337266624,
+ "skip_count": 0.0,
+ "step": 6100,
+ "text_loss": 0.6283472180366516
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.00041895628604639036,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9843827.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003863139310851693,
+ "skip_count": 1.0,
+ "step": 6102,
+ "text_loss": 0.3602744936943054
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.00041865087799864374,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9846939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013336286647245288,
+ "skip_count": 0.0,
+ "step": 6104,
+ "text_loss": 0.4182434678077698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.666862342236573,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.0004183455011240341,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9849827.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00038455065805464983,
+ "skip_count": 0.0,
+ "step": 6106,
+ "text_loss": 0.7122722864151001
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 28.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0004180401555395826,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 9853487.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.0038226440083235502,
+ "skip_count": 1.0,
+ "step": 6108,
+ "text_loss": 0.2521185576915741
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0004177348413622981,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9856321.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015809801407158375,
+ "skip_count": 0.0,
+ "step": 6110,
+ "text_loss": 0.423979252576828
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0004174295587091776,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9859238.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007586454739794135,
+ "skip_count": 0.0,
+ "step": 6112,
+ "text_loss": 0.4720100462436676
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 28.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00041712430769720593,
+ "loss": 0.0091,
+ "macro_f1": 1.0,
+ "num_tokens": 9862282.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0045816488564014435,
+ "skip_count": 1.0,
+ "step": 6114,
+ "text_loss": 0.279577374458313
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 28.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0004168190884433559,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 9865394.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004728195257484913,
+ "skip_count": 1.0,
+ "step": 6116,
+ "text_loss": 0.3826395571231842
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 28.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0179443359375,
+ "learning_rate": 0.0004165139010645881,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9869165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006160226184874773,
+ "skip_count": 3.0,
+ "step": 6118,
+ "text_loss": 0.4668935537338257
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 24.0,
+ "epoch": 28.732609333724685,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 0.0004162087456778509,
+ "loss": 0.0074,
+ "macro_f1": 0.9619450569152832,
+ "num_tokens": 9872381.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.027831824496388435,
+ "skip_count": 6.0,
+ "step": 6120,
+ "text_loss": 0.28708913922309875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0004159036224000804,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9875668.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030764432158321142,
+ "skip_count": 1.0,
+ "step": 6122,
+ "text_loss": 0.37078607082366943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.751394188435572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0004155985313482002,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9878533.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00043521137558855116,
+ "skip_count": 0.0,
+ "step": 6124,
+ "text_loss": 0.34975379705429077
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00041529347263912224,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9881478.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016251741908490658,
+ "skip_count": 0.0,
+ "step": 6126,
+ "text_loss": 0.39166271686553955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.770179043146463,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00041498844638974535,
+ "loss": 0.005,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9884252.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.019553523510694504,
+ "skip_count": 0.0,
+ "step": 6128,
+ "text_loss": 0.2309480905532837
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 28.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0004146834527169562,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9887485.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0036251386627554893,
+ "skip_count": 0.0,
+ "step": 6130,
+ "text_loss": 0.4464457631111145
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.00041437849173762894,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9890711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008515548543073237,
+ "skip_count": 0.0,
+ "step": 6132,
+ "text_loss": 0.5012133717536926
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 28.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0004140735635686251,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9894458.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001084602321498096,
+ "skip_count": 0.0,
+ "step": 6134,
+ "text_loss": 0.32015663385391235
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0004137686683267938,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9897634.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025203595869243145,
+ "skip_count": 0.0,
+ "step": 6136,
+ "text_loss": 0.15804508328437805
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.817141179923688,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0004134638061289715,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9901157.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029381231870502234,
+ "skip_count": 0.0,
+ "step": 6138,
+ "text_loss": 0.14375236630439758
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.0004131589770919819,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9903958.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002789110178127885,
+ "skip_count": 0.0,
+ "step": 6140,
+ "text_loss": 0.2474033683538437
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0004128541813326361,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 9906799.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010770512744784355,
+ "skip_count": 3.0,
+ "step": 6142,
+ "text_loss": 0.2304249256849289
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 28.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0004125494189677325,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9909286.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003122122259810567,
+ "skip_count": 0.0,
+ "step": 6144,
+ "text_loss": 0.3781827688217163
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 28.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.00041224469011405643,
+ "loss": 0.0045,
+ "macro_f1": 1.0,
+ "num_tokens": 9912416.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008443298749625683,
+ "skip_count": 1.0,
+ "step": 6146,
+ "text_loss": 0.3004767596721649
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0004119399948883806,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9915290.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033219947945326567,
+ "skip_count": 1.0,
+ "step": 6148,
+ "text_loss": 0.748744547367096
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 28.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.0004116353334074647,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9918493.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005501769948750734,
+ "skip_count": 0.0,
+ "step": 6150,
+ "text_loss": 0.330759733915329
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.000411330705788056,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9921027.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013694261433556676,
+ "skip_count": 0.0,
+ "step": 6152,
+ "text_loss": 0.43070924282073975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0164794921875,
+ "learning_rate": 0.000411026112146888,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9924303.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00046192589798010886,
+ "skip_count": 0.0,
+ "step": 6154,
+ "text_loss": 0.5674887895584106
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 28.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0004107215526006817,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9927065.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004311304073780775,
+ "skip_count": 0.0,
+ "step": 6156,
+ "text_loss": 0.16138267517089844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0004104170272661449,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9930713.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035845425445586443,
+ "skip_count": 0.0,
+ "step": 6158,
+ "text_loss": 0.18728356063365936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.00041011253625997227,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9934393.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00247366214171052,
+ "skip_count": 0.0,
+ "step": 6160,
+ "text_loss": 0.3624019920825958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0004098080796988452,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9937457.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003240241203457117,
+ "skip_count": 0.0,
+ "step": 6162,
+ "text_loss": 0.12348521500825882
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.0004095036576994321,
+ "loss": 0.0035,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9940523.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001985874492675066,
+ "skip_count": 1.0,
+ "step": 6164,
+ "text_loss": 0.2688066363334656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 28.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.00040919927037838815,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9943802.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004264154937118292,
+ "skip_count": 3.0,
+ "step": 6166,
+ "text_loss": 0.49316367506980896
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0556640625,
+ "learning_rate": 0.00040889491785235513,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9946649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002545441733673215,
+ "skip_count": 0.0,
+ "step": 6168,
+ "text_loss": 0.4079313576221466
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.967420017610802,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0004085906002379614,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9949800.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009590961271896958,
+ "skip_count": 0.0,
+ "step": 6170,
+ "text_loss": 0.6166561245918274
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0004082863176518221,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9954008.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003795337164774537,
+ "skip_count": 2.0,
+ "step": 6172,
+ "text_loss": 0.4791361689567566
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044921875,
+ "learning_rate": 0.0004079820702105388,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9957153.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015634822193533182,
+ "skip_count": 0.0,
+ "step": 6174,
+ "text_loss": 0.7208777666091919
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.995597299677137,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0004076778580306999,
+ "loss": 0.0056,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 9960060.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03223998099565506,
+ "skip_count": 2.0,
+ "step": 6176,
+ "text_loss": 0.6617992520332336
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.00040737368122887983,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9963396.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033978577703237534,
+ "skip_count": 0.0,
+ "step": 6178,
+ "text_loss": 0.7339215278625488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.014088641033165,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00040706953992164,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9966364.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005358994239941239,
+ "skip_count": 0.0,
+ "step": 6180,
+ "text_loss": 0.44187214970588684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00040676543422552767,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9969813.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018544091144576669,
+ "skip_count": 1.0,
+ "step": 6182,
+ "text_loss": 0.6244927048683167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0004064613642570769,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9973015.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005692692007869482,
+ "skip_count": 0.0,
+ "step": 6184,
+ "text_loss": 0.18860043585300446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00040615733013280784,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9976201.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018737476784735918,
+ "skip_count": 0.0,
+ "step": 6186,
+ "text_loss": 0.21189232170581818
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.051658350454947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.00040585333196922687,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9979711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011945146135985851,
+ "skip_count": 2.0,
+ "step": 6188,
+ "text_loss": 0.2628154456615448
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.00040554936988282663,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9983003.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036045778542757034,
+ "skip_count": 1.0,
+ "step": 6190,
+ "text_loss": 0.5926038026809692
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.070443205165834,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0004052454439900861,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9986841.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004170368425548077,
+ "skip_count": 0.0,
+ "step": 6192,
+ "text_loss": 0.3088737726211548
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00040494155440747015,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9989596.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002254750579595566,
+ "skip_count": 2.0,
+ "step": 6194,
+ "text_loss": 0.6309700012207031
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 29.089228059876724,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.00040463770125142987,
+ "loss": 0.0087,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 9992789.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04092822223901749,
+ "skip_count": 4.0,
+ "step": 6196,
+ "text_loss": 0.09625697880983353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.098620487232168,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.00040433388463840213,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9995782.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00029065192211419344,
+ "skip_count": 0.0,
+ "step": 6198,
+ "text_loss": 0.5600258111953735
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.108012914587615,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0004040301046848105,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9998712.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005865268758498132,
+ "skip_count": 0.0,
+ "step": 6200,
+ "text_loss": 0.6426429748535156
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 29.11740534194306,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.0004037263615070638,
+ "loss": 0.0078,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 10002020.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.025357060134410858,
+ "skip_count": 3.0,
+ "step": 6202,
+ "text_loss": 0.25125735998153687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.126797769298502,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.000403422655221557,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10005381.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003139561740681529,
+ "skip_count": 1.0,
+ "step": 6204,
+ "text_loss": 0.3639419376850128
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.00040311898594467085,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10008348.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004091196693480015,
+ "skip_count": 2.0,
+ "step": 6206,
+ "text_loss": 0.1602363884449005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00040281535379277204,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10011171.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005771483760327101,
+ "skip_count": 0.0,
+ "step": 6208,
+ "text_loss": 0.5593504905700684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.000402511758882213,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10014374.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005212264601141214,
+ "skip_count": 1.0,
+ "step": 6210,
+ "text_loss": 0.15668229758739471
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0004022082013293319,
+ "loss": 0.0032,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10017327.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027585842180997133,
+ "skip_count": 1.0,
+ "step": 6212,
+ "text_loss": 0.21188466250896454
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.173759906075727,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.00040190468125045255,
+ "loss": 0.0061,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 10020518.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013210589066147804,
+ "skip_count": 1.0,
+ "step": 6214,
+ "text_loss": 0.2551073729991913
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 29.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01708984375,
+ "learning_rate": 0.00040160119876188436,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10023799.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001590219559147954,
+ "skip_count": 0.0,
+ "step": 6216,
+ "text_loss": 0.5634782314300537
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.192544760786618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0004012977539799224,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10027107.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003917343448847532,
+ "skip_count": 0.0,
+ "step": 6218,
+ "text_loss": 0.6412819027900696
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.0004009943470208473,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10030460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00874288845807314,
+ "skip_count": 2.0,
+ "step": 6220,
+ "text_loss": 0.13269923627376556
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.211329615497505,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.000400690978000925,
+ "loss": 0.0075,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 10034086.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03736349940299988,
+ "skip_count": 3.0,
+ "step": 6222,
+ "text_loss": 0.4956454336643219
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.220722042852948,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0004003876470364075,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10037312.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008481289260089397,
+ "skip_count": 2.0,
+ "step": 6224,
+ "text_loss": 0.2148810178041458
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0152587890625,
+ "learning_rate": 0.0004000843542435315,
+ "loss": 0.0028,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10040393.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002235144842416048,
+ "skip_count": 0.0,
+ "step": 6226,
+ "text_loss": 0.17645306885242462
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 29.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.0003997810997385195,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10044386.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004541373811662197,
+ "skip_count": 0.0,
+ "step": 6228,
+ "text_loss": 0.5098661184310913
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.248899324919282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.00039947788363757915,
+ "loss": 0.0088,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10049046.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019183673430234194,
+ "skip_count": 1.0,
+ "step": 6230,
+ "text_loss": 0.6953724026679993
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.00039917470605690334,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 10051787.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0032311067916452885,
+ "skip_count": 4.0,
+ "step": 6232,
+ "text_loss": 0.475127637386322
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 29.267684179630173,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.00039887156711267043,
+ "loss": 0.0079,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 10055396.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03247373178601265,
+ "skip_count": 0.0,
+ "step": 6234,
+ "text_loss": 0.4239100515842438
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 29.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.00039856846692104363,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10058395.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006287421099841595,
+ "skip_count": 3.0,
+ "step": 6236,
+ "text_loss": 0.24084535241127014
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 29.286469034341064,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.016357421875,
+ "learning_rate": 0.0003982654055981718,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 10061302.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0008686117362231016,
+ "skip_count": 1.0,
+ "step": 6238,
+ "text_loss": 0.4740419089794159
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.0003979623832601884,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10065318.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037686119321733713,
+ "skip_count": 2.0,
+ "step": 6240,
+ "text_loss": 0.43965795636177063
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0003976594000232123,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10068291.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005804901942610741,
+ "skip_count": 0.0,
+ "step": 6242,
+ "text_loss": 0.24424348771572113
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.314646316407398,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.00039735645600334714,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10071645.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002001055981963873,
+ "skip_count": 1.0,
+ "step": 6244,
+ "text_loss": 0.6524377465248108
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0003970535513166815,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10075136.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001252001617103815,
+ "skip_count": 0.0,
+ "step": 6246,
+ "text_loss": 0.22803714871406555
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0003967506860792893,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10078230.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004913780372589827,
+ "skip_count": 1.0,
+ "step": 6248,
+ "text_loss": 0.9835516214370728
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.000396447860407229,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10080852.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037437966093420982,
+ "skip_count": 2.0,
+ "step": 6250,
+ "text_loss": 0.4021640121936798
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.00039614507441654393,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10084139.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005433002021163702,
+ "skip_count": 2.0,
+ "step": 6252,
+ "text_loss": 0.23060470819473267
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.00039584232822326224,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10088501.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007705377647653222,
+ "skip_count": 0.0,
+ "step": 6254,
+ "text_loss": 0.5994830131530762
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0003955396219433969,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10091506.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012310115853324533,
+ "skip_count": 0.0,
+ "step": 6256,
+ "text_loss": 0.4639038145542145
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0003952369556929455,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10096236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008964627049863338,
+ "skip_count": 2.0,
+ "step": 6258,
+ "text_loss": 0.24845287203788757
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0003949343295878903,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10099213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033088945783674717,
+ "skip_count": 0.0,
+ "step": 6260,
+ "text_loss": 0.6527073979377747
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 29.399178162606397,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00039463174374419817,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 10103160.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.003462672932073474,
+ "skip_count": 1.0,
+ "step": 6262,
+ "text_loss": 0.4209299683570862
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 29.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00039432919827782066,
+ "loss": 0.0036,
+ "macro_f1": 1.0,
+ "num_tokens": 10105881.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0027124532498419285,
+ "skip_count": 2.0,
+ "step": 6264,
+ "text_loss": 0.4442266821861267
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0172119140625,
+ "learning_rate": 0.00039402669330469367,
+ "loss": 0.0032,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10108596.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005055282264947891,
+ "skip_count": 2.0,
+ "step": 6266,
+ "text_loss": 0.3331456780433655
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.00039372422894073765,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10111673.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009340311517007649,
+ "skip_count": 0.0,
+ "step": 6268,
+ "text_loss": 0.7664456367492676
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.00039342180530185745,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10116141.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00032052272581495345,
+ "skip_count": 0.0,
+ "step": 6270,
+ "text_loss": 0.47610244154930115
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00039311942250394274,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10119151.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015820999396964908,
+ "skip_count": 0.0,
+ "step": 6272,
+ "text_loss": 0.3815282881259918
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.0003928170806628669,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10122684.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007423736387863755,
+ "skip_count": 0.0,
+ "step": 6274,
+ "text_loss": 0.4630914628505707
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.464925154094512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00039251477989448797,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10126751.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006216703332029283,
+ "skip_count": 0.0,
+ "step": 6276,
+ "text_loss": 0.4342454671859741
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 29.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.00039221252031464816,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10129784.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004239698871970177,
+ "skip_count": 3.0,
+ "step": 6278,
+ "text_loss": 0.24661089479923248
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 29.4837100088054,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0003919103020391738,
+ "loss": 0.006,
+ "macro_f1": 0.8803418874740601,
+ "num_tokens": 10133066.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.027879100292921066,
+ "skip_count": 7.0,
+ "step": 6280,
+ "text_loss": 0.4705188274383545
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.00039160812518387574,
+ "loss": 0.0099,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10136860.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002533538034185767,
+ "skip_count": 0.0,
+ "step": 6282,
+ "text_loss": 0.1953880786895752
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00039130598986454845,
+ "loss": 0.005,
+ "macro_f1": 1.0,
+ "num_tokens": 10140066.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002462630858644843,
+ "skip_count": 2.0,
+ "step": 6284,
+ "text_loss": 0.378487765789032
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 29.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.000391003896196971,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 10143646.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011922914534807205,
+ "skip_count": 1.0,
+ "step": 6286,
+ "text_loss": 0.2467316836118698
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 29.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00039070184429690607,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 10146507.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0059767309576272964,
+ "skip_count": 1.0,
+ "step": 6288,
+ "text_loss": 0.9603674411773682
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.0003903998342801006,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 10149301.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030056277755647898,
+ "skip_count": 2.0,
+ "step": 6290,
+ "text_loss": 0.36631715297698975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 29.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00039009786626228543,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10152158.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005298118572682142,
+ "skip_count": 3.0,
+ "step": 6292,
+ "text_loss": 0.2876455783843994
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0003897959403591751,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10155852.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004937763791531324,
+ "skip_count": 2.0,
+ "step": 6294,
+ "text_loss": 0.14649681746959686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0003894940566864683,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10159164.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021474575623869896,
+ "skip_count": 0.0,
+ "step": 6296,
+ "text_loss": 0.5694304704666138
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 29.568241855004402,
+ "f1_execute": 0.9583333134651184,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.08251953125,
+ "learning_rate": 0.00038919221535984753,
+ "loss": 0.0073,
+ "macro_f1": 0.875,
+ "num_tokens": 10161806.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.040340203791856766,
+ "skip_count": 3.0,
+ "step": 6298,
+ "text_loss": 0.1574537754058838
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.57763428235985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.00038889041649497894,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10165669.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028486931696534157,
+ "skip_count": 0.0,
+ "step": 6300,
+ "text_loss": 0.9158071279525757
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0003885886602075123,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10168945.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006565484683960676,
+ "skip_count": 2.0,
+ "step": 6302,
+ "text_loss": 0.3530846834182739
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.00038828694661308116,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10171914.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009084723424166441,
+ "skip_count": 0.0,
+ "step": 6304,
+ "text_loss": 0.4603337347507477
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0003879852758273029,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 10175737.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004121702630072832,
+ "skip_count": 2.0,
+ "step": 6306,
+ "text_loss": 0.5294032096862793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00038768364796577814,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10178543.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013208909658715129,
+ "skip_count": 0.0,
+ "step": 6308,
+ "text_loss": 0.41084006428718567
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 29.62459641913707,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00038738206314409144,
+ "loss": 0.0079,
+ "macro_f1": 0.9247862696647644,
+ "num_tokens": 10181880.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.03674180060625076,
+ "skip_count": 6.0,
+ "step": 6310,
+ "text_loss": 0.6920746564865112
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0003870805214778106,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10185173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00221974472515285,
+ "skip_count": 2.0,
+ "step": 6312,
+ "text_loss": 0.1376657634973526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.0003867790230824869,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10188642.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001809283159673214,
+ "skip_count": 0.0,
+ "step": 6314,
+ "text_loss": 0.5220870971679688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0003864775680736552,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10191750.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013956360053271055,
+ "skip_count": 0.0,
+ "step": 6316,
+ "text_loss": 0.4109838902950287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.662166128558848,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.00038617615656683356,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10194578.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002947692759335041,
+ "skip_count": 2.0,
+ "step": 6318,
+ "text_loss": 0.4818590581417084
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0003858747886775232,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10197131.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008140999125316739,
+ "skip_count": 2.0,
+ "step": 6320,
+ "text_loss": 0.4004709720611572
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.68095098326974,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0003855734645212093,
+ "loss": 0.0089,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 10199965.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.013056626543402672,
+ "skip_count": 2.0,
+ "step": 6322,
+ "text_loss": 0.3367139995098114
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.00038527218421335977,
+ "loss": 0.0087,
+ "macro_f1": 1.0,
+ "num_tokens": 10203184.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038112467154860497,
+ "skip_count": 2.0,
+ "step": 6324,
+ "text_loss": 0.5747989416122437
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0003849709478694255,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10206436.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001232540002092719,
+ "skip_count": 0.0,
+ "step": 6326,
+ "text_loss": 0.4981732964515686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.00038466975560484115,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10209889.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004343799781054258,
+ "skip_count": 0.0,
+ "step": 6328,
+ "text_loss": 0.2160186469554901
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.000384368607535024,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10212520.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014161963481456041,
+ "skip_count": 1.0,
+ "step": 6330,
+ "text_loss": 0.3556232154369354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0185546875,
+ "learning_rate": 0.0003840675037753745,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10215456.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014989010524004698,
+ "skip_count": 0.0,
+ "step": 6332,
+ "text_loss": 0.8510926961898804
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0003837664444412762,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10218558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006702739745378494,
+ "skip_count": 0.0,
+ "step": 6334,
+ "text_loss": 0.3995226323604584
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0003834654296480958,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10221862.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00826781615614891,
+ "skip_count": 2.0,
+ "step": 6336,
+ "text_loss": 0.3534671664237976
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0003831644595111825,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10224820.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002143894787877798,
+ "skip_count": 0.0,
+ "step": 6338,
+ "text_loss": 0.20216144621372223
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 29.76548282946874,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 0.0003828635341458687,
+ "loss": 0.0064,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 10227479.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012319118715822697,
+ "skip_count": 2.0,
+ "step": 6340,
+ "text_loss": 0.26248639822006226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.0003825626536674697,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10231347.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00334449321962893,
+ "skip_count": 0.0,
+ "step": 6342,
+ "text_loss": 0.6357201337814331
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.784267684179632,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.000382261818191283,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10234347.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027788348961621523,
+ "skip_count": 0.0,
+ "step": 6344,
+ "text_loss": 0.2813846468925476
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.00038196102783258996,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10237105.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001545077539049089,
+ "skip_count": 0.0,
+ "step": 6346,
+ "text_loss": 0.47612661123275757
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.060791015625,
+ "learning_rate": 0.0003816602827066537,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10240249.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005602670833468437,
+ "skip_count": 2.0,
+ "step": 6348,
+ "text_loss": 0.18197228014469147
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.812444966245963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0003813595829287204,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10243417.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004317959537729621,
+ "skip_count": 0.0,
+ "step": 6350,
+ "text_loss": 0.3818575143814087
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 0.0003810589286140186,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10246824.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002225276781246066,
+ "skip_count": 0.0,
+ "step": 6352,
+ "text_loss": 0.14129821956157684
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 29.831229820956853,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0003807583198777599,
+ "loss": 0.0062,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 10249836.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.02445496805012226,
+ "skip_count": 1.0,
+ "step": 6354,
+ "text_loss": 0.3237064480781555
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.840622248312297,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00038045775683513786,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10252900.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009264222462661564,
+ "skip_count": 0.0,
+ "step": 6356,
+ "text_loss": 0.6777551174163818
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 29.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.0003801572396013289,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 10255526.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007189550437033176,
+ "skip_count": 5.0,
+ "step": 6358,
+ "text_loss": 0.25438982248306274
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00037985676829149187,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10258865.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014201018493622541,
+ "skip_count": 0.0,
+ "step": 6360,
+ "text_loss": 0.5063154101371765
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 29.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0003795563430207678,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10261677.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035477925557643175,
+ "skip_count": 3.0,
+ "step": 6362,
+ "text_loss": 0.4815357029438019
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.878191957734078,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.0003792559639042803,
+ "loss": 0.0049,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 10264805.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013723359443247318,
+ "skip_count": 1.0,
+ "step": 6364,
+ "text_loss": 0.5563676357269287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 0.0003789556310571351,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10267885.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028159532230347395,
+ "skip_count": 0.0,
+ "step": 6366,
+ "text_loss": 0.7284183502197266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0003786553445944204,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10270934.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005918835522606969,
+ "skip_count": 0.0,
+ "step": 6368,
+ "text_loss": 0.7387746572494507
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.906369239800412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0003783551046312067,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10273818.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011416864581406116,
+ "skip_count": 0.0,
+ "step": 6370,
+ "text_loss": 0.5360285043716431
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 29.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.00037805491128254645,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 10276494.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.002382483799010515,
+ "skip_count": 1.0,
+ "step": 6372,
+ "text_loss": 0.7536854147911072
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.00037775476466347414,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10279719.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021104486659169197,
+ "skip_count": 1.0,
+ "step": 6374,
+ "text_loss": 0.6807253956794739
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.934546521866746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0003774546648890066,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10283000.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003148776013404131,
+ "skip_count": 2.0,
+ "step": 6376,
+ "text_loss": 0.30774110555648804
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 29.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0003771546120741426,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 10285666.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007700880523771048,
+ "skip_count": 1.0,
+ "step": 6378,
+ "text_loss": 0.4476076364517212
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0003768546063338631,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10289127.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023625255562365055,
+ "skip_count": 1.0,
+ "step": 6380,
+ "text_loss": 0.4350969195365906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0179443359375,
+ "learning_rate": 0.0003765546477831307,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10292485.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001428726245649159,
+ "skip_count": 0.0,
+ "step": 6382,
+ "text_loss": 0.49078530073165894
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0003762547365368902,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10295361.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027160397730767727,
+ "skip_count": 2.0,
+ "step": 6384,
+ "text_loss": 0.3476370573043823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.00037595487271006807,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10298717.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002456068294122815,
+ "skip_count": 0.0,
+ "step": 6386,
+ "text_loss": 0.3634916841983795
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 29.99090108599941,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.0003756550564175727,
+ "loss": 0.0049,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 10302102.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02546076290309429,
+ "skip_count": 3.0,
+ "step": 6388,
+ "text_loss": 0.2422582060098648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.00037535528777429426,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10305060.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001045907847583294,
+ "skip_count": 0.0,
+ "step": 6390,
+ "text_loss": 0.5563194155693054
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0003750555668951045,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 10307903.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007391332648694515,
+ "skip_count": 2.0,
+ "step": 6392,
+ "text_loss": 0.3423991799354553
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 30.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.00037475589389485744,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 10311396.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0029360291082412004,
+ "skip_count": 1.0,
+ "step": 6394,
+ "text_loss": 0.9877024292945862
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.00037445626888838807,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10314250.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014932662015780807,
+ "skip_count": 0.0,
+ "step": 6396,
+ "text_loss": 0.3978523313999176
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 30.037569709421778,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0003741566919905133,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 10316894.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007003722712397575,
+ "skip_count": 5.0,
+ "step": 6398,
+ "text_loss": 0.2945566475391388
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 30.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00037385716331603155,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 10319603.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006710570305585861,
+ "skip_count": 1.0,
+ "step": 6400,
+ "text_loss": 0.2984389662742615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0179443359375,
+ "learning_rate": 0.00037355768297972275,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10322670.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00048738415353000164,
+ "skip_count": 0.0,
+ "step": 6402,
+ "text_loss": 0.483262300491333
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 30.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 0.00037325825109634837,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 10326280.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001625525183044374,
+ "skip_count": 1.0,
+ "step": 6404,
+ "text_loss": 0.42678722739219666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.07513941884356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0003729588677806513,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10329008.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004408636130392551,
+ "skip_count": 0.0,
+ "step": 6406,
+ "text_loss": 0.2264070063829422
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0003726595331473557,
+ "loss": 0.0032,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10332533.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038099216762930155,
+ "skip_count": 2.0,
+ "step": 6408,
+ "text_loss": 0.6670092940330505
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0003723602473111672,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10335643.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003097689710557461,
+ "skip_count": 0.0,
+ "step": 6410,
+ "text_loss": 0.45228812098503113
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.00037206101038677274,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10338522.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005268602631986141,
+ "skip_count": 1.0,
+ "step": 6412,
+ "text_loss": 0.7288079857826233
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0003717618224888405,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10341516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004640138708055019,
+ "skip_count": 2.0,
+ "step": 6414,
+ "text_loss": 0.22850871086120605
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.00037146268373201954,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10344831.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006379318656399846,
+ "skip_count": 0.0,
+ "step": 6416,
+ "text_loss": 0.7864460945129395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0003711635942309408,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10348499.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004005273221991956,
+ "skip_count": 0.0,
+ "step": 6418,
+ "text_loss": 0.605839192867279
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0157470703125,
+ "learning_rate": 0.0003708645541002159,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10351722.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001061634044162929,
+ "skip_count": 0.0,
+ "step": 6420,
+ "text_loss": 0.8226510286331177
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 30.150278837687114,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.0003705655634544374,
+ "loss": 0.0052,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 10355275.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013980664312839508,
+ "skip_count": 2.0,
+ "step": 6422,
+ "text_loss": 0.2709597647190094
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.0003702666224081792,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10359702.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013196271611377597,
+ "skip_count": 0.0,
+ "step": 6424,
+ "text_loss": 0.6451483368873596
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00036996773107599604,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10363364.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028023163322359324,
+ "skip_count": 1.0,
+ "step": 6426,
+ "text_loss": 0.2770799398422241
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01373291015625,
+ "learning_rate": 0.0003696688895724235,
+ "loss": 0.0029,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10366554.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011023655533790588,
+ "skip_count": 0.0,
+ "step": 6428,
+ "text_loss": 0.5466503500938416
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.187848547108892,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.0003693700980119784,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10369733.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00230707717128098,
+ "skip_count": 0.0,
+ "step": 6430,
+ "text_loss": 0.45667049288749695
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.19724097446434,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00036907135650915824,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10373382.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036784098483622074,
+ "skip_count": 2.0,
+ "step": 6432,
+ "text_loss": 0.13856995105743408
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.00036877266517844115,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10376202.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008461157558485866,
+ "skip_count": 0.0,
+ "step": 6434,
+ "text_loss": 0.27238601446151733
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.216025829175226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.0003684740241342863,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10380748.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0052765593864023685,
+ "skip_count": 0.0,
+ "step": 6436,
+ "text_loss": 0.6182295083999634
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.00036817543349113355,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 10386148.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005562922917306423,
+ "skip_count": 2.0,
+ "step": 6438,
+ "text_loss": 0.5591027140617371
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0003678768933634033,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10389385.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008686366491019726,
+ "skip_count": 0.0,
+ "step": 6440,
+ "text_loss": 0.5158660411834717
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0003675784038654968,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10391893.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022222092375159264,
+ "skip_count": 1.0,
+ "step": 6442,
+ "text_loss": 0.2865697741508484
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0003672799651117958,
+ "loss": 0.0099,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10395082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030799773521721363,
+ "skip_count": 2.0,
+ "step": 6444,
+ "text_loss": 0.21298295259475708
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 30.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0003669815772166625,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10398015.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035721305757761,
+ "skip_count": 3.0,
+ "step": 6446,
+ "text_loss": 0.5286803841590881
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 30.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 0.00036668324029443975,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10400749.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00741040613502264,
+ "skip_count": 4.0,
+ "step": 6448,
+ "text_loss": 0.3922366201877594
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0003663849544594507,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10404439.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002974750241264701,
+ "skip_count": 2.0,
+ "step": 6450,
+ "text_loss": 0.21894219517707825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 0.00036608671982599927,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10408476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004810616374015808,
+ "skip_count": 0.0,
+ "step": 6452,
+ "text_loss": 0.3928622305393219
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0003657885365083694,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10411533.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005527745466679335,
+ "skip_count": 0.0,
+ "step": 6454,
+ "text_loss": 0.22816279530525208
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.309950102729672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052734375,
+ "learning_rate": 0.00036549040462082556,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10414501.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021297158673405647,
+ "skip_count": 0.0,
+ "step": 6456,
+ "text_loss": 0.20487719774246216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 30.31934253008512,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0003651923242776124,
+ "loss": 0.0082,
+ "macro_f1": 0.6592592597007751,
+ "num_tokens": 10418296.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.046412210911512375,
+ "skip_count": 5.0,
+ "step": 6458,
+ "text_loss": 0.2890419065952301
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.00036489429559295484,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10421211.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004002603702247143,
+ "skip_count": 0.0,
+ "step": 6460,
+ "text_loss": 0.23165544867515564
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0003645963186810581,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10424231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003480088198557496,
+ "skip_count": 1.0,
+ "step": 6462,
+ "text_loss": 0.6286683082580566
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0003642983936561075,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10427387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009358933195471764,
+ "skip_count": 2.0,
+ "step": 6464,
+ "text_loss": 0.3258316218852997
+ },
+ {
+ "acc_repeat": 0.800000011920929,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.356912239506897,
+ "f1_execute": 0.9729729890823364,
+ "f1_repeat": 0.888888955116272,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.00036400052063226816,
+ "loss": 0.0048,
+ "macro_f1": 0.9539539813995361,
+ "num_tokens": 10430813.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.03567950055003166,
+ "skip_count": 5.0,
+ "step": 6466,
+ "text_loss": 0.7278715968132019
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00036370269972368615,
+ "loss": 0.008,
+ "macro_f1": 1.0,
+ "num_tokens": 10434175.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00226925453171134,
+ "skip_count": 2.0,
+ "step": 6468,
+ "text_loss": 0.5652450919151306
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.375697094217788,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0174560546875,
+ "learning_rate": 0.0003634049310444867,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10437393.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013644809368997812,
+ "skip_count": 0.0,
+ "step": 6470,
+ "text_loss": 0.5985191464424133
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.38508952157323,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.0003631072147087753,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10440412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003114990540780127,
+ "skip_count": 0.0,
+ "step": 6472,
+ "text_loss": 0.5588209629058838
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.394481948928675,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.00036280955083063747,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10443471.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005486322334036231,
+ "skip_count": 0.0,
+ "step": 6474,
+ "text_loss": 0.6969016194343567
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.403874376284122,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00036251193952413865,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 10446548.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008256378583610058,
+ "skip_count": 2.0,
+ "step": 6476,
+ "text_loss": 0.27083566784858704
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0003622143809033239,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10449478.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001008771825581789,
+ "skip_count": 0.0,
+ "step": 6478,
+ "text_loss": 0.1689433604478836
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00036191687508221827,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10453017.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0014678959269076586,
+ "skip_count": 0.0,
+ "step": 6480,
+ "text_loss": 0.9571998715400696
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.432051658350456,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 0.0003616194221748267,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10456061.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001516164978966117,
+ "skip_count": 0.0,
+ "step": 6482,
+ "text_loss": 0.5750429034233093
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.0003613220222951335,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10459130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031315975356847048,
+ "skip_count": 0.0,
+ "step": 6484,
+ "text_loss": 0.47120073437690735
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0003610246755571029,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10462190.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006079549202695489,
+ "skip_count": 0.0,
+ "step": 6486,
+ "text_loss": 0.8426173329353333
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.000360727382074679,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10465233.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00596054969355464,
+ "skip_count": 0.0,
+ "step": 6488,
+ "text_loss": 0.18435880541801453
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.469621367772234,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00036043014196178463,
+ "loss": 0.0046,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 10468135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008584967814385891,
+ "skip_count": 1.0,
+ "step": 6490,
+ "text_loss": 0.3827758729457855
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 30.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 0.00036013295533232344,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 10471032.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005076571833342314,
+ "skip_count": 5.0,
+ "step": 6492,
+ "text_loss": 0.1215854063630104
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 30.488406222483125,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.0003598358223001776,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10474779.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.005972118582576513,
+ "skip_count": 0.0,
+ "step": 6494,
+ "text_loss": 0.22768665850162506
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.0003595387429792091,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10478015.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004733685404062271,
+ "skip_count": 1.0,
+ "step": 6496,
+ "text_loss": 0.5013535618782043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.00035924171748325916,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10481113.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01148980576545,
+ "skip_count": 2.0,
+ "step": 6498,
+ "text_loss": 0.3281762897968292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0003589447459261487,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10484049.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007726775947958231,
+ "skip_count": 2.0,
+ "step": 6500,
+ "text_loss": 0.46294569969177246
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00035864782842167763,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10487443.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013331319205462933,
+ "skip_count": 0.0,
+ "step": 6502,
+ "text_loss": 0.5122153759002686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.00035835096508362544,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10490535.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011629529763013124,
+ "skip_count": 0.0,
+ "step": 6504,
+ "text_loss": 0.40683525800704956
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00035805415602575054,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10493575.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004780632443726063,
+ "skip_count": 0.0,
+ "step": 6506,
+ "text_loss": 0.37263134121894836
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.00035775740136179075,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10496193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018355643842369318,
+ "skip_count": 0.0,
+ "step": 6508,
+ "text_loss": 0.2074306458234787
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.00035746070120546314,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10500135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004067617934197187,
+ "skip_count": 1.0,
+ "step": 6510,
+ "text_loss": 0.26313406229019165
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00035716405567046383,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10503533.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005438363179564476,
+ "skip_count": 0.0,
+ "step": 6512,
+ "text_loss": 0.3448122441768646
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.00035686746487046767,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10506207.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012895528925582767,
+ "skip_count": 0.0,
+ "step": 6514,
+ "text_loss": 0.43096476793289185
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0003565709289191291,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10509257.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003141741268336773,
+ "skip_count": 0.0,
+ "step": 6516,
+ "text_loss": 0.22349724173545837
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.601115350748458,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 0.0003562744479300811,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10512554.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005669888923875988,
+ "skip_count": 0.0,
+ "step": 6518,
+ "text_loss": 0.5319190621376038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.610507778103905,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.00035597802201693587,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10515720.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020814717281609774,
+ "skip_count": 0.0,
+ "step": 6520,
+ "text_loss": 0.20216144621372223
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.61990020545935,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0003556816512932841,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 10518517.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010716461576521397,
+ "skip_count": 3.0,
+ "step": 6522,
+ "text_loss": 0.15843836963176727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.629292632814792,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01806640625,
+ "learning_rate": 0.0003553853358726959,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10521414.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014748790999874473,
+ "skip_count": 0.0,
+ "step": 6524,
+ "text_loss": 0.393892377614975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00035508907586871984,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10524210.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004757299611810595,
+ "skip_count": 0.0,
+ "step": 6526,
+ "text_loss": 0.2557907700538635
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.00035479287139488327,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10527327.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002445317106321454,
+ "skip_count": 0.0,
+ "step": 6528,
+ "text_loss": 0.48338422179222107
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.0003544967225646922,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10530363.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015845977468416095,
+ "skip_count": 0.0,
+ "step": 6530,
+ "text_loss": 0.6474354267120361
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.666862342236573,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.00035420062949163166,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10533444.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002190655330196023,
+ "skip_count": 0.0,
+ "step": 6532,
+ "text_loss": 0.3789777457714081
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0003539045922891649,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10536711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00317079434171319,
+ "skip_count": 0.0,
+ "step": 6534,
+ "text_loss": 0.25758084654808044
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00035360861107073394,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10539849.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010938458144664764,
+ "skip_count": 0.0,
+ "step": 6536,
+ "text_loss": 0.9821014404296875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0003533126859497592,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10543004.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003071998478844762,
+ "skip_count": 2.0,
+ "step": 6538,
+ "text_loss": 0.6314182281494141
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0003530168170396401,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10545965.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006067665759474039,
+ "skip_count": 2.0,
+ "step": 6540,
+ "text_loss": 0.5021927356719971
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0167236328125,
+ "learning_rate": 0.000352721004453754,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10549188.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019109295681118965,
+ "skip_count": 0.0,
+ "step": 6542,
+ "text_loss": 0.3008780777454376
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 30.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00035242524830545683,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10552298.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007457790896296501,
+ "skip_count": 3.0,
+ "step": 6544,
+ "text_loss": 0.5675695538520813
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0003521295487080829,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10555123.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007243642583489418,
+ "skip_count": 1.0,
+ "step": 6546,
+ "text_loss": 0.17955881357192993
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.00035183390577494476,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10559653.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004024330526590347,
+ "skip_count": 0.0,
+ "step": 6548,
+ "text_loss": 0.2634682357311249
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.751394188435572,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017578125,
+ "learning_rate": 0.0003515383196193336,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10563770.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010837121866643429,
+ "skip_count": 0.0,
+ "step": 6550,
+ "text_loss": 0.1608252227306366
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0003512427903545183,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10567117.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003473864868283272,
+ "skip_count": 0.0,
+ "step": 6552,
+ "text_loss": 0.231611430644989
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0003509473180937464,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10570622.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004441239405423403,
+ "skip_count": 1.0,
+ "step": 6554,
+ "text_loss": 0.3193909227848053
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0003506519029502433,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10573411.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008821079391054809,
+ "skip_count": 0.0,
+ "step": 6556,
+ "text_loss": 0.4478783905506134
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0003503565450372128,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10576422.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0014448441797867417,
+ "skip_count": 0.0,
+ "step": 6558,
+ "text_loss": 0.46065983176231384
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0003500612444678365,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10579879.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007939066737890244,
+ "skip_count": 1.0,
+ "step": 6560,
+ "text_loss": 0.3299395740032196
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.000349766001355274,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10583067.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010073966346681118,
+ "skip_count": 2.0,
+ "step": 6562,
+ "text_loss": 0.278255820274353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.817141179923688,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.00034947081581266335,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10586276.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0062315030954778194,
+ "skip_count": 1.0,
+ "step": 6564,
+ "text_loss": 0.22706018388271332
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0003491756879531201,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 10589257.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.0023778853937983513,
+ "skip_count": 4.0,
+ "step": 6566,
+ "text_loss": 0.5567800998687744
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 0.0003488806178897377,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10592163.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004184350254945457,
+ "skip_count": 0.0,
+ "step": 6568,
+ "text_loss": 0.4027897119522095
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0003485856057355876,
+ "loss": 0.0027,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10595326.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035254736430943012,
+ "skip_count": 1.0,
+ "step": 6570,
+ "text_loss": 0.3044572174549103
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.000348290651603719,
+ "loss": 0.0029,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10598236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030894684605300426,
+ "skip_count": 0.0,
+ "step": 6572,
+ "text_loss": 0.23021161556243896
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.00034799575560715896,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10601653.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0036557347048074007,
+ "skip_count": 0.0,
+ "step": 6574,
+ "text_loss": 0.5437754392623901
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0003477009178589121,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 10604581.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.021344119682908058,
+ "skip_count": 4.0,
+ "step": 6576,
+ "text_loss": 0.29078927636146545
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 30.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0003474061384719608,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 10607676.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037169242277741432,
+ "skip_count": 1.0,
+ "step": 6578,
+ "text_loss": 1.1790896654129028
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.0003471114175592649,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 10611269.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005873420741409063,
+ "skip_count": 4.0,
+ "step": 6580,
+ "text_loss": 0.36204129457473755
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0003468167552337624,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 10614335.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01030842587351799,
+ "skip_count": 2.0,
+ "step": 6582,
+ "text_loss": 0.20400437712669373
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.061767578125,
+ "learning_rate": 0.00034652215160836826,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10617565.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025721401907503605,
+ "skip_count": 0.0,
+ "step": 6584,
+ "text_loss": 0.44676345586776733
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.00034622760679597507,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10620706.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005751762073487043,
+ "skip_count": 1.0,
+ "step": 6586,
+ "text_loss": 0.4733653664588928
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 30.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.00034593312090945306,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10623916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029759553726762533,
+ "skip_count": 3.0,
+ "step": 6588,
+ "text_loss": 0.49876922369003296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0003456386940616498,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10628093.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010031822603195906,
+ "skip_count": 0.0,
+ "step": 6590,
+ "text_loss": 0.42708611488342285
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.00034534432636539004,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10631739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014793311711400747,
+ "skip_count": 0.0,
+ "step": 6592,
+ "text_loss": 0.18193726241588593
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0003450500179334762,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10634862.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0059733521193265915,
+ "skip_count": 2.0,
+ "step": 6594,
+ "text_loss": 0.28596529364585876
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.967420017610802,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0003447557688786879,
+ "loss": 0.0043,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 10637758.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0076768649742007256,
+ "skip_count": 1.0,
+ "step": 6596,
+ "text_loss": 0.39428210258483887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00034446157931378185,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10640440.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015128811355680227,
+ "skip_count": 0.0,
+ "step": 6598,
+ "text_loss": 0.45584383606910706
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043701171875,
+ "learning_rate": 0.00034416744935149193,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10643600.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000757391273509711,
+ "skip_count": 0.0,
+ "step": 6600,
+ "text_loss": 0.503209114074707
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.995597299677137,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.060302734375,
+ "learning_rate": 0.0003438733791045294,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10646907.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025944956578314304,
+ "skip_count": 2.0,
+ "step": 6602,
+ "text_loss": 0.4370735287666321
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00034357936868558255,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10649995.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006543452036567032,
+ "skip_count": 0.0,
+ "step": 6604,
+ "text_loss": 0.4125586748123169
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.014088641033165,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00034328541820731663,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10653251.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00027016724925488234,
+ "skip_count": 1.0,
+ "step": 6606,
+ "text_loss": 0.7309898734092712
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 31.023481068388612,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 0.00034299152778237413,
+ "loss": 0.0062,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 10657229.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01905548945069313,
+ "skip_count": 2.0,
+ "step": 6608,
+ "text_loss": 0.42367079854011536
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.0003426976975233744,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10660524.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004718089767266065,
+ "skip_count": 0.0,
+ "step": 6610,
+ "text_loss": 0.6613664627075195
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.00034240392754291343,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10663908.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0027069442439824343,
+ "skip_count": 0.0,
+ "step": 6612,
+ "text_loss": 0.859471321105957
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.051658350454947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.000342110217953565,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10667814.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015497280983254313,
+ "skip_count": 0.0,
+ "step": 6614,
+ "text_loss": 0.18337638676166534
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.0003418165688678788,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10671630.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013396464055404067,
+ "skip_count": 0.0,
+ "step": 6616,
+ "text_loss": 0.860016405582428
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 31.070443205165834,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.0003415229803983819,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10675308.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007542039267718792,
+ "skip_count": 3.0,
+ "step": 6618,
+ "text_loss": 0.15481022000312805
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0003412294526575779,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10678092.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002029839437454939,
+ "skip_count": 2.0,
+ "step": 6620,
+ "text_loss": 0.5121933221817017
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00034093598575794706,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10681382.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013001341139897704,
+ "skip_count": 0.0,
+ "step": 6622,
+ "text_loss": 0.4555061161518097
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.098620487232168,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00034064257981194655,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10684255.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007926415419206023,
+ "skip_count": 0.0,
+ "step": 6624,
+ "text_loss": 0.7298227548599243
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.108012914587615,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0003403492349320101,
+ "loss": 0.0031,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10686904.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021080176811665297,
+ "skip_count": 1.0,
+ "step": 6626,
+ "text_loss": 0.45434215664863586
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.11740534194306,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.000340055951230548,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10690311.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004011874087154865,
+ "skip_count": 0.0,
+ "step": 6628,
+ "text_loss": 0.15496443212032318
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.126797769298502,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00033976272881994707,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10693395.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031893099658191204,
+ "skip_count": 2.0,
+ "step": 6630,
+ "text_loss": 0.5291517972946167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0003394695678125708,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10697046.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033124347683042288,
+ "skip_count": 1.0,
+ "step": 6632,
+ "text_loss": 0.2893230617046356
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.00033917646832075886,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10700111.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002547801472246647,
+ "skip_count": 0.0,
+ "step": 6634,
+ "text_loss": 0.10363512486219406
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 31.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.0003388834304568275,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10703939.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0019040531478822231,
+ "skip_count": 0.0,
+ "step": 6636,
+ "text_loss": 0.5185034275054932
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00033859045433306975,
+ "loss": 0.0034,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10707187.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0074104927480220795,
+ "skip_count": 2.0,
+ "step": 6638,
+ "text_loss": 0.1618153154850006
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048583984375,
+ "learning_rate": 0.0003382975400617543,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10710029.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013861875049769878,
+ "skip_count": 1.0,
+ "step": 6640,
+ "text_loss": 0.6674485206604004
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0003380046877551266,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10713318.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034452753607183695,
+ "skip_count": 0.0,
+ "step": 6642,
+ "text_loss": 0.39299124479293823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.192544760786618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0003377118975254082,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10716130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006802885327488184,
+ "skip_count": 2.0,
+ "step": 6644,
+ "text_loss": 0.12942606210708618
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.20193718814206,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0003374191694847968,
+ "loss": 0.0052,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 10719400.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03718209266662598,
+ "skip_count": 2.0,
+ "step": 6646,
+ "text_loss": 0.34327754378318787
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0003371265037454663,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10722108.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006016947794705629,
+ "skip_count": 2.0,
+ "step": 6648,
+ "text_loss": 0.15644726157188416
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.220722042852948,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00033683390041956663,
+ "loss": 0.0075,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 10725709.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04308273270726204,
+ "skip_count": 2.0,
+ "step": 6650,
+ "text_loss": 0.1875772923231125
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 31.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0003365413596192243,
+ "loss": 0.0037,
+ "macro_f1": 1.0,
+ "num_tokens": 10728717.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006372809875756502,
+ "skip_count": 1.0,
+ "step": 6652,
+ "text_loss": 0.4948291778564453
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00033624888145654137,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10732082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014530479675158858,
+ "skip_count": 0.0,
+ "step": 6654,
+ "text_loss": 0.44932305812835693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.248899324919282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00033595646604359585,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10734663.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001924810465425253,
+ "skip_count": 0.0,
+ "step": 6656,
+ "text_loss": 0.45626893639564514
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00033566411349244206,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10737470.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0040014320984482765,
+ "skip_count": 0.0,
+ "step": 6658,
+ "text_loss": 0.2700682580471039
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.00033537182391510996,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10740228.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008573737577535212,
+ "skip_count": 0.0,
+ "step": 6660,
+ "text_loss": 0.5626822113990784
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0003350795974236055,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10742883.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011166860349476337,
+ "skip_count": 1.0,
+ "step": 6662,
+ "text_loss": 0.23357805609703064
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 31.286469034341064,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00033478743412991037,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 10746459.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01719980500638485,
+ "skip_count": 6.0,
+ "step": 6664,
+ "text_loss": 0.150017648935318
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.00033449533414598223,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10749984.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038280142471194267,
+ "skip_count": 2.0,
+ "step": 6666,
+ "text_loss": 0.6312657594680786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.00033420329758375423,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10752792.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007688060286454856,
+ "skip_count": 1.0,
+ "step": 6668,
+ "text_loss": 0.6794863939285278
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.314646316407398,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.00033391132455513537,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10756125.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003196930279955268,
+ "skip_count": 2.0,
+ "step": 6670,
+ "text_loss": 0.22897565364837646
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0003336194151720102,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10759296.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026212623342871666,
+ "skip_count": 0.0,
+ "step": 6672,
+ "text_loss": 0.5236268639564514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.0003333275695462391,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10762574.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007855101488530636,
+ "skip_count": 2.0,
+ "step": 6674,
+ "text_loss": 0.2971038818359375
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0003330357877896577,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10765758.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004191791173070669,
+ "skip_count": 2.0,
+ "step": 6676,
+ "text_loss": 0.17358586192131042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.0003327440700140774,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10769396.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004101858474314213,
+ "skip_count": 1.0,
+ "step": 6678,
+ "text_loss": 0.28932204842567444
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.000332452416331285,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10772605.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008305918308906257,
+ "skip_count": 0.0,
+ "step": 6680,
+ "text_loss": 0.47090092301368713
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 0.0003321608268530427,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10776576.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003022305201739073,
+ "skip_count": 1.0,
+ "step": 6682,
+ "text_loss": 0.4467788338661194
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.00033186930169108795,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10779648.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0021474999375641346,
+ "skip_count": 0.0,
+ "step": 6684,
+ "text_loss": 0.6249470710754395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.054931640625,
+ "learning_rate": 0.00033157784095713417,
+ "loss": 0.009,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10782665.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025120675563812256,
+ "skip_count": 1.0,
+ "step": 6686,
+ "text_loss": 0.6763803958892822
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.399178162606397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0003312864447628695,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10785789.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013111691223457456,
+ "skip_count": 1.0,
+ "step": 6688,
+ "text_loss": 0.6609058380126953
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.00033099511321995744,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10788846.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012354454956948757,
+ "skip_count": 0.0,
+ "step": 6690,
+ "text_loss": 0.4421829283237457
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0003307038464400368,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10791611.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035219944547861814,
+ "skip_count": 2.0,
+ "step": 6692,
+ "text_loss": 0.16222824156284332
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.00033041264453472153,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10794868.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0007216202793642879,
+ "skip_count": 0.0,
+ "step": 6694,
+ "text_loss": 0.37388721108436584
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 31.436747872028178,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0003301215076156008,
+ "loss": 0.0063,
+ "macro_f1": 0.8803418874740601,
+ "num_tokens": 10797737.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.025403080508112907,
+ "skip_count": 7.0,
+ "step": 6696,
+ "text_loss": 0.5086690187454224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0003298304357942389,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10800972.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010532539337873459,
+ "skip_count": 2.0,
+ "step": 6698,
+ "text_loss": 0.22500646114349365
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.00032953942918217494,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10803654.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009591903653927147,
+ "skip_count": 0.0,
+ "step": 6700,
+ "text_loss": 0.6256277561187744
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.464925154094512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0003292484878909232,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10807506.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003801517654210329,
+ "skip_count": 2.0,
+ "step": 6702,
+ "text_loss": 0.522081196308136
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0189208984375,
+ "learning_rate": 0.00032895761203197317,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10810163.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002608039416372776,
+ "skip_count": 2.0,
+ "step": 6704,
+ "text_loss": 0.3600201904773712
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00032866680171678874,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10813202.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026464913971722126,
+ "skip_count": 0.0,
+ "step": 6706,
+ "text_loss": 0.2513798773288727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.00032837605705680895,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10816484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027157769072800875,
+ "skip_count": 0.0,
+ "step": 6708,
+ "text_loss": 0.34391456842422485
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 31.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0003280853781634481,
+ "loss": 0.0041,
+ "macro_f1": 1.0,
+ "num_tokens": 10819794.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016086180694401264,
+ "skip_count": 1.0,
+ "step": 6710,
+ "text_loss": 0.6535179615020752
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0003277947651480946,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10823033.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002368347719311714,
+ "skip_count": 0.0,
+ "step": 6712,
+ "text_loss": 0.5596423745155334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.0003275042181221119,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10826276.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003124286886304617,
+ "skip_count": 0.0,
+ "step": 6714,
+ "text_loss": 0.6584402322769165
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0003272137371968382,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10828846.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006088328082114458,
+ "skip_count": 0.0,
+ "step": 6716,
+ "text_loss": 0.4602710008621216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.00032692332248358645,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10832025.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002511275466531515,
+ "skip_count": 2.0,
+ "step": 6718,
+ "text_loss": 0.42790886759757996
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.000326632974093644,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10835110.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01076667383313179,
+ "skip_count": 0.0,
+ "step": 6720,
+ "text_loss": 0.5659847855567932
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 31.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.0003263426921382728,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 10838279.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.004973042290657759,
+ "skip_count": 2.0,
+ "step": 6722,
+ "text_loss": 0.675341010093689
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.00032605247672870964,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10841381.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013990222942084074,
+ "skip_count": 0.0,
+ "step": 6724,
+ "text_loss": 0.5389315485954285
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.57763428235985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.00032576232797616554,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10844583.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003186358604580164,
+ "skip_count": 1.0,
+ "step": 6726,
+ "text_loss": 0.5603348016738892
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0003254722459918261,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10847670.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001443870598450303,
+ "skip_count": 0.0,
+ "step": 6728,
+ "text_loss": 0.6922405362129211
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0003251822308868512,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10851479.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004294445738196373,
+ "skip_count": 0.0,
+ "step": 6730,
+ "text_loss": 0.7145437002182007
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.00032489228277237514,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10854489.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032078945077955723,
+ "skip_count": 0.0,
+ "step": 6732,
+ "text_loss": 0.4077773094177246
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.00032460240175950664,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 10856954.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038214854430407286,
+ "skip_count": 2.0,
+ "step": 6734,
+ "text_loss": 0.32071781158447266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0003243125879593286,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10860016.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013407845981419086,
+ "skip_count": 0.0,
+ "step": 6736,
+ "text_loss": 0.45335495471954346
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0003240228414828984,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10863021.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010989385191351175,
+ "skip_count": 0.0,
+ "step": 6738,
+ "text_loss": 0.562619149684906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 0.0003237331624412473,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10866548.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006139552686363459,
+ "skip_count": 0.0,
+ "step": 6740,
+ "text_loss": 0.14510060846805573
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00032344355094538087,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10869402.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004785746335983276,
+ "skip_count": 0.0,
+ "step": 6742,
+ "text_loss": 0.5655979514122009
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.662166128558848,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.00032315400710627876,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10874165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0052397786639630795,
+ "skip_count": 0.0,
+ "step": 6744,
+ "text_loss": 0.4785873591899872
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 31.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.0003228645310348948,
+ "loss": 0.0036,
+ "macro_f1": 1.0,
+ "num_tokens": 10876919.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.00460197776556015,
+ "skip_count": 1.0,
+ "step": 6746,
+ "text_loss": 0.5683879256248474
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.0003225751228421566,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10880179.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032690472435206175,
+ "skip_count": 0.0,
+ "step": 6748,
+ "text_loss": 0.5268497467041016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 0.00032228578263896607,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10883711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036305058747529984,
+ "skip_count": 0.0,
+ "step": 6750,
+ "text_loss": 0.16675594449043274
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0003219965105361989,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10887041.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002453352091833949,
+ "skip_count": 1.0,
+ "step": 6752,
+ "text_loss": 0.7010246515274048
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.00032170730664470465,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10890053.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020381701178848743,
+ "skip_count": 0.0,
+ "step": 6754,
+ "text_loss": 0.46637895703315735
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0003214181710753069,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10893501.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004525696858763695,
+ "skip_count": 0.0,
+ "step": 6756,
+ "text_loss": 0.1768684983253479
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0003211291039388026,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10896480.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038154330104589462,
+ "skip_count": 0.0,
+ "step": 6758,
+ "text_loss": 0.7908347845077515
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.00032084010534596326,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10899158.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004711449146270752,
+ "skip_count": 2.0,
+ "step": 6760,
+ "text_loss": 0.37209007143974304
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 31.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0003205511754075335,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 10901791.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0025003373157233,
+ "skip_count": 1.0,
+ "step": 6762,
+ "text_loss": 0.8081201314926147
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 31.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.00032026231423423204,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10904817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007387075573205948,
+ "skip_count": 3.0,
+ "step": 6764,
+ "text_loss": 0.30355480313301086
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.76548282946874,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0003199735219367507,
+ "loss": 0.0061,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 10908018.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04275592789053917,
+ "skip_count": 0.0,
+ "step": 6766,
+ "text_loss": 0.26562029123306274
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.774875256824185,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0003196847986257553,
+ "loss": 0.008,
+ "macro_f1": 0.9255813956260681,
+ "num_tokens": 10911264.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.034824032336473465,
+ "skip_count": 4.0,
+ "step": 6768,
+ "text_loss": 0.2761698067188263
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.784267684179632,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00031939614441188523,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10915964.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011179742868989706,
+ "skip_count": 0.0,
+ "step": 6770,
+ "text_loss": 0.4107927083969116
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.00031910755940575344,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10918678.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011521469568833709,
+ "skip_count": 0.0,
+ "step": 6772,
+ "text_loss": 0.43064895272254944
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 31.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01708984375,
+ "learning_rate": 0.000318819043717946,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 10921757.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002861087443307042,
+ "skip_count": 1.0,
+ "step": 6774,
+ "text_loss": 0.5945150852203369
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.812444966245963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.0003185305974590229,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10924767.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011365334503352642,
+ "skip_count": 0.0,
+ "step": 6776,
+ "text_loss": 0.36615172028541565
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0003182422207395171,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10927750.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0034391419030725956,
+ "skip_count": 0.0,
+ "step": 6778,
+ "text_loss": 0.17081251740455627
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0003179539136699351,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10930817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004941808991134167,
+ "skip_count": 2.0,
+ "step": 6780,
+ "text_loss": 0.7683762311935425
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 31.840622248312297,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.00031766567636075675,
+ "loss": 0.0061,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 10933882.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017502857372164726,
+ "skip_count": 2.0,
+ "step": 6782,
+ "text_loss": 0.38010457158088684
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0003173775089224353,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 10936909.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0035372809506952763,
+ "skip_count": 2.0,
+ "step": 6784,
+ "text_loss": 0.5760656595230103
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.859407103023187,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.00031708941146539707,
+ "loss": 0.0061,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 10940032.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02229934185743332,
+ "skip_count": 0.0,
+ "step": 6786,
+ "text_loss": 0.5767728090286255
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00031680138410004123,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10943217.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028649091254919767,
+ "skip_count": 1.0,
+ "step": 6788,
+ "text_loss": 0.9756367802619934
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.878191957734078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.00031651342693674066,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10947847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0039158593863248825,
+ "skip_count": 2.0,
+ "step": 6790,
+ "text_loss": 0.2504335045814514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.000316225540085841,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10950879.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022091215942054987,
+ "skip_count": 0.0,
+ "step": 6792,
+ "text_loss": 0.525842547416687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.00031593772365766105,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10954960.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006841494468972087,
+ "skip_count": 0.0,
+ "step": 6794,
+ "text_loss": 0.6383582353591919
+ },
+ {
+ "acc_repeat": 0.800000011920929,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.906369239800412,
+ "f1_execute": 0.9729729890823364,
+ "f1_repeat": 0.888888955116272,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0003156499777624926,
+ "loss": 0.006,
+ "macro_f1": 0.9539539813995361,
+ "num_tokens": 10958278.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.03810702636837959,
+ "skip_count": 5.0,
+ "step": 6796,
+ "text_loss": 0.5901661515235901
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01708984375,
+ "learning_rate": 0.0003153623025106005,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10962412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00046833412488922477,
+ "skip_count": 0.0,
+ "step": 6798,
+ "text_loss": 0.42693984508514404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00031507469801222233,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10966037.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006818041671067476,
+ "skip_count": 2.0,
+ "step": 6800,
+ "text_loss": 0.5326262712478638
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.934546521866746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00031478716437756876,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10969369.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029889161232858896,
+ "skip_count": 0.0,
+ "step": 6802,
+ "text_loss": 0.49028220772743225
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0003144997017168232,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10972016.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038266500923782587,
+ "skip_count": 2.0,
+ "step": 6804,
+ "text_loss": 0.43391722440719604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0189208984375,
+ "learning_rate": 0.0003142123101401417,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10975153.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005866789724677801,
+ "skip_count": 0.0,
+ "step": 6806,
+ "text_loss": 0.5888382196426392
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.00031392498975765353,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10977881.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002122384263202548,
+ "skip_count": 0.0,
+ "step": 6808,
+ "text_loss": 0.30313390493392944
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0003136377406794604,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10982025.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005535652744583786,
+ "skip_count": 0.0,
+ "step": 6810,
+ "text_loss": 0.5788959264755249
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0003133505630156365,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10985419.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010623604990541935,
+ "skip_count": 2.0,
+ "step": 6812,
+ "text_loss": 0.18577243387699127
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.00031306345687622905,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10989116.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004721239674836397,
+ "skip_count": 0.0,
+ "step": 6814,
+ "text_loss": 0.4818301200866699
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0167236328125,
+ "learning_rate": 0.0003127764223712575,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10992064.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004238430701661855,
+ "skip_count": 0.0,
+ "step": 6816,
+ "text_loss": 0.7482771277427673
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.00939242735544,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0003124894596107141,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 10994903.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005224394146353006,
+ "skip_count": 2.0,
+ "step": 6818,
+ "text_loss": 0.186603844165802
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.00031220256870456356,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 10998692.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0021751862950623035,
+ "skip_count": 2.0,
+ "step": 6820,
+ "text_loss": 0.45633986592292786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 32.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.00031191574976274284,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11001284.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004747046157717705,
+ "skip_count": 4.0,
+ "step": 6822,
+ "text_loss": 0.5651670694351196
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0003116290028951617,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11004293.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008316585444845259,
+ "skip_count": 0.0,
+ "step": 6824,
+ "text_loss": 0.3167279362678528
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.000311342328211702,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11007080.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004732926026917994,
+ "skip_count": 0.0,
+ "step": 6826,
+ "text_loss": 0.49171411991119385
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.000311055725822218,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11010078.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004238729365170002,
+ "skip_count": 0.0,
+ "step": 6828,
+ "text_loss": 0.21484950184822083
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0003107691958365361,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11013368.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029175232630223036,
+ "skip_count": 2.0,
+ "step": 6830,
+ "text_loss": 0.3718266189098358
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0003104827383644555,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11016704.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00191891985014081,
+ "skip_count": 0.0,
+ "step": 6832,
+ "text_loss": 0.28772637248039246
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00031019635351574705,
+ "loss": 0.0035,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11019651.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004300855100154877,
+ "skip_count": 2.0,
+ "step": 6834,
+ "text_loss": 0.6583508849143982
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.09392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.000309910041400154,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11023847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00037701442488469183,
+ "skip_count": 0.0,
+ "step": 6836,
+ "text_loss": 0.36090534925460815
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 32.10331670090989,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0003096238021273917,
+ "loss": 0.0077,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 11027804.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03601725772023201,
+ "skip_count": 3.0,
+ "step": 6838,
+ "text_loss": 0.24180401861667633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.11270912826534,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.00030933763580714757,
+ "loss": 0.0052,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 11030778.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023780640214681625,
+ "skip_count": 2.0,
+ "step": 6840,
+ "text_loss": 0.4978102743625641
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00030905154254908104,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11034863.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00565778324380517,
+ "skip_count": 0.0,
+ "step": 6842,
+ "text_loss": 0.558772623538971
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.00030876552246282356,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11038488.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010575232096016407,
+ "skip_count": 0.0,
+ "step": 6844,
+ "text_loss": 0.2955974340438843
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0003084795756579787,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11041796.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015910190995782614,
+ "skip_count": 0.0,
+ "step": 6846,
+ "text_loss": 0.5009704828262329
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0003081937022441217,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11045141.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008034126949496567,
+ "skip_count": 0.0,
+ "step": 6848,
+ "text_loss": 0.3965311646461487
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 32.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0003079079023307999,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11047814.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00810160581022501,
+ "skip_count": 0.0,
+ "step": 6850,
+ "text_loss": 0.24341927468776703
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0003076221760275321,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11051330.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006590691395103931,
+ "skip_count": 0.0,
+ "step": 6852,
+ "text_loss": 0.5887606739997864
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00030733652344380936,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11055006.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005845054984092712,
+ "skip_count": 0.0,
+ "step": 6854,
+ "text_loss": 0.6621366739273071
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0003070509446890944,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11058470.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041051446460187435,
+ "skip_count": 1.0,
+ "step": 6856,
+ "text_loss": 0.31603100895881653
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0003067654398728214,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11061620.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001603201380930841,
+ "skip_count": 0.0,
+ "step": 6858,
+ "text_loss": 0.5167516469955444
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 0.00030648000910439636,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11064727.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024816282093524933,
+ "skip_count": 0.0,
+ "step": 6860,
+ "text_loss": 0.5869330167770386
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.21602582917523,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00030619465249319693,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11068208.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003121294779703021,
+ "skip_count": 0.0,
+ "step": 6862,
+ "text_loss": 0.3920222818851471
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 0.0003059093701485722,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11071315.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033239589538425207,
+ "skip_count": 1.0,
+ "step": 6864,
+ "text_loss": 0.4201887845993042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 0.00030562416217984296,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11074144.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016117560444399714,
+ "skip_count": 0.0,
+ "step": 6866,
+ "text_loss": 0.5283045172691345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0003053390286963015,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11077152.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003879208816215396,
+ "skip_count": 0.0,
+ "step": 6868,
+ "text_loss": 0.16188788414001465
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.00030505396980721143,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11080200.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007632353343069553,
+ "skip_count": 1.0,
+ "step": 6870,
+ "text_loss": 0.25986847281455994
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00030476898562180793,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11083356.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004322016146034002,
+ "skip_count": 2.0,
+ "step": 6872,
+ "text_loss": 0.49556297063827515
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.0003044840762492974,
+ "loss": 0.0037,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11086354.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031272871419787407,
+ "skip_count": 2.0,
+ "step": 6874,
+ "text_loss": 0.1658666580915451
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0003041992417988577,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11088850.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005371398758143187,
+ "skip_count": 2.0,
+ "step": 6876,
+ "text_loss": 0.22437214851379395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0003039144823796378,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11091784.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025086402893066406,
+ "skip_count": 0.0,
+ "step": 6878,
+ "text_loss": 0.7293354868888855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.0003036297981007581,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11095204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015590827912092209,
+ "skip_count": 1.0,
+ "step": 6880,
+ "text_loss": 0.6406328678131104
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.0003033451890713103,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11098367.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013142531970515847,
+ "skip_count": 0.0,
+ "step": 6882,
+ "text_loss": 0.5209086537361145
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 32.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0003030606554003571,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11101047.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0018484699539840221,
+ "skip_count": 0.0,
+ "step": 6884,
+ "text_loss": 0.743188202381134
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.00030277619719693217,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11104269.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016667681047692895,
+ "skip_count": 0.0,
+ "step": 6886,
+ "text_loss": 0.7918420433998108
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.0003024918145700406,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11107248.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008098077378235757,
+ "skip_count": 0.0,
+ "step": 6888,
+ "text_loss": 0.3871288299560547
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.0003022075076286582,
+ "loss": 0.0031,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11111204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002324736909940839,
+ "skip_count": 0.0,
+ "step": 6890,
+ "text_loss": 0.3722921907901764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0003019232764817321,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11114363.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00254769716411829,
+ "skip_count": 0.0,
+ "step": 6892,
+ "text_loss": 0.418519526720047
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.00030163912123818006,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11117718.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000547234492842108,
+ "skip_count": 0.0,
+ "step": 6894,
+ "text_loss": 0.6087009310722351
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0003013550420068909,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11120437.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00015221568173728883,
+ "skip_count": 0.0,
+ "step": 6896,
+ "text_loss": 0.6013991832733154
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 32.385089521573235,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.00030107103889672436,
+ "loss": 0.0085,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 11123708.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.024048971012234688,
+ "skip_count": 2.0,
+ "step": 6898,
+ "text_loss": 0.3612423837184906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0003007871120165111,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11127294.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013236473314464092,
+ "skip_count": 0.0,
+ "step": 6900,
+ "text_loss": 0.5277031064033508
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.00030050326147505226,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11130270.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028277861420065165,
+ "skip_count": 0.0,
+ "step": 6902,
+ "text_loss": 0.5726971626281738
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0003002194873811197,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11132955.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022369837388396263,
+ "skip_count": 0.0,
+ "step": 6904,
+ "text_loss": 0.18510448932647705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.00029993578984345673,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11136387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038351211696863174,
+ "skip_count": 0.0,
+ "step": 6906,
+ "text_loss": 0.28313153982162476
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.43205165835045,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0002996521689707764,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11139740.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00032925375853665173,
+ "skip_count": 0.0,
+ "step": 6908,
+ "text_loss": 0.7315025329589844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.441444085705896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0002993686248717629,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11142587.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002886304398998618,
+ "skip_count": 0.0,
+ "step": 6910,
+ "text_loss": 0.677378237247467
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00029908515765507084,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11145415.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038471966981887817,
+ "skip_count": 0.0,
+ "step": 6912,
+ "text_loss": 0.5207083225250244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0002988017674293254,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11148524.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023522782139480114,
+ "skip_count": 0.0,
+ "step": 6914,
+ "text_loss": 0.42507871985435486
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0189208984375,
+ "learning_rate": 0.0002985184543031222,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11152069.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012464249739423394,
+ "skip_count": 0.0,
+ "step": 6916,
+ "text_loss": 0.5694169998168945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.47901379512768,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 0.0002982352183850274,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11155675.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00828156154602766,
+ "skip_count": 2.0,
+ "step": 6918,
+ "text_loss": 0.22304373979568481
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.00029795205978357754,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11158555.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019234733190387487,
+ "skip_count": 0.0,
+ "step": 6920,
+ "text_loss": 0.5519064664840698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.0002976689786072795,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11161407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003542431222740561,
+ "skip_count": 0.0,
+ "step": 6922,
+ "text_loss": 0.6748810410499573
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0002973859749646104,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11166007.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004024899681098759,
+ "skip_count": 0.0,
+ "step": 6924,
+ "text_loss": 0.6613664627075195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 32.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.000297103048964018,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11169007.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005519595462828875,
+ "skip_count": 3.0,
+ "step": 6926,
+ "text_loss": 0.3815552592277527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.00029682020071392,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11172939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016999440267682076,
+ "skip_count": 0.0,
+ "step": 6928,
+ "text_loss": 0.6727893352508545
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.535368359260346,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0002965374303227044,
+ "loss": 0.0055,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 11176232.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.030950307846069336,
+ "skip_count": 0.0,
+ "step": 6930,
+ "text_loss": 0.5577763915061951
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00029625473789872923,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11179775.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00525702815502882,
+ "skip_count": 1.0,
+ "step": 6932,
+ "text_loss": 0.5860039591789246
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.55415321397123,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.000295972123550323,
+ "loss": 0.005,
+ "macro_f1": 1.0,
+ "num_tokens": 11183262.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0048187971115112305,
+ "skip_count": 2.0,
+ "step": 6934,
+ "text_loss": 0.7328732013702393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.016357421875,
+ "learning_rate": 0.00029568958738578364,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11186591.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015159632312133908,
+ "skip_count": 0.0,
+ "step": 6936,
+ "text_loss": 0.40563541650772095
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 32.57293806868213,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.017333984375,
+ "learning_rate": 0.0002954071295133801,
+ "loss": 0.005,
+ "macro_f1": 1.0,
+ "num_tokens": 11190056.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011282073333859444,
+ "skip_count": 1.0,
+ "step": 6938,
+ "text_loss": 0.15986496210098267
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0002951247500413504,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 11193504.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.010220487602055073,
+ "skip_count": 5.0,
+ "step": 6940,
+ "text_loss": 0.2604432702064514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0002948424490779029,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11196725.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002620660001412034,
+ "skip_count": 1.0,
+ "step": 6942,
+ "text_loss": 0.48028868436813354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00029456022673121597,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11199303.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00042651945841498673,
+ "skip_count": 0.0,
+ "step": 6944,
+ "text_loss": 0.5135554671287537
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.0002942780831094377,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11202319.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005366047378629446,
+ "skip_count": 2.0,
+ "step": 6946,
+ "text_loss": 0.2809196710586548
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0002939960183206861,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11205622.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033479216508567333,
+ "skip_count": 0.0,
+ "step": 6948,
+ "text_loss": 0.2013140618801117
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.629292632814796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00029371403247304887,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11208637.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013508419506251812,
+ "skip_count": 0.0,
+ "step": 6950,
+ "text_loss": 0.4427332580089569
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0002934321256745833,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11211618.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020944071002304554,
+ "skip_count": 0.0,
+ "step": 6952,
+ "text_loss": 0.5406652688980103
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.00029315029803331704,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11214432.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012655078899115324,
+ "skip_count": 0.0,
+ "step": 6954,
+ "text_loss": 0.7720552086830139
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.00029286854965724686,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11218127.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009041395038366318,
+ "skip_count": 0.0,
+ "step": 6956,
+ "text_loss": 0.258109986782074
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 32.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0002925868806543391,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 11221440.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0034558263141661882,
+ "skip_count": 1.0,
+ "step": 6958,
+ "text_loss": 0.5378029942512512
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.00029230529113253,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11225391.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005263930186629295,
+ "skip_count": 2.0,
+ "step": 6960,
+ "text_loss": 0.3616539537906647
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.685647196947464,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0002920237811997251,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11228648.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003730480559170246,
+ "skip_count": 1.0,
+ "step": 6962,
+ "text_loss": 0.46682238578796387
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043701171875,
+ "learning_rate": 0.00029174235096379963,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11231828.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004831735976040363,
+ "skip_count": 1.0,
+ "step": 6964,
+ "text_loss": 0.5718355178833008
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 32.70443205165835,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.046875,
+ "learning_rate": 0.0002914610005325981,
+ "loss": 0.0102,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 11234984.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03880132734775543,
+ "skip_count": 2.0,
+ "step": 6966,
+ "text_loss": 0.3139013946056366
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0002911797300139345,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11239153.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006673726020380855,
+ "skip_count": 0.0,
+ "step": 6968,
+ "text_loss": 0.6040399074554443
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00029089853951559235,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11242178.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0028971200808882713,
+ "skip_count": 0.0,
+ "step": 6970,
+ "text_loss": 0.304967999458313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.73260933372468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.00029061742914532427,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11245865.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010410466929897666,
+ "skip_count": 0.0,
+ "step": 6972,
+ "text_loss": 0.47892290353775024
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0002903363990108524,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11248806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002133697969838977,
+ "skip_count": 0.0,
+ "step": 6974,
+ "text_loss": 0.2561415433883667
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 32.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0002900554492198677,
+ "loss": 0.011,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11251807.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.002402493730187416,
+ "skip_count": 0.0,
+ "step": 6976,
+ "text_loss": 0.652428388595581
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.0002897745798800311,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11254615.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006423915736377239,
+ "skip_count": 0.0,
+ "step": 6978,
+ "text_loss": 0.22414511442184448
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.77017904314646,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.000289493791098972,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11257721.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002536606043577194,
+ "skip_count": 0.0,
+ "step": 6980,
+ "text_loss": 0.1328018754720688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.00028921308298428933,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11260840.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000745086173992604,
+ "skip_count": 0.0,
+ "step": 6982,
+ "text_loss": 0.61724853515625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 0.0002889324556435509,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11264279.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005258981604129076,
+ "skip_count": 0.0,
+ "step": 6984,
+ "text_loss": 0.1664455235004425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.00028865190918429356,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11268096.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008756023598834872,
+ "skip_count": 0.0,
+ "step": 6986,
+ "text_loss": 0.45111921429634094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.00028837144371402336,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11270611.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008175788098014891,
+ "skip_count": 0.0,
+ "step": 6988,
+ "text_loss": 0.5332239270210266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.81714117992369,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.00028809105934021517,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11273826.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003494064789265394,
+ "skip_count": 0.0,
+ "step": 6990,
+ "text_loss": 0.20264241099357605
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.82653360727913,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0002878107561703127,
+ "loss": 0.0056,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 11276917.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.025257345288991928,
+ "skip_count": 3.0,
+ "step": 6992,
+ "text_loss": 0.18000070750713348
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.835926034634575,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0002875305343117289,
+ "loss": 0.0044,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 11279637.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.019206687808036804,
+ "skip_count": 1.0,
+ "step": 6994,
+ "text_loss": 0.5872798562049866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.00028725039387184504,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11282717.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009358765557408333,
+ "skip_count": 1.0,
+ "step": 6996,
+ "text_loss": 0.3412095904350281
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 32.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.00028697033495801163,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 11285433.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038775671273469925,
+ "skip_count": 1.0,
+ "step": 6998,
+ "text_loss": 0.4316727817058563
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0002866903576775475,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11288414.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004292591474950314,
+ "skip_count": 0.0,
+ "step": 7000,
+ "text_loss": 0.45106515288352966
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.873495744056356,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046875,
+ "learning_rate": 0.0002864104621377409,
+ "loss": 0.007,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 11291811.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02195967361330986,
+ "skip_count": 2.0,
+ "step": 7002,
+ "text_loss": 0.29841285943984985
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.0002861306484458481,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11295179.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010119527578353882,
+ "skip_count": 0.0,
+ "step": 7004,
+ "text_loss": 0.5218569040298462
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.00028585091670909436,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11298182.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002615996403619647,
+ "skip_count": 0.0,
+ "step": 7006,
+ "text_loss": 0.20382621884346008
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00028557126703467316,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11301262.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002726050792261958,
+ "skip_count": 0.0,
+ "step": 7008,
+ "text_loss": 0.26718559861183167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0002852916995297471,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11304590.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005590448854491115,
+ "skip_count": 0.0,
+ "step": 7010,
+ "text_loss": 0.5392091274261475
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.92045788083358,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00028501221430144667,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11307690.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004541353322565556,
+ "skip_count": 2.0,
+ "step": 7012,
+ "text_loss": 0.16159705817699432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.00028473281145687137,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11310866.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029630991630256176,
+ "skip_count": 1.0,
+ "step": 7014,
+ "text_loss": 0.9148072600364685
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 28.0,
+ "epoch": 32.93924273554447,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0002844534911030888,
+ "loss": 0.0067,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 11314517.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.023258809000253677,
+ "skip_count": 3.0,
+ "step": 7016,
+ "text_loss": 0.3853590488433838
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.94863516289991,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060546875,
+ "learning_rate": 0.000284174253347135,
+ "loss": 0.0064,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 11317526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010060093365609646,
+ "skip_count": 1.0,
+ "step": 7018,
+ "text_loss": 0.3412325382232666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00028389509829601444,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11321684.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016713893273845315,
+ "skip_count": 0.0,
+ "step": 7020,
+ "text_loss": 0.9049796462059021
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00028361602605670003,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11324709.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004167001228779554,
+ "skip_count": 2.0,
+ "step": 7022,
+ "text_loss": 0.24364058673381805
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 32.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00028333703673613224,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11327449.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027954576071351767,
+ "skip_count": 4.0,
+ "step": 7024,
+ "text_loss": 0.2872125506401062
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.00028305813044122096,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11330846.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004644687287509441,
+ "skip_count": 0.0,
+ "step": 7026,
+ "text_loss": 0.1717570424079895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.00028277930727884336,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11333575.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00557848671451211,
+ "skip_count": 2.0,
+ "step": 7028,
+ "text_loss": 0.3501792550086975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00028250056735584496,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11336899.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005694970604963601,
+ "skip_count": 0.0,
+ "step": 7030,
+ "text_loss": 0.5541794300079346
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00028222191077903946,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11340163.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032896639313548803,
+ "skip_count": 0.0,
+ "step": 7032,
+ "text_loss": 0.5618721842765808
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 33.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00028194333765520853,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11343494.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005377276800572872,
+ "skip_count": 0.0,
+ "step": 7034,
+ "text_loss": 0.325153648853302
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.00028166484809110206,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11346126.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001204605447128415,
+ "skip_count": 0.0,
+ "step": 7036,
+ "text_loss": 0.5016651749610901
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.00028138644219343736,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11348879.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005026837810873985,
+ "skip_count": 2.0,
+ "step": 7038,
+ "text_loss": 0.2430499643087387
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.05165835045494,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00028110812006890064,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11352457.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019850607495754957,
+ "skip_count": 0.0,
+ "step": 7040,
+ "text_loss": 0.42376917600631714
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.061050777810394,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0179443359375,
+ "learning_rate": 0.00028082988182414524,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 11356602.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003362950636073947,
+ "skip_count": 2.0,
+ "step": 7042,
+ "text_loss": 0.4165397882461548
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0002805517275657926,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11359451.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019725612364709377,
+ "skip_count": 1.0,
+ "step": 7044,
+ "text_loss": 0.5597621202468872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0002802736574004319,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11363614.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013963640667498112,
+ "skip_count": 0.0,
+ "step": 7046,
+ "text_loss": 0.6112356185913086
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.00027999567143462015,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11367015.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005658161826431751,
+ "skip_count": 0.0,
+ "step": 7048,
+ "text_loss": 0.4920886754989624
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 33.09862048723217,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00027971776977488193,
+ "loss": 0.0064,
+ "macro_f1": 0.925203263759613,
+ "num_tokens": 11370489.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.03657131269574165,
+ "skip_count": 5.0,
+ "step": 7050,
+ "text_loss": 0.28003939986228943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01708984375,
+ "learning_rate": 0.00027943995252771017,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11373614.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004096088465303183,
+ "skip_count": 2.0,
+ "step": 7052,
+ "text_loss": 0.3145081400871277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.117405341943055,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.00027916221979956457,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11377631.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009888096246868372,
+ "skip_count": 0.0,
+ "step": 7054,
+ "text_loss": 0.4898056983947754
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.126797769298506,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.00027888457169687297,
+ "loss": 0.0065,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 11380620.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013347696512937546,
+ "skip_count": 1.0,
+ "step": 7056,
+ "text_loss": 0.7011964917182922
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.00027860700832603056,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11383297.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000849733711220324,
+ "skip_count": 1.0,
+ "step": 7058,
+ "text_loss": 0.4007014334201813
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0002783295297934003,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11386460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001546313869766891,
+ "skip_count": 1.0,
+ "step": 7060,
+ "text_loss": 0.3992713689804077
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0002780521362053123,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11389605.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001045585609972477,
+ "skip_count": 0.0,
+ "step": 7062,
+ "text_loss": 0.4440680146217346
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 33.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.00027777482766806446,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11392105.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00752411549910903,
+ "skip_count": 0.0,
+ "step": 7064,
+ "text_loss": 0.20152349770069122
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 33.17375990607572,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0002774976042879218,
+ "loss": 0.0088,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 11396142.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019917849451303482,
+ "skip_count": 3.0,
+ "step": 7066,
+ "text_loss": 0.24365149438381195
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 33.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.00027722046617111696,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11398827.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015933843096718192,
+ "skip_count": 0.0,
+ "step": 7068,
+ "text_loss": 0.31948477029800415
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.19254476078662,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.00027694341342384977,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11402623.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018986845389008522,
+ "skip_count": 2.0,
+ "step": 7070,
+ "text_loss": 0.47721394896507263
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.00027666644615228727,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11405628.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002975719515234232,
+ "skip_count": 1.0,
+ "step": 7072,
+ "text_loss": 0.3972358703613281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0002763895644625637,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11409468.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005657708737999201,
+ "skip_count": 1.0,
+ "step": 7074,
+ "text_loss": 0.6004229187965393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0002761127684607811,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11412572.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038351903203874826,
+ "skip_count": 2.0,
+ "step": 7076,
+ "text_loss": 1.0837591886520386
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 33.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.00027583605825300795,
+ "loss": 0.0056,
+ "macro_f1": 1.0,
+ "num_tokens": 11416831.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005529445596039295,
+ "skip_count": 2.0,
+ "step": 7078,
+ "text_loss": 0.575986921787262
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.00027555943394528014,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11420557.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006243749521672726,
+ "skip_count": 0.0,
+ "step": 7080,
+ "text_loss": 0.606263279914856
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.248899324919286,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00027528289564360064,
+ "loss": 0.0058,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 11423471.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.031515009701251984,
+ "skip_count": 1.0,
+ "step": 7082,
+ "text_loss": 0.19393208622932434
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0002750064434539394,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11426732.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005052287015132606,
+ "skip_count": 0.0,
+ "step": 7084,
+ "text_loss": 0.7202399969100952
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00027473007748223357,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11429391.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005099403206259012,
+ "skip_count": 1.0,
+ "step": 7086,
+ "text_loss": 0.20651355385780334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00027445379783438685,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11432161.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001447655027732253,
+ "skip_count": 0.0,
+ "step": 7088,
+ "text_loss": 0.34758952260017395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.28646903434106,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.00027417760461627037,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11435417.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000808655982837081,
+ "skip_count": 0.0,
+ "step": 7090,
+ "text_loss": 0.7414838671684265
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.295861461696504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.00027390149793372177,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11438313.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005151710007339716,
+ "skip_count": 0.0,
+ "step": 7092,
+ "text_loss": 0.17792417109012604
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.00027362547789254574,
+ "loss": 0.0064,
+ "macro_f1": 1.0,
+ "num_tokens": 11441681.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037353152874857187,
+ "skip_count": 3.0,
+ "step": 7094,
+ "text_loss": 0.5577781796455383
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 0.0002733495445985135,
+ "loss": 0.0026,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11444521.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00038075417978689075,
+ "skip_count": 0.0,
+ "step": 7096,
+ "text_loss": 0.5052862167358398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.32403874376284,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.0002730736981573632,
+ "loss": 0.0033,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 11448481.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007313522044569254,
+ "skip_count": 1.0,
+ "step": 7098,
+ "text_loss": 0.5869139432907104
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0002727979386748001,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11452164.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020673887338489294,
+ "skip_count": 0.0,
+ "step": 7100,
+ "text_loss": 0.4354212284088135
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0002725222662564954,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11455995.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008315460290759802,
+ "skip_count": 0.0,
+ "step": 7102,
+ "text_loss": 0.8714128732681274
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 33.35221602582917,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0002722466810080874,
+ "loss": 0.0053,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 11458828.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010913078673183918,
+ "skip_count": 1.0,
+ "step": 7104,
+ "text_loss": 0.6226683855056763
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.36160845318462,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0002719711830351809,
+ "loss": 0.0076,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 11462448.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.040428292006254196,
+ "skip_count": 1.0,
+ "step": 7106,
+ "text_loss": 0.2543688118457794
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00027169577244334726,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11465796.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004473939072340727,
+ "skip_count": 1.0,
+ "step": 7108,
+ "text_loss": 0.12356872111558914
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.00027142044933812424,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11469176.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017961655976250768,
+ "skip_count": 0.0,
+ "step": 7110,
+ "text_loss": 0.6800211668014526
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 33.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.0002711452138250162,
+ "loss": 0.0065,
+ "macro_f1": 1.0,
+ "num_tokens": 11471983.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.003279087832197547,
+ "skip_count": 2.0,
+ "step": 7112,
+ "text_loss": 0.340279757976532
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.3991781626064,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.00027087006600949403,
+ "loss": 0.0065,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 11475656.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017024178057909012,
+ "skip_count": 1.0,
+ "step": 7114,
+ "text_loss": 0.3556337058544159
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0002705950059969948,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11479410.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015487123280763626,
+ "skip_count": 1.0,
+ "step": 7116,
+ "text_loss": 0.4404350817203522
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.00027032003389292194,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11483302.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011217560386285186,
+ "skip_count": 0.0,
+ "step": 7118,
+ "text_loss": 0.46771445870399475
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.427355444672735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0002700451498026454,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11486212.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010832607513293624,
+ "skip_count": 0.0,
+ "step": 7120,
+ "text_loss": 0.6795281767845154
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00026977035383150106,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11489320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002290027216076851,
+ "skip_count": 1.0,
+ "step": 7122,
+ "text_loss": 0.5304523706436157
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 33.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.00026949564608479164,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 11492056.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009950211271643639,
+ "skip_count": 6.0,
+ "step": 7124,
+ "text_loss": 0.21328973770141602
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 33.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0185546875,
+ "learning_rate": 0.0002692210266677855,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11495165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0079165268689394,
+ "skip_count": 3.0,
+ "step": 7126,
+ "text_loss": 0.19840657711029053
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.00026894649568571724,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11497636.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013852717820554972,
+ "skip_count": 0.0,
+ "step": 7128,
+ "text_loss": 0.3360055088996887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.00026867205324378776,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11500806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010151927126571536,
+ "skip_count": 0.0,
+ "step": 7130,
+ "text_loss": 0.6827390193939209
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.00026839769944716373,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11504187.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001110393786802888,
+ "skip_count": 0.0,
+ "step": 7132,
+ "text_loss": 0.5081584453582764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.49310243616085,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0002681234344009783,
+ "loss": 0.0071,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 11507900.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010587670840322971,
+ "skip_count": 1.0,
+ "step": 7134,
+ "text_loss": 0.28684356808662415
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00026784925821033014,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11510627.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006658690981566906,
+ "skip_count": 0.0,
+ "step": 7136,
+ "text_loss": 0.24232104420661926
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.00026757517098028417,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11513304.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014556109672412276,
+ "skip_count": 0.0,
+ "step": 7138,
+ "text_loss": 0.4718358516693115
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 33.52127971822718,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00026730117281587116,
+ "loss": 0.0062,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 11516593.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01590067707002163,
+ "skip_count": 3.0,
+ "step": 7140,
+ "text_loss": 0.2810344696044922
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.53067214558262,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00026702726382208774,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11519776.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014479428064078093,
+ "skip_count": 0.0,
+ "step": 7142,
+ "text_loss": 0.48876339197158813
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00026675344410389623,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11522499.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003729258431121707,
+ "skip_count": 2.0,
+ "step": 7144,
+ "text_loss": 0.5350890755653381
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 33.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0002664797137662248,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 11525220.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015156447188928723,
+ "skip_count": 1.0,
+ "step": 7146,
+ "text_loss": 0.5742373466491699
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 33.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.00026620607291396773,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 11527926.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.004842780064791441,
+ "skip_count": 2.0,
+ "step": 7148,
+ "text_loss": 0.4994547665119171
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.5682418550044,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.00026593252165198455,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11531622.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026556351222097874,
+ "skip_count": 0.0,
+ "step": 7150,
+ "text_loss": 0.1567893922328949
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.00026565906008510064,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11535191.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008135059848427773,
+ "skip_count": 1.0,
+ "step": 7152,
+ "text_loss": 0.289173424243927
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 33.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.000265385688318107,
+ "loss": 0.0083,
+ "macro_f1": 1.0,
+ "num_tokens": 11539060.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0020754633005708456,
+ "skip_count": 1.0,
+ "step": 7154,
+ "text_loss": 0.35089045763015747
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 33.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.0002651124064557602,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11541662.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0023738413583487272,
+ "skip_count": 0.0,
+ "step": 7156,
+ "text_loss": 0.5026801228523254
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.00026483921460278227,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11544763.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003311366541311145,
+ "skip_count": 1.0,
+ "step": 7158,
+ "text_loss": 0.22975654900074005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.61520399178163,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0002645661128638609,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11547649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008209354127757251,
+ "skip_count": 0.0,
+ "step": 7160,
+ "text_loss": 0.32840636372566223
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.00026429310134364926,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11550648.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028574815951287746,
+ "skip_count": 0.0,
+ "step": 7162,
+ "text_loss": 0.23239612579345703
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0177001953125,
+ "learning_rate": 0.00026402018014676584,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11553790.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005469404626637697,
+ "skip_count": 1.0,
+ "step": 7164,
+ "text_loss": 0.22877025604248047
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.0002637473493777943,
+ "loss": 0.0046,
+ "macro_f1": 1.0,
+ "num_tokens": 11556802.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0032242932356894016,
+ "skip_count": 2.0,
+ "step": 7166,
+ "text_loss": 0.6376226544380188
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.65277370120341,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.00026347460914128443,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 11559607.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0040627880953252316,
+ "skip_count": 2.0,
+ "step": 7168,
+ "text_loss": 0.6879657506942749
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.00026320195954175043,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 11562677.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.020494163036346436,
+ "skip_count": 4.0,
+ "step": 7170,
+ "text_loss": 0.3710069954395294
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06787109375,
+ "learning_rate": 0.00026292940068367224,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11565948.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002662271959707141,
+ "skip_count": 0.0,
+ "step": 7172,
+ "text_loss": 0.15041157603263855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00026265693267149494,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11568836.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0039914860390126705,
+ "skip_count": 1.0,
+ "step": 7174,
+ "text_loss": 0.5372130870819092
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.00026238455560962884,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11572542.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034708199091255665,
+ "skip_count": 0.0,
+ "step": 7176,
+ "text_loss": 0.2956286072731018
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.699735837980626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 0.00026211226960244914,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11575352.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007794995326548815,
+ "skip_count": 2.0,
+ "step": 7178,
+ "text_loss": 0.3691073954105377
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.0002618400747542964,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11579110.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009694626205600798,
+ "skip_count": 0.0,
+ "step": 7180,
+ "text_loss": 0.6523211598396301
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.71852069269152,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0002615679711694764,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11582476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004227840341627598,
+ "skip_count": 1.0,
+ "step": 7182,
+ "text_loss": 0.1997286081314087
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.72791312004696,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 0.00026129595895225965,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11585685.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00126146269030869,
+ "skip_count": 0.0,
+ "step": 7184,
+ "text_loss": 0.486299604177475
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 33.73730554740241,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0002610240382068818,
+ "loss": 0.006,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 11588804.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04553814232349396,
+ "skip_count": 4.0,
+ "step": 7186,
+ "text_loss": 0.1622236669063568
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 0.00026075220903754324,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11591822.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002460496500134468,
+ "skip_count": 2.0,
+ "step": 7188,
+ "text_loss": 0.5573232173919678
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0002604804715484095,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11594899.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006854622159153223,
+ "skip_count": 1.0,
+ "step": 7190,
+ "text_loss": 0.4753095507621765
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.00026020882584361094,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11598333.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001945660449564457,
+ "skip_count": 1.0,
+ "step": 7192,
+ "text_loss": 0.8912903666496277
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 31.0,
+ "epoch": 33.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061767578125,
+ "learning_rate": 0.0002599372720272426,
+ "loss": 0.0064,
+ "macro_f1": 1.0,
+ "num_tokens": 11601814.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.005749753676354885,
+ "skip_count": 1.0,
+ "step": 7194,
+ "text_loss": 0.6041871905326843
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0002596658102033643,
+ "loss": 0.0097,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11604661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025942171923816204,
+ "skip_count": 1.0,
+ "step": 7196,
+ "text_loss": 0.4760607182979584
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 33.793660111535075,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00025939444047600114,
+ "loss": 0.0075,
+ "macro_f1": 0.8807588815689087,
+ "num_tokens": 11608459.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.020141327753663063,
+ "skip_count": 6.0,
+ "step": 7198,
+ "text_loss": 0.6670252084732056
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0002591231629491423,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11611489.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005721202120184898,
+ "skip_count": 1.0,
+ "step": 7200,
+ "text_loss": 0.31318753957748413
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00025885197772674174,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11615234.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027279339265078306,
+ "skip_count": 1.0,
+ "step": 7202,
+ "text_loss": 0.25728851556777954
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00025858088491271825,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11618892.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006987092201597989,
+ "skip_count": 0.0,
+ "step": 7204,
+ "text_loss": 0.5504243969917297
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00025830988461095504,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11622237.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029056845232844353,
+ "skip_count": 0.0,
+ "step": 7206,
+ "text_loss": 0.5319080948829651
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.8406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 0.0002580389769253001,
+ "loss": 0.0041,
+ "macro_f1": 1.0,
+ "num_tokens": 11624713.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.007346974220126867,
+ "skip_count": 5.0,
+ "step": 7208,
+ "text_loss": 0.8925374746322632
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 0.0002577681619595655,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11628689.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004166684520896524,
+ "skip_count": 0.0,
+ "step": 7210,
+ "text_loss": 0.37282413244247437
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.00025749743981752824,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11631581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013194780796766281,
+ "skip_count": 2.0,
+ "step": 7212,
+ "text_loss": 0.220115065574646
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.0002572268106029295,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11634503.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009112557163462043,
+ "skip_count": 0.0,
+ "step": 7214,
+ "text_loss": 0.5631879568099976
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.878191957734074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.00025695627441947496,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11637790.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011178883723914623,
+ "skip_count": 2.0,
+ "step": 7216,
+ "text_loss": 0.24482154846191406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.887584385089525,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00025668583137083447,
+ "loss": 0.0047,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 11640806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01877705194056034,
+ "skip_count": 2.0,
+ "step": 7218,
+ "text_loss": 0.2229214459657669
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0002564154815606422,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11644479.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030277224723249674,
+ "skip_count": 0.0,
+ "step": 7220,
+ "text_loss": 0.6025711894035339
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.00025614522509249715,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11647340.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002354414900764823,
+ "skip_count": 1.0,
+ "step": 7222,
+ "text_loss": 0.6497155427932739
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0002558750620699618,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 11650433.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009801039472222328,
+ "skip_count": 2.0,
+ "step": 7224,
+ "text_loss": 0.32049307227134705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0002556049925965632,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11654451.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002949854824692011,
+ "skip_count": 0.0,
+ "step": 7226,
+ "text_loss": 0.17923395335674286
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 33.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00025533501677579254,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 11657440.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0032915703486651182,
+ "skip_count": 1.0,
+ "step": 7228,
+ "text_loss": 0.60064297914505
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 33.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.0002550651347111049,
+ "loss": 0.0046,
+ "macro_f1": 1.0,
+ "num_tokens": 11660599.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00594533933326602,
+ "skip_count": 1.0,
+ "step": 7230,
+ "text_loss": 0.32829397916793823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.95333137657764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00025479534650591976,
+ "loss": 0.0032,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11663387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014214308466762304,
+ "skip_count": 0.0,
+ "step": 7232,
+ "text_loss": 0.7317177653312683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 0.00025452565226362036,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11666729.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0056374757550656796,
+ "skip_count": 2.0,
+ "step": 7234,
+ "text_loss": 0.3394623398780823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 33.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0152587890625,
+ "learning_rate": 0.00025425605208755406,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11669871.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006422565318644047,
+ "skip_count": 3.0,
+ "step": 7236,
+ "text_loss": 0.1725512444972992
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 33.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0002539865460810322,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11673008.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0023537934757769108,
+ "skip_count": 0.0,
+ "step": 7238,
+ "text_loss": 0.8873519897460938
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.00025371713434733,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11675988.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026300614699721336,
+ "skip_count": 1.0,
+ "step": 7240,
+ "text_loss": 0.4877084195613861
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 34.0,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0002534478169896864,
+ "loss": 0.0052,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 11679068.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.019549336284399033,
+ "skip_count": 3.0,
+ "step": 7242,
+ "text_loss": 0.15101417899131775
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.00939242735544,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0002531785941113044,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11682205.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007769173942506313,
+ "skip_count": 1.0,
+ "step": 7244,
+ "text_loss": 0.4035153090953827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0002529094658153508,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11685162.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003636054927483201,
+ "skip_count": 0.0,
+ "step": 7246,
+ "text_loss": 0.21048080921173096
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048583984375,
+ "learning_rate": 0.00025264043220495606,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11688512.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013363865436986089,
+ "skip_count": 0.0,
+ "step": 7248,
+ "text_loss": 0.6582038402557373
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00025237149338321437,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11691753.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005587349878624082,
+ "skip_count": 0.0,
+ "step": 7250,
+ "text_loss": 0.6899203658103943
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0002521026494531835,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11694689.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006221035961061716,
+ "skip_count": 0.0,
+ "step": 7252,
+ "text_loss": 0.17377600073814392
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.000251833900517885,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11697950.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004368607886135578,
+ "skip_count": 1.0,
+ "step": 7254,
+ "text_loss": 0.4147649109363556
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.000251565246680304,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11701214.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038269520737230778,
+ "skip_count": 2.0,
+ "step": 7256,
+ "text_loss": 0.42076823115348816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.00025129668804338906,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11703935.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011755652958527207,
+ "skip_count": 0.0,
+ "step": 7258,
+ "text_loss": 0.5484340190887451
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.00025102822471005247,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 11706818.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00735129788517952,
+ "skip_count": 2.0,
+ "step": 7260,
+ "text_loss": 0.29214802384376526
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.09392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00025075985678316983,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11709979.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0011552777141332626,
+ "skip_count": 0.0,
+ "step": 7262,
+ "text_loss": 0.6514551639556885
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 34.10331670090989,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0002504915843655802,
+ "loss": 0.0067,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 11714075.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01438678614795208,
+ "skip_count": 4.0,
+ "step": 7264,
+ "text_loss": 0.5144859552383423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0002502234075600862,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11717610.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027831171173602343,
+ "skip_count": 0.0,
+ "step": 7266,
+ "text_loss": 0.6494308114051819
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00024995532646945336,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11721415.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012327058939263225,
+ "skip_count": 0.0,
+ "step": 7268,
+ "text_loss": 0.5111991763114929
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 34.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0002496873411964113,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 11724488.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.003060065908357501,
+ "skip_count": 1.0,
+ "step": 7270,
+ "text_loss": 0.5780492424964905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0002494194518436523,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11727708.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001369593315757811,
+ "skip_count": 0.0,
+ "step": 7272,
+ "text_loss": 0.3151950240135193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.00024915165851383203,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11730897.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005724756047129631,
+ "skip_count": 0.0,
+ "step": 7274,
+ "text_loss": 0.5267965197563171
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.00024888396130956947,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11733870.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010036137886345387,
+ "skip_count": 0.0,
+ "step": 7276,
+ "text_loss": 0.5330777168273926
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00024861636033344657,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11737413.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008341848850250244,
+ "skip_count": 2.0,
+ "step": 7278,
+ "text_loss": 0.25949522852897644
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.0002483488556880087,
+ "loss": 0.0061,
+ "macro_f1": 1.0,
+ "num_tokens": 11740691.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008208763785660267,
+ "skip_count": 2.0,
+ "step": 7280,
+ "text_loss": 0.1867891401052475
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.000248081447475764,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11743715.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038434381131082773,
+ "skip_count": 0.0,
+ "step": 7282,
+ "text_loss": 0.4835410416126251
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0002478141357991838,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11746818.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019067893736064434,
+ "skip_count": 0.0,
+ "step": 7284,
+ "text_loss": 0.5959038734436035
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.00024754692076070256,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11750160.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007199060171842575,
+ "skip_count": 0.0,
+ "step": 7286,
+ "text_loss": 0.5068115592002869
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.21602582917523,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0002472798024627175,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11752836.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014214382972568274,
+ "skip_count": 0.0,
+ "step": 7288,
+ "text_loss": 0.5742631554603577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0002470127810075889,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11756276.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018025166355073452,
+ "skip_count": 0.0,
+ "step": 7290,
+ "text_loss": 0.6616888642311096
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.00024674585649763983,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11760235.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024077212437987328,
+ "skip_count": 0.0,
+ "step": 7292,
+ "text_loss": 0.7984768748283386
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 0.00024647902903515614,
+ "loss": 0.009,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11763430.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007843999192118645,
+ "skip_count": 1.0,
+ "step": 7294,
+ "text_loss": 0.1943647861480713
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0002462122987223869,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11766583.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019727738108485937,
+ "skip_count": 0.0,
+ "step": 7296,
+ "text_loss": 0.43924200534820557
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6000000238418579,
+ "avg_layers": 27.0,
+ "epoch": 34.26298796595245,
+ "f1_execute": 0.9545454382896423,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.75,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0002459456656615436,
+ "loss": 0.0069,
+ "macro_f1": 0.9015151858329773,
+ "num_tokens": 11770360.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04594529792666435,
+ "skip_count": 5.0,
+ "step": 7298,
+ "text_loss": 0.32582250237464905
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0002456791299548004,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11773239.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0011880286037921906,
+ "skip_count": 0.0,
+ "step": 7300,
+ "text_loss": 0.7723727226257324
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.00024541269170429435,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11776945.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010577787179499865,
+ "skip_count": 0.0,
+ "step": 7302,
+ "text_loss": 0.8173839449882507
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0002451463510121252,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11780121.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019757342524826527,
+ "skip_count": 0.0,
+ "step": 7304,
+ "text_loss": 0.4015064239501953
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.000244880107980355,
+ "loss": 0.0106,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11783172.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002577328821644187,
+ "skip_count": 0.0,
+ "step": 7306,
+ "text_loss": 0.5465171933174133
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 0.00024461396271100876,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11788608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004162502940744162,
+ "skip_count": 0.0,
+ "step": 7308,
+ "text_loss": 0.2419646978378296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0002443479153060735,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11791912.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003301614662632346,
+ "skip_count": 0.0,
+ "step": 7310,
+ "text_loss": 0.2568489909172058
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.00024408196586749964,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11794849.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019893983844667673,
+ "skip_count": 0.0,
+ "step": 7312,
+ "text_loss": 0.7044196128845215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0002438161144971992,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11797587.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006637922488152981,
+ "skip_count": 1.0,
+ "step": 7314,
+ "text_loss": 0.6863232254981995
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.000243550361297047,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11800173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003078785724937916,
+ "skip_count": 2.0,
+ "step": 7316,
+ "text_loss": 0.2868897616863251
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.00024328470636888005,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11802889.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011882453691214323,
+ "skip_count": 0.0,
+ "step": 7318,
+ "text_loss": 0.5522798299789429
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0002430191498144979,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11805607.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008720619371160865,
+ "skip_count": 0.0,
+ "step": 7320,
+ "text_loss": 0.5531370639801025
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.00024275369173566236,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11808838.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003213440766558051,
+ "skip_count": 0.0,
+ "step": 7322,
+ "text_loss": 0.5252627730369568
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.385089521573235,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.00024248833223409715,
+ "loss": 0.0102,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11811965.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004736232105642557,
+ "skip_count": 1.0,
+ "step": 7324,
+ "text_loss": 0.6033701300621033
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.00024222307141148907,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11814832.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007559265359304845,
+ "skip_count": 0.0,
+ "step": 7326,
+ "text_loss": 0.5607737302780151
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.00024195790936948626,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11818802.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005338212475180626,
+ "skip_count": 2.0,
+ "step": 7328,
+ "text_loss": 0.20618735253810883
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 34.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0002416928462096994,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11821998.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001919696107506752,
+ "skip_count": 3.0,
+ "step": 7330,
+ "text_loss": 0.42486369609832764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.00024142788203370107,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11824505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013797834981232882,
+ "skip_count": 0.0,
+ "step": 7332,
+ "text_loss": 0.48403388261795044
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.43205165835045,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00024116301694302621,
+ "loss": 0.0053,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 11828504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008978237397968769,
+ "skip_count": 1.0,
+ "step": 7334,
+ "text_loss": 0.43872755765914917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.441444085705896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.00024089825103917152,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11831171.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004589964635670185,
+ "skip_count": 1.0,
+ "step": 7336,
+ "text_loss": 0.5126842260360718
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.00024063358442359572,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11834387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002857893006876111,
+ "skip_count": 0.0,
+ "step": 7338,
+ "text_loss": 0.7521272301673889
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0002403690171977197,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11838693.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009023012826219201,
+ "skip_count": 0.0,
+ "step": 7340,
+ "text_loss": 0.6335242390632629
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00024010454946292586,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11841882.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010992717929184437,
+ "skip_count": 0.0,
+ "step": 7342,
+ "text_loss": 0.64045649766922
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.47901379512768,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0002398401813205592,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11845181.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002247930970042944,
+ "skip_count": 2.0,
+ "step": 7344,
+ "text_loss": 0.31022098660469055
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.00023957591287192577,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11848537.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003184020286425948,
+ "skip_count": 2.0,
+ "step": 7346,
+ "text_loss": 0.5709269642829895
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.00023931174421829376,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 11851437.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006582654081285,
+ "skip_count": 4.0,
+ "step": 7348,
+ "text_loss": 0.3547070026397705
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.00023904767546089318,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11854161.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0022124287206679583,
+ "skip_count": 0.0,
+ "step": 7350,
+ "text_loss": 0.6984702348709106
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.00023878370670091565,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11856811.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0029868825804442167,
+ "skip_count": 0.0,
+ "step": 7352,
+ "text_loss": 0.25389090180397034
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01708984375,
+ "learning_rate": 0.00023851983803951444,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11860110.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028468978125602007,
+ "skip_count": 1.0,
+ "step": 7354,
+ "text_loss": 0.5729252099990845
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.00023825606957780454,
+ "loss": 0.0041,
+ "macro_f1": 1.0,
+ "num_tokens": 11863058.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003115740604698658,
+ "skip_count": 2.0,
+ "step": 7356,
+ "text_loss": 0.60753333568573
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.00023799240141686258,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11865865.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022254586219787598,
+ "skip_count": 0.0,
+ "step": 7358,
+ "text_loss": 0.2568866014480591
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.55415321397123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.00023772883365772658,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11869133.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017388637643307447,
+ "skip_count": 0.0,
+ "step": 7360,
+ "text_loss": 0.7657097578048706
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00023746536640139633,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11872988.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002158832037821412,
+ "skip_count": 0.0,
+ "step": 7362,
+ "text_loss": 0.19717472791671753
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.57293806868213,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00023720199974883294,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11875810.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001037398586049676,
+ "skip_count": 0.0,
+ "step": 7364,
+ "text_loss": 0.47334593534469604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 34.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.00023693873380095876,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11878558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011853457428514957,
+ "skip_count": 5.0,
+ "step": 7366,
+ "text_loss": 0.2567826211452484
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01806640625,
+ "learning_rate": 0.00023667556865865824,
+ "loss": 0.0034,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11881473.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015339091187343001,
+ "skip_count": 0.0,
+ "step": 7368,
+ "text_loss": 0.40981143712997437
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.00023641250442277655,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11885033.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010062574408948421,
+ "skip_count": 0.0,
+ "step": 7370,
+ "text_loss": 0.3183043301105499
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 0.00023614954119412042,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11889136.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010769609361886978,
+ "skip_count": 0.0,
+ "step": 7372,
+ "text_loss": 0.5279555916786194
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 34.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 0.00023588667907345785,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11893102.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032862431835383177,
+ "skip_count": 3.0,
+ "step": 7374,
+ "text_loss": 0.5425930023193359
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 34.629292632814796,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.00023562391816151808,
+ "loss": 0.0057,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 11895841.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02405562624335289,
+ "skip_count": 3.0,
+ "step": 7376,
+ "text_loss": 0.26054954528808594
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00023536125855899153,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 11899594.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008315852843225002,
+ "skip_count": 3.0,
+ "step": 7378,
+ "text_loss": 0.19068174064159393
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 34.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.00023509870036652998,
+ "loss": 0.0065,
+ "macro_f1": 1.0,
+ "num_tokens": 11902843.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006180883850902319,
+ "skip_count": 4.0,
+ "step": 7380,
+ "text_loss": 0.18461982905864716
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.00023483624368474614,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11905786.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008856299100443721,
+ "skip_count": 0.0,
+ "step": 7382,
+ "text_loss": 0.5216618180274963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.66686234223657,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00023457388861421397,
+ "loss": 0.0059,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 11908706.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04762765392661095,
+ "skip_count": 1.0,
+ "step": 7384,
+ "text_loss": 0.25329193472862244
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 34.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.00023431163525546833,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 11911862.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.000989250373095274,
+ "skip_count": 1.0,
+ "step": 7386,
+ "text_loss": 0.2657507658004761
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.685647196947464,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01708984375,
+ "learning_rate": 0.0002340494837090053,
+ "loss": 0.0032,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11915483.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008857969660311937,
+ "skip_count": 0.0,
+ "step": 7388,
+ "text_loss": 0.5136669874191284
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.00023378743407528164,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11918778.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041572838090360165,
+ "skip_count": 1.0,
+ "step": 7390,
+ "text_loss": 0.5212553143501282
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.00023352548645471556,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11921916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010537431808188558,
+ "skip_count": 0.0,
+ "step": 7392,
+ "text_loss": 0.48122525215148926
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00023326364094768576,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11924273.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004077036865055561,
+ "skip_count": 0.0,
+ "step": 7394,
+ "text_loss": 0.2128690630197525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00023300189765453194,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11927424.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005371362902224064,
+ "skip_count": 2.0,
+ "step": 7396,
+ "text_loss": 0.19448284804821014
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.73260933372468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00023274025667555464,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11930919.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002137752715498209,
+ "skip_count": 0.0,
+ "step": 7398,
+ "text_loss": 0.7537064552307129
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.00023247871811101512,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11933680.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002398790093138814,
+ "skip_count": 0.0,
+ "step": 7400,
+ "text_loss": 0.5589297413825989
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.751394188435576,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.00023221728206113546,
+ "loss": 0.008,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 11937090.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019718777388334274,
+ "skip_count": 1.0,
+ "step": 7402,
+ "text_loss": 0.8014751672744751
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.0002319559486260985,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11940581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001230534864589572,
+ "skip_count": 0.0,
+ "step": 7404,
+ "text_loss": 0.5218383073806763
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.77017904314646,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0002316947179060477,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11943832.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016393321566283703,
+ "skip_count": 0.0,
+ "step": 7406,
+ "text_loss": 0.17122556269168854
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.00023143359000108704,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11947025.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005269679240882397,
+ "skip_count": 2.0,
+ "step": 7408,
+ "text_loss": 0.2015499323606491
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.00023117256501128136,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 11950077.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005140089895576239,
+ "skip_count": 2.0,
+ "step": 7410,
+ "text_loss": 0.39068636298179626
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.00023091164303665592,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11953800.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005578748416155577,
+ "skip_count": 0.0,
+ "step": 7412,
+ "text_loss": 0.18851874768733978
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.00023065082417719624,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11956383.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006410991190932691,
+ "skip_count": 0.0,
+ "step": 7414,
+ "text_loss": 0.5663703083992004
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 34.81714117992369,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.0002303901085328491,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11959554.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005902954144403338,
+ "skip_count": 5.0,
+ "step": 7416,
+ "text_loss": 0.5225661993026733
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0002301294962035209,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11962582.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00045644037891179323,
+ "skip_count": 0.0,
+ "step": 7418,
+ "text_loss": 0.40572360157966614
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.0002298689872890789,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11965649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01017778366804123,
+ "skip_count": 2.0,
+ "step": 7420,
+ "text_loss": 0.12190715968608856
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.00022960858188935052,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11968850.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008010792662389576,
+ "skip_count": 0.0,
+ "step": 7422,
+ "text_loss": 0.5606820583343506
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0002293482801041236,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11972064.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001889281440526247,
+ "skip_count": 0.0,
+ "step": 7424,
+ "text_loss": 0.44142210483551025
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.00022908808203314635,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11975466.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00647713290527463,
+ "skip_count": 2.0,
+ "step": 7426,
+ "text_loss": 0.23273423314094543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0002288279877761271,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11979875.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004027119372040033,
+ "skip_count": 0.0,
+ "step": 7428,
+ "text_loss": 0.5608086585998535
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.0002285679974327345,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11982808.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009015435934998095,
+ "skip_count": 0.0,
+ "step": 7430,
+ "text_loss": 0.3976539373397827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0002283081111025973,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11985978.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00047143330448307097,
+ "skip_count": 0.0,
+ "step": 7432,
+ "text_loss": 0.4280148446559906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.00022804832888530447,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11988925.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004895820748060942,
+ "skip_count": 0.0,
+ "step": 7434,
+ "text_loss": 0.5137463808059692
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 0.000227788650880405,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11991631.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008349024574272335,
+ "skip_count": 0.0,
+ "step": 7436,
+ "text_loss": 0.4306720197200775
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.92045788083358,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00022752907718740807,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11995476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038723985198885202,
+ "skip_count": 0.0,
+ "step": 7438,
+ "text_loss": 0.6413722038269043
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043701171875,
+ "learning_rate": 0.00022726960790578248,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11998846.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004433541093021631,
+ "skip_count": 0.0,
+ "step": 7440,
+ "text_loss": 0.6424159407615662
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 23.0,
+ "epoch": 34.93924273554447,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0002270102431349579,
+ "loss": 0.0062,
+ "macro_f1": 0.6289562582969666,
+ "num_tokens": 12002228.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023979803547263145,
+ "skip_count": 6.0,
+ "step": 7442,
+ "text_loss": 0.16657918691635132
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 34.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00022675098297432307,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 12005003.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.005645833443850279,
+ "skip_count": 1.0,
+ "step": 7444,
+ "text_loss": 0.6388722658157349
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00022649182752322705,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12007657.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001629356062039733,
+ "skip_count": 2.0,
+ "step": 7446,
+ "text_loss": 0.35670006275177
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00022623277688097864,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12010652.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006375396624207497,
+ "skip_count": 2.0,
+ "step": 7448,
+ "text_loss": 0.24273613095283508
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0002259738311468466,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12014042.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003734540194272995,
+ "skip_count": 0.0,
+ "step": 7450,
+ "text_loss": 0.4262580871582031
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 34.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0002257149904200592,
+ "loss": 0.0076,
+ "macro_f1": 1.0,
+ "num_tokens": 12016987.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0027926203329116106,
+ "skip_count": 1.0,
+ "step": 7452,
+ "text_loss": 0.366216778755188
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.00022545625479980508,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12021584.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008985420572571456,
+ "skip_count": 0.0,
+ "step": 7454,
+ "text_loss": 0.533937394618988
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.00022519762438523205,
+ "loss": 0.0029,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12024142.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005394646432250738,
+ "skip_count": 1.0,
+ "step": 7456,
+ "text_loss": 0.2401239275932312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0002249390992754477,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12027262.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00275063537992537,
+ "skip_count": 0.0,
+ "step": 7458,
+ "text_loss": 0.21824975311756134
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00022468067956951944,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12030528.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008951274212449789,
+ "skip_count": 1.0,
+ "step": 7460,
+ "text_loss": 0.610903263092041
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00022442236536647408,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12033699.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004062872380018234,
+ "skip_count": 2.0,
+ "step": 7462,
+ "text_loss": 0.26921433210372925
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.00022416415676529823,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12037402.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023089025635272264,
+ "skip_count": 1.0,
+ "step": 7464,
+ "text_loss": 0.4746153950691223
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.05165835045494,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.00022390605386493756,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12041129.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021355501376092434,
+ "skip_count": 2.0,
+ "step": 7466,
+ "text_loss": 0.4265538454055786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.061050777810394,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.00022364805676429816,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12044356.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0061582159250974655,
+ "skip_count": 1.0,
+ "step": 7468,
+ "text_loss": 0.12020833045244217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.00022339016556224467,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12047158.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003753372235223651,
+ "skip_count": 1.0,
+ "step": 7470,
+ "text_loss": 0.6406939625740051
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 35.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.00022313238035760158,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 12050149.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005371729377657175,
+ "skip_count": 5.0,
+ "step": 7472,
+ "text_loss": 0.5184400677680969
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.0002228747012491526,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12053560.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000824139395263046,
+ "skip_count": 0.0,
+ "step": 7474,
+ "text_loss": 0.32644152641296387
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0002226171283356409,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12056309.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0044801668263971806,
+ "skip_count": 1.0,
+ "step": 7476,
+ "text_loss": 0.7027081847190857
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.00022235966171576887,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12059191.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007496353704482317,
+ "skip_count": 2.0,
+ "step": 7478,
+ "text_loss": 0.28705671429634094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.117405341943055,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.0002221023014881982,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12062365.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018641395727172494,
+ "skip_count": 1.0,
+ "step": 7480,
+ "text_loss": 0.715477466583252
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.126797769298506,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.00022184504775154984,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12065508.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005825075786560774,
+ "skip_count": 0.0,
+ "step": 7482,
+ "text_loss": 0.7481293678283691
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00022158790060440394,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12068043.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028906071092933416,
+ "skip_count": 0.0,
+ "step": 7484,
+ "text_loss": 0.6151962876319885
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.00022133086014529968,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12070897.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030862605199217796,
+ "skip_count": 1.0,
+ "step": 7486,
+ "text_loss": 0.4923575222492218
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.00022107392647273527,
+ "loss": 0.009,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12074644.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011101154377683997,
+ "skip_count": 0.0,
+ "step": 7488,
+ "text_loss": 0.5217859148979187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.00022081709968516867,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12077718.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004303969442844391,
+ "skip_count": 0.0,
+ "step": 7490,
+ "text_loss": 0.18933317065238953
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.17375990607572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.00022056037988101612,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12080509.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019941304344683886,
+ "skip_count": 1.0,
+ "step": 7492,
+ "text_loss": 0.6760565042495728
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.00022030376715865313,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12083580.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017090907786041498,
+ "skip_count": 0.0,
+ "step": 7494,
+ "text_loss": 0.4140956401824951
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.19254476078662,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0002200472616164142,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12086923.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005131757352501154,
+ "skip_count": 1.0,
+ "step": 7496,
+ "text_loss": 0.43287888169288635
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00021979086335259269,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12090003.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007472267607226968,
+ "skip_count": 0.0,
+ "step": 7498,
+ "text_loss": 0.6692602038383484
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00021953457246544095,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12092936.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012374494690448046,
+ "skip_count": 0.0,
+ "step": 7500,
+ "text_loss": 0.5170100331306458
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00021927838905317016,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12096395.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006784295197576284,
+ "skip_count": 2.0,
+ "step": 7502,
+ "text_loss": 0.340880811214447
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.00021902231321395017,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12099743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0058755455538630486,
+ "skip_count": 1.0,
+ "step": 7504,
+ "text_loss": 0.5299809575080872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00021876634504590985,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12103121.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010622406378388405,
+ "skip_count": 2.0,
+ "step": 7506,
+ "text_loss": 0.1817338913679123
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 35.248899324919286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.00021851048464713662,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12105883.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004382388666272163,
+ "skip_count": 3.0,
+ "step": 7508,
+ "text_loss": 0.5718557834625244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.00021825473211567665,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12108936.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001638208981603384,
+ "skip_count": 0.0,
+ "step": 7510,
+ "text_loss": 0.4684678316116333
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.00021799908754953468,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12112060.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007894381997175515,
+ "skip_count": 2.0,
+ "step": 7512,
+ "text_loss": 0.5146099328994751
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.00021774355104667455,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12115636.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01400370616465807,
+ "skip_count": 2.0,
+ "step": 7514,
+ "text_loss": 0.19512294232845306
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 35.28646903434106,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00021748812270501805,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12119116.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005261222366243601,
+ "skip_count": 3.0,
+ "step": 7516,
+ "text_loss": 0.17316904664039612
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.295861461696504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 0.0002172328026224459,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12122070.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01021486520767212,
+ "skip_count": 2.0,
+ "step": 7518,
+ "text_loss": 0.2777172029018402
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00021697759089679713,
+ "loss": 0.0056,
+ "macro_f1": 1.0,
+ "num_tokens": 12125386.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005217147525399923,
+ "skip_count": 2.0,
+ "step": 7520,
+ "text_loss": 0.49744322896003723
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00021672248762586948,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12128753.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003868246916681528,
+ "skip_count": 0.0,
+ "step": 7522,
+ "text_loss": 0.4209211468696594
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 35.32403874376284,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00021646749290741895,
+ "loss": 0.009,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 12132425.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.044205982238054276,
+ "skip_count": 3.0,
+ "step": 7524,
+ "text_loss": 0.4180344343185425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.00021621260683916005,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12135740.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032584366854280233,
+ "skip_count": 2.0,
+ "step": 7526,
+ "text_loss": 0.21219655871391296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.00021595782951876552,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12139239.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002418758114799857,
+ "skip_count": 2.0,
+ "step": 7528,
+ "text_loss": 0.40800613164901733
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.35221602582917,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0186767578125,
+ "learning_rate": 0.0002157031610438665,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 12142572.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005265383515506983,
+ "skip_count": 1.0,
+ "step": 7530,
+ "text_loss": 0.7539705634117126
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0002154486015120525,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 12145737.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006648020353168249,
+ "skip_count": 2.0,
+ "step": 7532,
+ "text_loss": 0.7824432253837585
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.371000880540066,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0002151941510208712,
+ "loss": 0.0049,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 12149376.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01692759431898594,
+ "skip_count": 0.0,
+ "step": 7534,
+ "text_loss": 0.4476291239261627
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 35.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0002149398096678283,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12152191.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013883143663406372,
+ "skip_count": 0.0,
+ "step": 7536,
+ "text_loss": 0.14996720850467682
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.058837890625,
+ "learning_rate": 0.00021468557755038826,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 12155084.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009390740655362606,
+ "skip_count": 2.0,
+ "step": 7538,
+ "text_loss": 0.23685340583324432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.3991781626064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0002144314547659731,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12159366.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025363171007484198,
+ "skip_count": 0.0,
+ "step": 7540,
+ "text_loss": 0.6687407493591309
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.00021417744141196315,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12162545.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004230613354593515,
+ "skip_count": 1.0,
+ "step": 7542,
+ "text_loss": 0.24885894358158112
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 35.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 0.00021392353758569694,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12165381.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008058524690568447,
+ "skip_count": 0.0,
+ "step": 7544,
+ "text_loss": 0.15833988785743713
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.427355444672735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0002136697433844707,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12168304.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018041770672425628,
+ "skip_count": 0.0,
+ "step": 7546,
+ "text_loss": 0.6046217083930969
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.00021341605890553894,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 12171040.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008584463968873024,
+ "skip_count": 2.0,
+ "step": 7548,
+ "text_loss": 0.3001522719860077
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.00021316248424611408,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12174702.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010506469989195466,
+ "skip_count": 0.0,
+ "step": 7550,
+ "text_loss": 0.2998376488685608
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0162353515625,
+ "learning_rate": 0.00021290901950336627,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12178388.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012753128539770842,
+ "skip_count": 0.0,
+ "step": 7552,
+ "text_loss": 0.8125656843185425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.00021265566477442384,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12181863.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004343052394688129,
+ "skip_count": 2.0,
+ "step": 7554,
+ "text_loss": 0.14004671573638916
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 35.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.00021240242015637268,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12185485.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0005794052849523723,
+ "skip_count": 0.0,
+ "step": 7556,
+ "text_loss": 0.7116519808769226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.4837100088054,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.00021214928574625664,
+ "loss": 0.0063,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 12188914.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01066325418651104,
+ "skip_count": 0.0,
+ "step": 7558,
+ "text_loss": 0.4664429724216461
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.49310243616085,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00021189626164107718,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12193042.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011769415577873588,
+ "skip_count": 0.0,
+ "step": 7560,
+ "text_loss": 0.672637403011322
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.00021164334793779388,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 12195675.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008653911761939526,
+ "skip_count": 1.0,
+ "step": 7562,
+ "text_loss": 0.5301182866096497
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.00021139054473332357,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12198638.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0058176578022539616,
+ "skip_count": 0.0,
+ "step": 7564,
+ "text_loss": 0.1889677792787552
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.000211137852124541,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12202312.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004154018242843449,
+ "skip_count": 0.0,
+ "step": 7566,
+ "text_loss": 0.3610386848449707
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.53067214558262,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.00021088527020827848,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12205112.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014722816413268447,
+ "skip_count": 0.0,
+ "step": 7568,
+ "text_loss": 0.15214823186397552
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.0002106327990813257,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12208103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015596678713336587,
+ "skip_count": 0.0,
+ "step": 7570,
+ "text_loss": 0.5034125447273254
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 35.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.00021038043884043022,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12211208.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007482443004846573,
+ "skip_count": 0.0,
+ "step": 7572,
+ "text_loss": 0.6760116219520569
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.00021012818958229696,
+ "loss": 0.0031,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12214463.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003875598544254899,
+ "skip_count": 2.0,
+ "step": 7574,
+ "text_loss": 0.3278147876262665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.5682418550044,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.00020987605140358824,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12218199.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007918627932667732,
+ "skip_count": 2.0,
+ "step": 7576,
+ "text_loss": 0.23850615322589874
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.00020962402440092388,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12221151.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005424308590590954,
+ "skip_count": 1.0,
+ "step": 7578,
+ "text_loss": 0.5670642256736755
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0002093721086708812,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 12224789.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0066504343412816525,
+ "skip_count": 1.0,
+ "step": 7580,
+ "text_loss": 0.30404478311538696
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 35.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.00020912030430999452,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12228134.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008815597742795944,
+ "skip_count": 0.0,
+ "step": 7582,
+ "text_loss": 0.32522889971733093
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 35.60581156442618,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.05126953125,
+ "learning_rate": 0.0002088686114147561,
+ "loss": 0.0098,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 12231335.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03785836696624756,
+ "skip_count": 2.0,
+ "step": 7584,
+ "text_loss": 0.6277920603752136
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.61520399178163,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.00020861703008161504,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12234619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016183801926672459,
+ "skip_count": 0.0,
+ "step": 7586,
+ "text_loss": 0.38319316506385803
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.00020836556040697767,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 12237296.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013077575713396072,
+ "skip_count": 1.0,
+ "step": 7588,
+ "text_loss": 0.297571063041687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.00020811420248720769,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12240633.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002858756808564067,
+ "skip_count": 0.0,
+ "step": 7590,
+ "text_loss": 0.2506035268306732
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.000207862956418626,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12244118.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032624071463942528,
+ "skip_count": 1.0,
+ "step": 7592,
+ "text_loss": 0.19843827188014984
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.65277370120341,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056640625,
+ "learning_rate": 0.00020761182229751045,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 12247367.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005885142367333174,
+ "skip_count": 3.0,
+ "step": 7594,
+ "text_loss": 0.3347153067588806
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 35.66216612855885,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.00020736080022009602,
+ "loss": 0.0088,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 12250487.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.021491389721632004,
+ "skip_count": 4.0,
+ "step": 7596,
+ "text_loss": 0.6777212619781494
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 35.671558555914295,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.00020710989028257514,
+ "loss": 0.0061,
+ "macro_f1": 0.6595745086669922,
+ "num_tokens": 12253834.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.014164486899971962,
+ "skip_count": 4.0,
+ "step": 7598,
+ "text_loss": 0.741127610206604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0002068590925810968,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12257289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012773120542988181,
+ "skip_count": 0.0,
+ "step": 7600,
+ "text_loss": 0.5336982607841492
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0002066084072117672,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12260825.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013102042488753796,
+ "skip_count": 2.0,
+ "step": 7602,
+ "text_loss": 0.30410775542259216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.699735837980626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.00020635783427064942,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12264609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002602101070806384,
+ "skip_count": 0.0,
+ "step": 7604,
+ "text_loss": 0.29835572838783264
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00020610737385376348,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12267537.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0053265830501914024,
+ "skip_count": 0.0,
+ "step": 7606,
+ "text_loss": 0.2095658779144287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.71852069269152,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.00020585702605708628,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12271175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000614096992649138,
+ "skip_count": 0.0,
+ "step": 7608,
+ "text_loss": 0.8146751523017883
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.72791312004696,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.00020560679097655137,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12274067.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013201923575252295,
+ "skip_count": 0.0,
+ "step": 7610,
+ "text_loss": 0.40818271040916443
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.73730554740241,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0002053566687080497,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12276946.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004304401110857725,
+ "skip_count": 1.0,
+ "step": 7612,
+ "text_loss": 0.7063660025596619
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.0002051066593474284,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12279760.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032060579396784306,
+ "skip_count": 1.0,
+ "step": 7614,
+ "text_loss": 0.23671887814998627
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00020485676299049154,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12282737.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005103024188429117,
+ "skip_count": 2.0,
+ "step": 7616,
+ "text_loss": 0.17571020126342773
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00020460697973299986,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 12286290.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007189507596194744,
+ "skip_count": 1.0,
+ "step": 7618,
+ "text_loss": 0.30872994661331177
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0002043573096706708,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12289458.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010217712260782719,
+ "skip_count": 0.0,
+ "step": 7620,
+ "text_loss": 0.5155487060546875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0002041077528991784,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12292846.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022399788722395897,
+ "skip_count": 1.0,
+ "step": 7622,
+ "text_loss": 0.717949390411377
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0002038583095141532,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12295673.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018168877577409148,
+ "skip_count": 0.0,
+ "step": 7624,
+ "text_loss": 0.560361385345459
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.00020360897961118246,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12298624.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008487844606861472,
+ "skip_count": 0.0,
+ "step": 7626,
+ "text_loss": 0.6391524076461792
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00020335976328580984,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12302136.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006127831293269992,
+ "skip_count": 0.0,
+ "step": 7628,
+ "text_loss": 0.5932226777076721
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.00020311066063353556,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12305152.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018765819258987904,
+ "skip_count": 0.0,
+ "step": 7630,
+ "text_loss": 0.37831631302833557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00020286167174981618,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12307771.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025384656619280577,
+ "skip_count": 0.0,
+ "step": 7632,
+ "text_loss": 0.34806445240974426
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.8406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.0002026127967300645,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12310921.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008239032700657845,
+ "skip_count": 2.0,
+ "step": 7634,
+ "text_loss": 0.34859901666641235
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00020236403566965027,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12314200.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029505928978323936,
+ "skip_count": 2.0,
+ "step": 7636,
+ "text_loss": 0.2647531032562256
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 35.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0002021153886638991,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12319221.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0014016951899975538,
+ "skip_count": 0.0,
+ "step": 7638,
+ "text_loss": 0.42428603768348694
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 35.86879953037863,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.00020186685580809288,
+ "loss": 0.0059,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 12322204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01761031709611416,
+ "skip_count": 2.0,
+ "step": 7640,
+ "text_loss": 0.25929757952690125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.878191957734074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.00020161843719746997,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12324750.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023674629628658295,
+ "skip_count": 0.0,
+ "step": 7642,
+ "text_loss": 0.567159116268158
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.887584385089525,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0002013701329272248,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12327933.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004534341394901276,
+ "skip_count": 0.0,
+ "step": 7644,
+ "text_loss": 0.4765215516090393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.00020112194309250797,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12330847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003144246758893132,
+ "skip_count": 2.0,
+ "step": 7646,
+ "text_loss": 0.39837369322776794
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.00020087386778842642,
+ "loss": 0.0046,
+ "macro_f1": 1.0,
+ "num_tokens": 12333782.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008137194439768791,
+ "skip_count": 1.0,
+ "step": 7648,
+ "text_loss": 0.42175763845443726
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.00020062590711004296,
+ "loss": 0.0034,
+ "macro_f1": 1.0,
+ "num_tokens": 12336837.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006499455776065588,
+ "skip_count": 1.0,
+ "step": 7650,
+ "text_loss": 0.18695278465747833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.00020037806115237667,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12340414.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001548365456983447,
+ "skip_count": 0.0,
+ "step": 7652,
+ "text_loss": 0.1981094628572464
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00020013033001040255,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12343209.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008136926218867302,
+ "skip_count": 2.0,
+ "step": 7654,
+ "text_loss": 0.2231602668762207
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.00019988271377905165,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12346158.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00370375020429492,
+ "skip_count": 1.0,
+ "step": 7656,
+ "text_loss": 0.4809921383857727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 35.95333137657764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.00019963521255321077,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12349279.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00690054427832365,
+ "skip_count": 3.0,
+ "step": 7658,
+ "text_loss": 0.40473970770835876
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0001993878264277233,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 12352848.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004367961548268795,
+ "skip_count": 1.0,
+ "step": 7660,
+ "text_loss": 0.3646799921989441
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.00019914055549738775,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12356737.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000662159756757319,
+ "skip_count": 0.0,
+ "step": 7662,
+ "text_loss": 0.3703214228153229
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0001988933998569589,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12360085.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023262565955519676,
+ "skip_count": 0.0,
+ "step": 7664,
+ "text_loss": 0.12910836935043335
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0001986463596011473,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12363296.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002686078194528818,
+ "skip_count": 1.0,
+ "step": 7666,
+ "text_loss": 0.39628392457962036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.00019839943482461914,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12366072.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007100159768015146,
+ "skip_count": 1.0,
+ "step": 7668,
+ "text_loss": 0.6588287949562073
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.00939242735544,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00019815262562199648,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12368940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004194926470518112,
+ "skip_count": 0.0,
+ "step": 7670,
+ "text_loss": 0.36411619186401367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0189208984375,
+ "learning_rate": 0.00019790593208785713,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12372031.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041313013061881065,
+ "skip_count": 0.0,
+ "step": 7672,
+ "text_loss": 0.23270413279533386
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 36.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00019765935431673444,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12375115.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003343774238601327,
+ "skip_count": 0.0,
+ "step": 7674,
+ "text_loss": 0.1686355322599411
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 36.03756970942178,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.00019741289240311755,
+ "loss": 0.0058,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 12379089.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021328814327716827,
+ "skip_count": 4.0,
+ "step": 7676,
+ "text_loss": 0.9312577247619629
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00019716654644145104,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12383115.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004511173174250871,
+ "skip_count": 0.0,
+ "step": 7678,
+ "text_loss": 0.3305695056915283
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.00019692031652613522,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12386064.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006190002430230379,
+ "skip_count": 0.0,
+ "step": 7680,
+ "text_loss": 0.4829687178134918
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 36.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.00019667420275152575,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 12389743.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.004575030412524939,
+ "skip_count": 1.0,
+ "step": 7682,
+ "text_loss": 0.5751548409461975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.0001964282052119341,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12392481.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002718796720728278,
+ "skip_count": 0.0,
+ "step": 7684,
+ "text_loss": 0.5349925756454468
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0001961823240016269,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12395207.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027528523933142424,
+ "skip_count": 0.0,
+ "step": 7686,
+ "text_loss": 0.5322592258453369
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 36.09392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.00019593655921482624,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12398232.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008105970919132233,
+ "skip_count": 0.0,
+ "step": 7688,
+ "text_loss": 0.3192061185836792
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 36.10331670090989,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.00019569091094570967,
+ "loss": 0.0069,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 12400862.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.024075545370578766,
+ "skip_count": 1.0,
+ "step": 7690,
+ "text_loss": 0.3189752697944641
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 36.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0001954453792884101,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12404039.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007513802964240313,
+ "skip_count": 3.0,
+ "step": 7692,
+ "text_loss": 0.5985093712806702
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0001951999643370157,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 12407085.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009606506675481796,
+ "skip_count": 2.0,
+ "step": 7694,
+ "text_loss": 0.2050790935754776
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.00019495466618556996,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12411377.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007978329667821527,
+ "skip_count": 0.0,
+ "step": 7696,
+ "text_loss": 0.4705570638179779
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00019470948492807154,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12414427.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010737364646047354,
+ "skip_count": 0.0,
+ "step": 7698,
+ "text_loss": 0.6105324029922485
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.00019446442065847448,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12417442.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001762967323884368,
+ "skip_count": 0.0,
+ "step": 7700,
+ "text_loss": 0.5638618469238281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00019421947347068774,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12420862.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015798417152836919,
+ "skip_count": 0.0,
+ "step": 7702,
+ "text_loss": 0.1939864307641983
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00019397464345857562,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12423876.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005659835878759623,
+ "skip_count": 1.0,
+ "step": 7704,
+ "text_loss": 0.20829300582408905
+ },
+ {
+ "acc_repeat": 0.75,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 36.17845611975345,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.8571428656578064,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.00019372993071595723,
+ "loss": 0.0072,
+ "macro_f1": 0.9449735879898071,
+ "num_tokens": 12427639.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.018665846437215805,
+ "skip_count": 2.0,
+ "step": 7706,
+ "text_loss": 0.47913849353790283
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00019348533533660727,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12431520.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006690093432553113,
+ "skip_count": 0.0,
+ "step": 7708,
+ "text_loss": 0.494870662689209
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00019324085741425511,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12434213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004067352041602135,
+ "skip_count": 1.0,
+ "step": 7710,
+ "text_loss": 0.7631711959838867
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 36.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.00019299649704258504,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12437437.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01157623715698719,
+ "skip_count": 0.0,
+ "step": 7712,
+ "text_loss": 0.3145926296710968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.21602582917523,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0001927522543152364,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12440507.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001888492377474904,
+ "skip_count": 0.0,
+ "step": 7714,
+ "text_loss": 0.576301097869873
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00019250812932580352,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12443484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00042988534551113844,
+ "skip_count": 0.0,
+ "step": 7716,
+ "text_loss": 0.5716445446014404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.00019226412216783557,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12446460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005063199903815985,
+ "skip_count": 1.0,
+ "step": 7718,
+ "text_loss": 0.2700924873352051
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0001920202329348365,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12449346.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010775640839710832,
+ "skip_count": 0.0,
+ "step": 7720,
+ "text_loss": 0.5162558555603027
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.00019177646172026513,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12452680.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014514096546918154,
+ "skip_count": 0.0,
+ "step": 7722,
+ "text_loss": 0.5753642916679382
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0177001953125,
+ "learning_rate": 0.00019153280861753497,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12455348.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002202774863690138,
+ "skip_count": 1.0,
+ "step": 7724,
+ "text_loss": 0.5751997232437134
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00019128927372001454,
+ "loss": 0.0032,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12458098.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005171069409698248,
+ "skip_count": 0.0,
+ "step": 7726,
+ "text_loss": 0.22252975404262543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00019104585712102678,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12460958.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041033923625946045,
+ "skip_count": 0.0,
+ "step": 7728,
+ "text_loss": 0.18611937761306763
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 36.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.00019080255891384945,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12463596.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0012201941572129726,
+ "skip_count": 0.0,
+ "step": 7730,
+ "text_loss": 0.47347909212112427
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 36.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0001905593791917148,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 12467021.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005837214644998312,
+ "skip_count": 2.0,
+ "step": 7732,
+ "text_loss": 0.2055564969778061
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.00019031631804780974,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12469743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010269953636452556,
+ "skip_count": 0.0,
+ "step": 7734,
+ "text_loss": 0.45995602011680603
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00019007337557527582,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12473082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00436213007196784,
+ "skip_count": 1.0,
+ "step": 7736,
+ "text_loss": 0.4515823721885681
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.00018983055186720888,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12476100.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003051829058676958,
+ "skip_count": 2.0,
+ "step": 7738,
+ "text_loss": 0.12298467755317688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0001895878470166597,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12480231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008164191618561745,
+ "skip_count": 2.0,
+ "step": 7740,
+ "text_loss": 0.17456457018852234
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.347519812151454,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 0.00018934526111663314,
+ "loss": 0.0069,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 12483894.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008653721772134304,
+ "skip_count": 1.0,
+ "step": 7742,
+ "text_loss": 0.7125775814056396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 36.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.00018910279426008857,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12488077.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005024447571486235,
+ "skip_count": 6.0,
+ "step": 7744,
+ "text_loss": 0.833778977394104
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.00018886044653993966,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12490999.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002690888475626707,
+ "skip_count": 0.0,
+ "step": 7746,
+ "text_loss": 0.15594039857387543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00018861821804905466,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12494765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006087568122893572,
+ "skip_count": 0.0,
+ "step": 7748,
+ "text_loss": 0.2696777880191803
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.385089521573235,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00018837610888025586,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12497741.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014629303477704525,
+ "skip_count": 0.0,
+ "step": 7750,
+ "text_loss": 0.6801294684410095
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11865234375,
+ "learning_rate": 0.00018813411912631996,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12500585.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001163579523563385,
+ "skip_count": 0.0,
+ "step": 7752,
+ "text_loss": 0.41069695353507996
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 36.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.00018789224887997796,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12503579.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009436148218810558,
+ "skip_count": 0.0,
+ "step": 7754,
+ "text_loss": 0.6993107795715332
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.00018765049823391472,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 12506698.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002098206663504243,
+ "skip_count": 2.0,
+ "step": 7756,
+ "text_loss": 0.5704247951507568
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00018740886728077,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12509869.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002066673245280981,
+ "skip_count": 1.0,
+ "step": 7758,
+ "text_loss": 0.7605635523796082
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.43205165835045,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.00018716735611313707,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12513433.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023439819924533367,
+ "skip_count": 1.0,
+ "step": 7760,
+ "text_loss": 0.4746153950691223
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.441444085705896,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.00018692596482356333,
+ "loss": 0.0057,
+ "macro_f1": 0.9255813956260681,
+ "num_tokens": 12516817.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.039019811898469925,
+ "skip_count": 4.0,
+ "step": 7762,
+ "text_loss": 0.3105330467224121
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.00018668469350455048,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12519357.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002269966993480921,
+ "skip_count": 0.0,
+ "step": 7764,
+ "text_loss": 0.3700210452079773
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00018644354224855414,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12522072.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001265842467546463,
+ "skip_count": 0.0,
+ "step": 7766,
+ "text_loss": 0.6737633943557739
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00018620251114798386,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12524999.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006547329016029835,
+ "skip_count": 1.0,
+ "step": 7768,
+ "text_loss": 0.24906545877456665
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.47901379512768,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0001859616002952033,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 12527785.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010791841894388199,
+ "skip_count": 3.0,
+ "step": 7770,
+ "text_loss": 0.3069820702075958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.0001857208097825299,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12530801.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00492103723809123,
+ "skip_count": 2.0,
+ "step": 7772,
+ "text_loss": 0.2524295151233673
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0001854801397022351,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12533919.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001942967064678669,
+ "skip_count": 0.0,
+ "step": 7774,
+ "text_loss": 0.7855241894721985
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 36.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00018523959014654407,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 12537265.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00987488217651844,
+ "skip_count": 2.0,
+ "step": 7776,
+ "text_loss": 0.2767317593097687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.00018499916120763582,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12539695.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0054283770732581615,
+ "skip_count": 1.0,
+ "step": 7778,
+ "text_loss": 0.43287888169288635
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 36.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00018475885297764306,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12542881.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00797359924763441,
+ "skip_count": 0.0,
+ "step": 7780,
+ "text_loss": 0.3738224506378174
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0001845186655486527,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12546530.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0045951665379107,
+ "skip_count": 0.0,
+ "step": 7782,
+ "text_loss": 0.2511517107486725
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 36.54476078661579,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00018427859901270482,
+ "loss": 0.0055,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 12549439.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02312052994966507,
+ "skip_count": 4.0,
+ "step": 7784,
+ "text_loss": 0.3837030827999115
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 36.55415321397123,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.059814453125,
+ "learning_rate": 0.00018403865346179344,
+ "loss": 0.0066,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 12553211.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.014698561280965805,
+ "skip_count": 3.0,
+ "step": 7786,
+ "text_loss": 0.510159432888031
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 36.563545641326684,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.00018379882898786603,
+ "loss": 0.0075,
+ "macro_f1": 0.8803418874740601,
+ "num_tokens": 12556497.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.023926246911287308,
+ "skip_count": 7.0,
+ "step": 7788,
+ "text_loss": 0.44811317324638367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.57293806868213,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.00018355912568282384,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12559778.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011187797645106912,
+ "skip_count": 0.0,
+ "step": 7790,
+ "text_loss": 0.32099616527557373
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00018331954363852166,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12562610.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005356677575036883,
+ "skip_count": 0.0,
+ "step": 7792,
+ "text_loss": 0.9754356145858765
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 36.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 0.0001830800829467677,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12565886.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0017101728590205312,
+ "skip_count": 0.0,
+ "step": 7794,
+ "text_loss": 0.4234761595726013
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.00018284074369932386,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12568728.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012841494753956795,
+ "skip_count": 0.0,
+ "step": 7796,
+ "text_loss": 0.41109147667884827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0001826015259879053,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12572231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022388407960534096,
+ "skip_count": 0.0,
+ "step": 7798,
+ "text_loss": 0.5459926128387451
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.00018236242990418074,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12574968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019992550369352102,
+ "skip_count": 0.0,
+ "step": 7800,
+ "text_loss": 0.5028481483459473
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.629292632814796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.0001821234555397722,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12579074.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002936388598755002,
+ "skip_count": 2.0,
+ "step": 7802,
+ "text_loss": 0.2377086579799652
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 36.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.00018188460298625503,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12581912.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0026762608904391527,
+ "skip_count": 0.0,
+ "step": 7804,
+ "text_loss": 0.13887254893779755
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 36.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00018164587233515824,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 12585020.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.003901638789102435,
+ "skip_count": 1.0,
+ "step": 7806,
+ "text_loss": 0.35454171895980835
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.00018140726367796373,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12588310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031358697451651096,
+ "skip_count": 2.0,
+ "step": 7808,
+ "text_loss": 0.3567306697368622
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.00018116877710610673,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12591735.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002310588024556637,
+ "skip_count": 1.0,
+ "step": 7810,
+ "text_loss": 0.45357072353363037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.00018093041271097582,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12595232.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005600228440016508,
+ "skip_count": 2.0,
+ "step": 7812,
+ "text_loss": 0.4179847836494446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.685647196947464,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.00018069217058391267,
+ "loss": 0.006,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 12598367.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04015933722257614,
+ "skip_count": 1.0,
+ "step": 7814,
+ "text_loss": 0.17874565720558167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.00018045405081621214,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12601864.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005119446665048599,
+ "skip_count": 1.0,
+ "step": 7816,
+ "text_loss": 0.6867854595184326
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00018021605349912207,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12605268.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005990012432448566,
+ "skip_count": 0.0,
+ "step": 7818,
+ "text_loss": 0.9084970355033875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.00017997817872384358,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12608093.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008712377399206161,
+ "skip_count": 1.0,
+ "step": 7820,
+ "text_loss": 0.19413328170776367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.00017974042658153066,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12611001.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007535711396485567,
+ "skip_count": 1.0,
+ "step": 7822,
+ "text_loss": 0.2672932744026184
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.73260933372468,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0001795027971632905,
+ "loss": 0.0042,
+ "macro_f1": 1.0,
+ "num_tokens": 12614584.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006770546548068523,
+ "skip_count": 3.0,
+ "step": 7824,
+ "text_loss": 0.22805163264274597
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0189208984375,
+ "learning_rate": 0.00017926529056018297,
+ "loss": 0.0031,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12617519.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010458873584866524,
+ "skip_count": 0.0,
+ "step": 7826,
+ "text_loss": 0.385499507188797
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 36.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.00017902790686322102,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12621566.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00634258147329092,
+ "skip_count": 0.0,
+ "step": 7828,
+ "text_loss": 0.8044118285179138
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 36.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.00017879064616337076,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12624751.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0053052278235554695,
+ "skip_count": 3.0,
+ "step": 7830,
+ "text_loss": 0.264322966337204
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.77017904314646,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 0.00017855350855155088,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12628478.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028291696216911077,
+ "skip_count": 0.0,
+ "step": 7832,
+ "text_loss": 0.20611460506916046
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00017831649411863287,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12632027.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009586421074345708,
+ "skip_count": 1.0,
+ "step": 7834,
+ "text_loss": 0.4119716286659241
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00017807960295544118,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12635144.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012304541654884815,
+ "skip_count": 2.0,
+ "step": 7836,
+ "text_loss": 0.28647977113723755
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0001778428351527529,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12638719.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005212076939642429,
+ "skip_count": 2.0,
+ "step": 7838,
+ "text_loss": 0.630459189414978
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0001776061908012979,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12642119.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00183707510586828,
+ "skip_count": 0.0,
+ "step": 7840,
+ "text_loss": 0.5905961990356445
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 36.81714117992369,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0001773696699917588,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12645077.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0058263009414076805,
+ "skip_count": 0.0,
+ "step": 7842,
+ "text_loss": 0.41949576139450073
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.00017713327281477077,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12648964.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001586507773026824,
+ "skip_count": 0.0,
+ "step": 7844,
+ "text_loss": 0.5048848390579224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.00017689699936092163,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12651934.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002397194504737854,
+ "skip_count": 0.0,
+ "step": 7846,
+ "text_loss": 0.23879878222942352
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 36.84531846199002,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.0001766608497207518,
+ "loss": 0.0054,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 12654907.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016742069274187088,
+ "skip_count": 2.0,
+ "step": 7848,
+ "text_loss": 0.23400072753429413
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0001764248239847544,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12658765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007037387229502201,
+ "skip_count": 2.0,
+ "step": 7850,
+ "text_loss": 0.26165497303009033
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 36.86410331670091,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.017822265625,
+ "learning_rate": 0.00017618892224337463,
+ "loss": 0.0044,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 12662024.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017352160066366196,
+ "skip_count": 2.0,
+ "step": 7852,
+ "text_loss": 0.23813043534755707
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 36.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.00017595314458701084,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12665751.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005349365528672934,
+ "skip_count": 3.0,
+ "step": 7854,
+ "text_loss": 0.14920757710933685
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00017571749110601337,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12668823.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037689812015742064,
+ "skip_count": 2.0,
+ "step": 7856,
+ "text_loss": 0.2198697030544281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00017548196189068506,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12672367.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006363615393638611,
+ "skip_count": 0.0,
+ "step": 7858,
+ "text_loss": 0.5338839888572693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00017524655703128112,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12675217.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002691479865461588,
+ "skip_count": 0.0,
+ "step": 7860,
+ "text_loss": 0.17463763058185577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00017501127661800908,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12678796.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002262329449877143,
+ "skip_count": 0.0,
+ "step": 7862,
+ "text_loss": 0.4637797474861145
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.92045788083358,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.00017477612074102899,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12681631.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00115531450137496,
+ "skip_count": 0.0,
+ "step": 7864,
+ "text_loss": 0.6089238524436951
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00017454108949045295,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12685647.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00260268640704453,
+ "skip_count": 0.0,
+ "step": 7866,
+ "text_loss": 0.5876018404960632
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.93924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.00017430618295634514,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12688995.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002731681102886796,
+ "skip_count": 0.0,
+ "step": 7868,
+ "text_loss": 0.35076001286506653
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 36.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.00017407140122872262,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 12692100.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003314645728096366,
+ "skip_count": 1.0,
+ "step": 7870,
+ "text_loss": 0.5313478112220764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.958027590255355,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00017383674439755393,
+ "loss": 0.0069,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 12695117.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010385016910731792,
+ "skip_count": 1.0,
+ "step": 7872,
+ "text_loss": 0.5092368125915527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00017360221255276016,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12697678.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001273582922294736,
+ "skip_count": 0.0,
+ "step": 7874,
+ "text_loss": 0.5282881855964661
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.00017336780578421418,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12702132.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007510313298553228,
+ "skip_count": 0.0,
+ "step": 7876,
+ "text_loss": 0.49093571305274963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.0001731335241817412,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12705413.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005138787440955639,
+ "skip_count": 2.0,
+ "step": 7878,
+ "text_loss": 0.7503541111946106
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 36.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0001728993678351184,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12708310.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.004379773512482643,
+ "skip_count": 0.0,
+ "step": 7880,
+ "text_loss": 0.5942456126213074
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0001726653368340747,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12711043.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005271450616419315,
+ "skip_count": 2.0,
+ "step": 7882,
+ "text_loss": 0.348360538482666
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 37.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00017243143126829163,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 12714473.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015764752170071006,
+ "skip_count": 1.0,
+ "step": 7884,
+ "text_loss": 0.45971861481666565
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.000172197651227402,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12717832.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00040649910806678236,
+ "skip_count": 0.0,
+ "step": 7886,
+ "text_loss": 0.5996841788291931
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00017196399680099078,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12720479.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00473182974383235,
+ "skip_count": 2.0,
+ "step": 7888,
+ "text_loss": 0.40346208214759827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00017173046807859483,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12723104.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020138369873166084,
+ "skip_count": 0.0,
+ "step": 7890,
+ "text_loss": 0.6878634095191956
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.05165835045494,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0001714970651497027,
+ "loss": 0.005,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 12725967.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008381367661058903,
+ "skip_count": 1.0,
+ "step": 7892,
+ "text_loss": 0.9161711931228638
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 37.061050777810394,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 0.00017126378810375498,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12728819.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037658829241991043,
+ "skip_count": 0.0,
+ "step": 7894,
+ "text_loss": 0.4447716772556305
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00017103063703014372,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12731806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022742559667676687,
+ "skip_count": 0.0,
+ "step": 7896,
+ "text_loss": 0.9140825867652893
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.00017079761201821298,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12734649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002157264854758978,
+ "skip_count": 0.0,
+ "step": 7898,
+ "text_loss": 0.268303781747818
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 37.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.0001705647131572583,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 12737889.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01064873393625021,
+ "skip_count": 1.0,
+ "step": 7900,
+ "text_loss": 0.36009490489959717
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 37.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00017033194053652685,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12740821.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0062920586206018925,
+ "skip_count": 0.0,
+ "step": 7902,
+ "text_loss": 0.5301805138587952
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 37.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.00017009929424521782,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 12743876.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0033694824669510126,
+ "skip_count": 1.0,
+ "step": 7904,
+ "text_loss": 1.026949167251587
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.117405341943055,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.00016986677437248155,
+ "loss": 0.0071,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 12747623.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.05076088383793831,
+ "skip_count": 3.0,
+ "step": 7906,
+ "text_loss": 0.33465588092803955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.126797769298506,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.00016963438100742014,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12751255.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005921403644606471,
+ "skip_count": 0.0,
+ "step": 7908,
+ "text_loss": 0.3498881757259369
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.00016940211423908713,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12754297.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004132566973567009,
+ "skip_count": 0.0,
+ "step": 7910,
+ "text_loss": 0.2874198853969574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0001691699741564876,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12756969.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024724705144762993,
+ "skip_count": 1.0,
+ "step": 7912,
+ "text_loss": 0.10593545436859131
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00016893796084857806,
+ "loss": 0.0031,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12760261.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002991671208292246,
+ "skip_count": 0.0,
+ "step": 7914,
+ "text_loss": 0.1331545114517212
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 0.00016870607440426643,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12762971.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018167285015806556,
+ "skip_count": 0.0,
+ "step": 7916,
+ "text_loss": 0.496826171875
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 37.17375990607572,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00016847431491241207,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12765949.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0033364067785441875,
+ "skip_count": 0.0,
+ "step": 7918,
+ "text_loss": 0.43522849678993225
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.0001682426824618256,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12769201.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001313596498221159,
+ "skip_count": 0.0,
+ "step": 7920,
+ "text_loss": 0.8691539168357849
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.19254476078662,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.00016801117714126908,
+ "loss": 0.0108,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 12773308.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02579287625849247,
+ "skip_count": 1.0,
+ "step": 7922,
+ "text_loss": 0.275301069021225
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.00016777979903945568,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12776166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010501758195459843,
+ "skip_count": 1.0,
+ "step": 7924,
+ "text_loss": 0.32124993205070496
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.0001675485482450499,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12779965.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0063389060087502,
+ "skip_count": 2.0,
+ "step": 7926,
+ "text_loss": 0.2527695894241333
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00016731742484666774,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12783019.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002796935848891735,
+ "skip_count": 0.0,
+ "step": 7928,
+ "text_loss": 0.18767669796943665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0001670864289328759,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12786291.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007973561994731426,
+ "skip_count": 2.0,
+ "step": 7930,
+ "text_loss": 0.29628485441207886
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.00016685556059219253,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 12789566.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.011405733413994312,
+ "skip_count": 6.0,
+ "step": 7932,
+ "text_loss": 0.16635073721408844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.248899324919286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00016662481991308682,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12792533.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012368770549073815,
+ "skip_count": 1.0,
+ "step": 7934,
+ "text_loss": 0.4196353852748871
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.000166394206983979,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12795619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036002211272716522,
+ "skip_count": 1.0,
+ "step": 7936,
+ "text_loss": 0.17559808492660522
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 37.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00016616372189324035,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12799702.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0039332108572125435,
+ "skip_count": 0.0,
+ "step": 7938,
+ "text_loss": 0.603410542011261
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00016593336472919324,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12802704.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008303318754769862,
+ "skip_count": 0.0,
+ "step": 7940,
+ "text_loss": 0.5331749320030212
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.28646903434106,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.00016570313558011098,
+ "loss": 0.0058,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 12805630.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05092398822307587,
+ "skip_count": 2.0,
+ "step": 7942,
+ "text_loss": 0.17398510873317719
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.295861461696504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.00016547303453421774,
+ "loss": 0.0031,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12809065.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006886976188980043,
+ "skip_count": 0.0,
+ "step": 7944,
+ "text_loss": 0.3419797718524933
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.00016524306167968878,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 12812641.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005634502973407507,
+ "skip_count": 3.0,
+ "step": 7946,
+ "text_loss": 0.5877651572227478
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.00016501321710465005,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12815527.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020598487462848425,
+ "skip_count": 0.0,
+ "step": 7948,
+ "text_loss": 0.3558528423309326
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 0.0001647835008971783,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12819103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005946476943790913,
+ "skip_count": 2.0,
+ "step": 7950,
+ "text_loss": 0.5800213813781738
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.00016455391314530154,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12822423.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010360358282923698,
+ "skip_count": 2.0,
+ "step": 7952,
+ "text_loss": 0.278255820274353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.00016432445393699802,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12826180.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003017681185156107,
+ "skip_count": 0.0,
+ "step": 7954,
+ "text_loss": 0.1571389138698578
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.35221602582917,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00016409512336019698,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12829196.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008854938205331564,
+ "skip_count": 0.0,
+ "step": 7956,
+ "text_loss": 0.2776578366756439
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.00016386592150277834,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12831983.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023990103509277105,
+ "skip_count": 0.0,
+ "step": 7958,
+ "text_loss": 0.46686989068984985
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 37.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0001636368484525727,
+ "loss": 0.0035,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12834889.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009835032746195793,
+ "skip_count": 5.0,
+ "step": 7960,
+ "text_loss": 0.22224856913089752
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.00016340790429736118,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12837950.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018618656322360039,
+ "skip_count": 0.0,
+ "step": 7962,
+ "text_loss": 0.5101882815361023
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 37.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.00016317908912487578,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 12840981.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001275144051760435,
+ "skip_count": 1.0,
+ "step": 7964,
+ "text_loss": 0.40567103028297424
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.3991781626064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.00016295040302279873,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12844044.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003117429558187723,
+ "skip_count": 2.0,
+ "step": 7966,
+ "text_loss": 0.6888198852539062
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.00016272184607876312,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 12847350.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006585797294974327,
+ "skip_count": 4.0,
+ "step": 7968,
+ "text_loss": 0.19813506305217743
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 37.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0001624934183803523,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 12850285.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0043576788157224655,
+ "skip_count": 1.0,
+ "step": 7970,
+ "text_loss": 0.6108269691467285
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 37.427355444672735,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00016226512001510024,
+ "loss": 0.0039,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 12853993.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011879517696797848,
+ "skip_count": 2.0,
+ "step": 7972,
+ "text_loss": 0.42478689551353455
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00016203695107049117,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12857022.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016375730047002435,
+ "skip_count": 0.0,
+ "step": 7974,
+ "text_loss": 0.5130020976066589
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0001618089116339601,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12860764.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006649247952736914,
+ "skip_count": 0.0,
+ "step": 7976,
+ "text_loss": 1.0629136562347412
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.455532726739065,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.00016158100179289208,
+ "loss": 0.0062,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 12864066.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03140667825937271,
+ "skip_count": 1.0,
+ "step": 7978,
+ "text_loss": 0.4241345226764679
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 37.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0001613532216346226,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12867555.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010257012210786343,
+ "skip_count": 4.0,
+ "step": 7980,
+ "text_loss": 0.6085613369941711
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0001611255712464374,
+ "loss": 0.0037,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12871415.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00783725269138813,
+ "skip_count": 1.0,
+ "step": 7982,
+ "text_loss": 0.15661844611167908
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.017578125,
+ "learning_rate": 0.00016089805071557256,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 12874195.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0027650597039610147,
+ "skip_count": 2.0,
+ "step": 7984,
+ "text_loss": 0.4938865005970001
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.49310243616085,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.00016067066012921439,
+ "loss": 0.0083,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 12878084.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04647083953022957,
+ "skip_count": 0.0,
+ "step": 7986,
+ "text_loss": 0.2973119020462036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 0.00016044339957449938,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12881182.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002192265819758177,
+ "skip_count": 0.0,
+ "step": 7988,
+ "text_loss": 0.2623208165168762
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.00016021626913851418,
+ "loss": 0.0031,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12884028.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023096329532563686,
+ "skip_count": 0.0,
+ "step": 7990,
+ "text_loss": 0.3752247989177704
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.52127971822718,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.00015998926890829562,
+ "loss": 0.0046,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 12887759.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03038526326417923,
+ "skip_count": 1.0,
+ "step": 7992,
+ "text_loss": 0.2609226405620575
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.53067214558262,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0001597623989708306,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12890976.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015199477784335613,
+ "skip_count": 0.0,
+ "step": 7994,
+ "text_loss": 0.6512867212295532
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00015953565941305615,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12894112.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024166766088455915,
+ "skip_count": 0.0,
+ "step": 7996,
+ "text_loss": 0.5539866089820862
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.0001593090503218591,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 12896857.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005081235896795988,
+ "skip_count": 2.0,
+ "step": 7998,
+ "text_loss": 0.6631022691726685
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 37.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.00015908257178407682,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12900075.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024711282458156347,
+ "skip_count": 0.0,
+ "step": 8000,
+ "text_loss": 0.3309785723686218
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.5682418550044,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.00015885622388649617,
+ "loss": 0.0059,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 12903845.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04024988412857056,
+ "skip_count": 2.0,
+ "step": 8002,
+ "text_loss": 0.2384071946144104
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 0.00015863000671585405,
+ "loss": 0.008,
+ "macro_f1": 1.0,
+ "num_tokens": 12907694.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001953886589035392,
+ "skip_count": 2.0,
+ "step": 8004,
+ "text_loss": 0.5001366138458252
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.00015840392035883726,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12910871.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002982128644362092,
+ "skip_count": 2.0,
+ "step": 8006,
+ "text_loss": 0.2589346170425415
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0001581779649020827,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12914484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009384988807141781,
+ "skip_count": 0.0,
+ "step": 8008,
+ "text_loss": 0.5727795362472534
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.00015795214043217654,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12917480.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008854437619447708,
+ "skip_count": 2.0,
+ "step": 8010,
+ "text_loss": 0.24354904890060425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.61520399178163,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.00015772644703565563,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12920383.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001689503900706768,
+ "skip_count": 0.0,
+ "step": 8012,
+ "text_loss": 0.5372336506843567
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.00015750088479900588,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12923886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002284591319039464,
+ "skip_count": 0.0,
+ "step": 8014,
+ "text_loss": 0.1708722710609436
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 37.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.00015727545380866316,
+ "loss": 0.0042,
+ "macro_f1": 1.0,
+ "num_tokens": 12926998.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004594483878463507,
+ "skip_count": 4.0,
+ "step": 8016,
+ "text_loss": 0.26784324645996094
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 37.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0001570501541510131,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12929726.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0021998141892254353,
+ "skip_count": 0.0,
+ "step": 8018,
+ "text_loss": 0.8051869869232178
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.65277370120341,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00015682498591239086,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12932182.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032623414881527424,
+ "skip_count": 1.0,
+ "step": 8020,
+ "text_loss": 0.8431181907653809
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00015659994917908144,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12935338.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014909361489117146,
+ "skip_count": 1.0,
+ "step": 8022,
+ "text_loss": 0.6168642640113831
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0001563750440373191,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12938484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010295510292053223,
+ "skip_count": 0.0,
+ "step": 8024,
+ "text_loss": 0.2694014608860016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 37.68095098326974,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00015615027057328828,
+ "loss": 0.0066,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 12942045.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018341995775699615,
+ "skip_count": 2.0,
+ "step": 8026,
+ "text_loss": 0.8151478171348572
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 37.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0001559256288731224,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 12945547.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0023289949167519808,
+ "skip_count": 1.0,
+ "step": 8028,
+ "text_loss": 0.613464891910553
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.699735837980626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.00015570111902290463,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12949544.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006635872647166252,
+ "skip_count": 2.0,
+ "step": 8030,
+ "text_loss": 0.17417465150356293
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 37.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.00015547674110866756,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 12952838.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006023989990353584,
+ "skip_count": 1.0,
+ "step": 8032,
+ "text_loss": 0.4801837205886841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.71852069269152,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.00015525249521639319,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12956329.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005706884432584047,
+ "skip_count": 0.0,
+ "step": 8034,
+ "text_loss": 0.2028084248304367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.72791312004696,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.000155028381432013,
+ "loss": 0.0034,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12959122.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003527123713865876,
+ "skip_count": 2.0,
+ "step": 8036,
+ "text_loss": 0.39474430680274963
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.73730554740241,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0179443359375,
+ "learning_rate": 0.00015480439984140776,
+ "loss": 0.0029,
+ "macro_f1": 1.0,
+ "num_tokens": 12962546.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010415437631309032,
+ "skip_count": 2.0,
+ "step": 8038,
+ "text_loss": 0.20412345230579376
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0001545805505304077,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12965861.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001566931139677763,
+ "skip_count": 0.0,
+ "step": 8040,
+ "text_loss": 0.5129821300506592
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 37.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.0001543568335847923,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12968677.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.0037196793127804995,
+ "skip_count": 0.0,
+ "step": 8042,
+ "text_loss": 0.755020260810852
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.00015413324909029031,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12972001.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010940275387838483,
+ "skip_count": 0.0,
+ "step": 8044,
+ "text_loss": 0.48672133684158325
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.00015390979713257968,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12974765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011106903664767742,
+ "skip_count": 1.0,
+ "step": 8046,
+ "text_loss": 0.1727766990661621
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 25.0,
+ "epoch": 37.78426768417963,
+ "f1_execute": 0.949999988079071,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.00015368647779728757,
+ "loss": 0.006,
+ "macro_f1": 0.886363685131073,
+ "num_tokens": 12979127.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.05134248360991478,
+ "skip_count": 6.0,
+ "step": 8048,
+ "text_loss": 0.33233317732810974
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.00015346329116999057,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12982812.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027500339783728123,
+ "skip_count": 0.0,
+ "step": 8050,
+ "text_loss": 0.8176849484443665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.80305253889052,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.00015324023733621412,
+ "loss": 0.005,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 12985740.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030734945088624954,
+ "skip_count": 2.0,
+ "step": 8052,
+ "text_loss": 0.38721024990081787
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.00015301731638143285,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12988646.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002358534839004278,
+ "skip_count": 2.0,
+ "step": 8054,
+ "text_loss": 0.5656245946884155
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.060791015625,
+ "learning_rate": 0.0001527945283910705,
+ "loss": 0.0074,
+ "macro_f1": 1.0,
+ "num_tokens": 12991518.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.007991814985871315,
+ "skip_count": 3.0,
+ "step": 8056,
+ "text_loss": 0.26438817381858826
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 37.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.00015257187345049983,
+ "loss": 0.0079,
+ "macro_f1": 1.0,
+ "num_tokens": 12994847.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011761264875531197,
+ "skip_count": 1.0,
+ "step": 8058,
+ "text_loss": 0.1801673173904419
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 37.8406222483123,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 0.0001523493516450427,
+ "loss": 0.004,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 12997874.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.021669765934348106,
+ "skip_count": 2.0,
+ "step": 8060,
+ "text_loss": 0.3278379738330841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0001521269630599698,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13000504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002388916676864028,
+ "skip_count": 0.0,
+ "step": 8062,
+ "text_loss": 0.5396623611450195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00015190470778050086,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13003620.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007719808723777533,
+ "skip_count": 1.0,
+ "step": 8064,
+ "text_loss": 0.1989232450723648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00015168258589180462,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13007410.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007461659261025488,
+ "skip_count": 0.0,
+ "step": 8066,
+ "text_loss": 0.5293997526168823
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 37.878191957734074,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.00015146059747899848,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13010240.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005515575874596834,
+ "skip_count": 0.0,
+ "step": 8068,
+ "text_loss": 0.2776186466217041
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.887584385089525,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00015123874262714892,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13012728.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026730166282504797,
+ "skip_count": 0.0,
+ "step": 8070,
+ "text_loss": 0.5902766585350037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04833984375,
+ "learning_rate": 0.00015101702142127088,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13015616.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002244985429570079,
+ "skip_count": 0.0,
+ "step": 8072,
+ "text_loss": 0.21447396278381348
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.00015079543394632878,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13019846.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001963787479326129,
+ "skip_count": 0.0,
+ "step": 8074,
+ "text_loss": 0.22974267601966858
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 37.915761667155856,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.00015057398028723513,
+ "loss": 0.0064,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 13023036.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02271878905594349,
+ "skip_count": 2.0,
+ "step": 8076,
+ "text_loss": 0.26458361744880676
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.00015035266052885137,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13025840.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011732397833839059,
+ "skip_count": 0.0,
+ "step": 8078,
+ "text_loss": 0.44129177927970886
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0001501314747559877,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 13030031.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015655985102057457,
+ "skip_count": 2.0,
+ "step": 8080,
+ "text_loss": 0.28889161348342896
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.00014991042305340286,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13033603.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012988687958568335,
+ "skip_count": 0.0,
+ "step": 8082,
+ "text_loss": 0.16362667083740234
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.95333137657764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00014968950550580434,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13036931.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002425852930173278,
+ "skip_count": 0.0,
+ "step": 8084,
+ "text_loss": 0.35900676250457764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0001494687221978482,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13040637.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004092676565051079,
+ "skip_count": 1.0,
+ "step": 8086,
+ "text_loss": 0.20662656426429749
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00014924807321413893,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13043855.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009040542645379901,
+ "skip_count": 0.0,
+ "step": 8088,
+ "text_loss": 0.30341213941574097
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0001490275586392296,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13046903.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019248841563239694,
+ "skip_count": 0.0,
+ "step": 8090,
+ "text_loss": 0.4299648702144623
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.000148807178557622,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13050219.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008314658771269023,
+ "skip_count": 0.0,
+ "step": 8092,
+ "text_loss": 0.4521652162075043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00014858693305376598,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13053076.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007470731507055461,
+ "skip_count": 0.0,
+ "step": 8094,
+ "text_loss": 0.46265852451324463
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 38.00939242735544,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.00014836682221206,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13056170.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003292408073320985,
+ "skip_count": 0.0,
+ "step": 8096,
+ "text_loss": 0.6483868956565857
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.00014814684611685124,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13059181.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001357200788334012,
+ "skip_count": 0.0,
+ "step": 8098,
+ "text_loss": 0.43141183257102966
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0177001953125,
+ "learning_rate": 0.00014792700485243476,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13062124.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030062920413911343,
+ "skip_count": 0.0,
+ "step": 8100,
+ "text_loss": 0.26022693514823914
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0001477072985030542,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13065273.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006919128354638815,
+ "skip_count": 0.0,
+ "step": 8102,
+ "text_loss": 0.5927232503890991
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.00014748772715290144,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13068346.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005062389187514782,
+ "skip_count": 0.0,
+ "step": 8104,
+ "text_loss": 0.1255214959383011
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 0.00014726829088611664,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13071384.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005492564523592591,
+ "skip_count": 0.0,
+ "step": 8106,
+ "text_loss": 0.6445038914680481
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.00014704898978678817,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13074667.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002470226027071476,
+ "skip_count": 0.0,
+ "step": 8108,
+ "text_loss": 0.5019628405570984
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.00014682982393895256,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13077566.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008262090268544853,
+ "skip_count": 0.0,
+ "step": 8110,
+ "text_loss": 0.6075460314750671
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 0.00014661079342659467,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13081042.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00034181721275672317,
+ "skip_count": 0.0,
+ "step": 8112,
+ "text_loss": 0.7349393963813782
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.09392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0001463918983336474,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 13084151.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01406828872859478,
+ "skip_count": 2.0,
+ "step": 8114,
+ "text_loss": 0.3122454285621643
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017822265625,
+ "learning_rate": 0.00014617313874399173,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13086998.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002714085392653942,
+ "skip_count": 0.0,
+ "step": 8116,
+ "text_loss": 0.6545852422714233
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00014595451474145677,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13090017.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0073202489875257015,
+ "skip_count": 0.0,
+ "step": 8118,
+ "text_loss": 0.5487201809883118
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.00014573602640981947,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13093651.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000667977670673281,
+ "skip_count": 0.0,
+ "step": 8120,
+ "text_loss": 0.672166109085083
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00014551767383280535,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13097139.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020584615413099527,
+ "skip_count": 0.0,
+ "step": 8122,
+ "text_loss": 0.1996239423751831
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 38.14088641033167,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.00014529945709408726,
+ "loss": 0.0069,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 13100493.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013855135068297386,
+ "skip_count": 3.0,
+ "step": 8124,
+ "text_loss": 0.4099486768245697
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0001450813762772863,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13103488.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014984552981331944,
+ "skip_count": 0.0,
+ "step": 8126,
+ "text_loss": 0.6307108402252197
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 38.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.00014486343146597152,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13106445.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00430954247713089,
+ "skip_count": 0.0,
+ "step": 8128,
+ "text_loss": 0.6226127743721008
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07177734375,
+ "learning_rate": 0.00014464562274365972,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13109258.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003711461555212736,
+ "skip_count": 1.0,
+ "step": 8130,
+ "text_loss": 0.17819052934646606
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.17845611975345,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00014442795019381567,
+ "loss": 0.0064,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 13114206.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015719098970294,
+ "skip_count": 1.0,
+ "step": 8132,
+ "text_loss": 0.28450697660446167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.00014421041389985184,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13117351.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013113922905176878,
+ "skip_count": 0.0,
+ "step": 8134,
+ "text_loss": 0.310830682516098
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 38.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.00014399301394512858,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 13120228.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001965439412742853,
+ "skip_count": 1.0,
+ "step": 8136,
+ "text_loss": 0.8635116815567017
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.00014377575041295393,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 13123380.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004898902028799057,
+ "skip_count": 2.0,
+ "step": 8138,
+ "text_loss": 0.5302467346191406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.21602582917523,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0179443359375,
+ "learning_rate": 0.0001435586233865836,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13126875.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00031845085322856903,
+ "skip_count": 0.0,
+ "step": 8140,
+ "text_loss": 0.5913560390472412
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 38.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0001433416329492213,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 13129563.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00298812473192811,
+ "skip_count": 1.0,
+ "step": 8142,
+ "text_loss": 0.5153398513793945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00014312477918401807,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13132608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026608197949826717,
+ "skip_count": 1.0,
+ "step": 8144,
+ "text_loss": 0.4554155766963959
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 38.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.00014290806217407272,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 13136204.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0027651884593069553,
+ "skip_count": 1.0,
+ "step": 8146,
+ "text_loss": 0.6349515318870544
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00014269148200243148,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13138895.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006579195614904165,
+ "skip_count": 0.0,
+ "step": 8148,
+ "text_loss": 0.4629364013671875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.26298796595245,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.00014247503875208846,
+ "loss": 0.0059,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 13142500.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023065708577632904,
+ "skip_count": 0.0,
+ "step": 8150,
+ "text_loss": 0.4962928593158722
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.00014225873250598496,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13146203.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007397830951958895,
+ "skip_count": 1.0,
+ "step": 8152,
+ "text_loss": 0.3225953280925751
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00014204256334700988,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13149517.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004839105997234583,
+ "skip_count": 1.0,
+ "step": 8154,
+ "text_loss": 0.18435558676719666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 38.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.00014182653135799995,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13152643.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028303388971835375,
+ "skip_count": 4.0,
+ "step": 8156,
+ "text_loss": 0.5836900472640991
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0001416106366217389,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13155213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004012314020656049,
+ "skip_count": 0.0,
+ "step": 8158,
+ "text_loss": 0.3723861575126648
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 20.0,
+ "epoch": 38.30995010272967,
+ "f1_execute": 0.9714285731315613,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0001413948792209579,
+ "loss": 0.0065,
+ "macro_f1": 0.8793651461601257,
+ "num_tokens": 13158440.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04377155378460884,
+ "skip_count": 9.0,
+ "step": 8160,
+ "text_loss": 0.32476910948753357
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0001411792592383357,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13162651.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011163362069055438,
+ "skip_count": 0.0,
+ "step": 8162,
+ "text_loss": 0.4890389144420624
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.32873495744057,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.00014096377675649823,
+ "loss": 0.0055,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 13165406.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012117774225771427,
+ "skip_count": 1.0,
+ "step": 8164,
+ "text_loss": 0.7763246893882751
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 38.33812738479601,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00014074843185801883,
+ "loss": 0.004,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 13168402.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.009951545856893063,
+ "skip_count": 2.0,
+ "step": 8166,
+ "text_loss": 0.5038266777992249
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 38.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.00014053322462541802,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 13171423.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0021372761111706495,
+ "skip_count": 1.0,
+ "step": 8168,
+ "text_loss": 0.5634724497795105
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.00014031815514116354,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13174713.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007417177548632026,
+ "skip_count": 0.0,
+ "step": 8170,
+ "text_loss": 0.4009707272052765
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 38.36630466686234,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.00014010322348767057,
+ "loss": 0.0077,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 13178012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01619168184697628,
+ "skip_count": 3.0,
+ "step": 8172,
+ "text_loss": 0.29182371497154236
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.00013988842974730137,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13181096.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037969043478369713,
+ "skip_count": 0.0,
+ "step": 8174,
+ "text_loss": 0.275851845741272
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.385089521573235,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.00013967377400236515,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13184116.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007759644067846239,
+ "skip_count": 0.0,
+ "step": 8176,
+ "text_loss": 0.7569663524627686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.00013945925633511848,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13187319.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002708743792027235,
+ "skip_count": 0.0,
+ "step": 8178,
+ "text_loss": 0.4733831286430359
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.00013924487682776492,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13190796.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005060714902356267,
+ "skip_count": 0.0,
+ "step": 8180,
+ "text_loss": 0.5663171410560608
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.413266803639566,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0001390306355624551,
+ "loss": 0.0049,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 13193705.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02932601235806942,
+ "skip_count": 1.0,
+ "step": 8182,
+ "text_loss": 0.30700045824050903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0001388165326212867,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13196393.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011637522839009762,
+ "skip_count": 0.0,
+ "step": 8184,
+ "text_loss": 0.6897354125976562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.43205165835045,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00013860256808630427,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13199526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017184355529025197,
+ "skip_count": 0.0,
+ "step": 8186,
+ "text_loss": 0.6246579885482788
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.441444085705896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.00013838874203949954,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13202963.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026622721925377846,
+ "skip_count": 0.0,
+ "step": 8188,
+ "text_loss": 0.506066083908081
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.00013817505456281099,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13207408.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000543750764336437,
+ "skip_count": 0.0,
+ "step": 8190,
+ "text_loss": 0.5192428231239319
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0001379615057381241,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13211073.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010060713393613696,
+ "skip_count": 0.0,
+ "step": 8192,
+ "text_loss": 0.5640166401863098
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.00013774809564727104,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13214203.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005152868572622538,
+ "skip_count": 2.0,
+ "step": 8194,
+ "text_loss": 0.8643819689750671
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.47901379512768,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 0.0001375348243720312,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13217748.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017722113989293575,
+ "skip_count": 2.0,
+ "step": 8196,
+ "text_loss": 0.40500834584236145
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0001373216919941304,
+ "loss": 0.005,
+ "macro_f1": 1.0,
+ "num_tokens": 13221341.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00999271310865879,
+ "skip_count": 3.0,
+ "step": 8198,
+ "text_loss": 0.2317391037940979
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.00013710869859524143,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13224288.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016836341237649322,
+ "skip_count": 0.0,
+ "step": 8200,
+ "text_loss": 0.31873467564582825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.00013689584425698376,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13227342.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002255793660879135,
+ "skip_count": 0.0,
+ "step": 8202,
+ "text_loss": 0.13513202965259552
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 38.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0001366831290609235,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 13230912.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0062925987876951694,
+ "skip_count": 4.0,
+ "step": 8204,
+ "text_loss": 0.3692396581172943
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 38.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00013647055308857353,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13233961.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0020471401512622833,
+ "skip_count": 0.0,
+ "step": 8206,
+ "text_loss": 0.5655510425567627
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.0001362581164213934,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13237170.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009666495025157928,
+ "skip_count": 0.0,
+ "step": 8208,
+ "text_loss": 0.720582902431488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.00013604581914078922,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13241020.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006306356517598033,
+ "skip_count": 0.0,
+ "step": 8210,
+ "text_loss": 0.5686481595039368
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 38.55415321397123,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.00013583366132811374,
+ "loss": 0.0058,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 13244491.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.016230134293437004,
+ "skip_count": 0.0,
+ "step": 8212,
+ "text_loss": 0.55678790807724
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.00013562164306466624,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13247551.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003904943587258458,
+ "skip_count": 2.0,
+ "step": 8214,
+ "text_loss": 0.6521575450897217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.57293806868213,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.00013540976443169244,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13250863.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002239734400063753,
+ "skip_count": 1.0,
+ "step": 8216,
+ "text_loss": 0.29757481813430786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.00013519802551038452,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13254215.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004978829529136419,
+ "skip_count": 2.0,
+ "step": 8218,
+ "text_loss": 0.30598193407058716
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00013498642638188157,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13257269.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0040260558016598225,
+ "skip_count": 0.0,
+ "step": 8220,
+ "text_loss": 0.39327144622802734
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.00013477496712726862,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13260573.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002124674618244171,
+ "skip_count": 0.0,
+ "step": 8222,
+ "text_loss": 0.38342708349227905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00013456364782757718,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13263684.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00087209593039006,
+ "skip_count": 0.0,
+ "step": 8224,
+ "text_loss": 0.6338301301002502
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 38.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00013435246856378526,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13266879.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003183641703799367,
+ "skip_count": 0.0,
+ "step": 8226,
+ "text_loss": 0.6073583364486694
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.629292632814796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0169677734375,
+ "learning_rate": 0.00013414142941681718,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13270679.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001859338372014463,
+ "skip_count": 0.0,
+ "step": 8228,
+ "text_loss": 0.5427029132843018
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0001339305304675435,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13273275.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000655558833386749,
+ "skip_count": 0.0,
+ "step": 8230,
+ "text_loss": 0.29442915320396423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.00013371977179678113,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13276205.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011499621905386448,
+ "skip_count": 0.0,
+ "step": 8232,
+ "text_loss": 0.5601125359535217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00013350915348529313,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13279242.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019823790062218904,
+ "skip_count": 0.0,
+ "step": 8234,
+ "text_loss": 0.43674135208129883
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 38.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.00013329867561378888,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13282531.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005772443953901529,
+ "skip_count": 3.0,
+ "step": 8236,
+ "text_loss": 0.4838809072971344
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.00013308833826292395,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13286219.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038314659614115953,
+ "skip_count": 2.0,
+ "step": 8238,
+ "text_loss": 0.5002569556236267
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 38.685647196947464,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.00013287814151329987,
+ "loss": 0.0075,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 13290348.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04819172993302345,
+ "skip_count": 4.0,
+ "step": 8240,
+ "text_loss": 0.3099883198738098
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.00013266808544546438,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13293644.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010334883816540241,
+ "skip_count": 2.0,
+ "step": 8242,
+ "text_loss": 0.17672912776470184
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.00013245817013991164,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13296721.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00162201386410743,
+ "skip_count": 0.0,
+ "step": 8244,
+ "text_loss": 0.7664286494255066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.00013224839567708142,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13299704.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0039452011696994305,
+ "skip_count": 0.0,
+ "step": 8246,
+ "text_loss": 0.1827820986509323
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 38.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.00013203876213735972,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 13302553.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006701917387545109,
+ "skip_count": 7.0,
+ "step": 8248,
+ "text_loss": 0.6020278930664062
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.73260933372468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.0001318292696010785,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13305875.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00968079548329115,
+ "skip_count": 2.0,
+ "step": 8250,
+ "text_loss": 0.2693248987197876
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 38.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.00013161991814851571,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 13309115.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.008890608325600624,
+ "skip_count": 2.0,
+ "step": 8252,
+ "text_loss": 0.6325297355651855
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 38.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 0.00013141070785989517,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 13312219.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00825794693082571,
+ "skip_count": 4.0,
+ "step": 8254,
+ "text_loss": 0.284396767616272
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00013120163881538677,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13315214.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003378969384357333,
+ "skip_count": 1.0,
+ "step": 8256,
+ "text_loss": 0.20296992361545563
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.77017904314646,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.00013099271109510603,
+ "loss": 0.005,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 13319117.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0164186954498291,
+ "skip_count": 0.0,
+ "step": 8258,
+ "text_loss": 0.21940068900585175
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 38.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0001307839247791145,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13321631.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0053979759104549885,
+ "skip_count": 3.0,
+ "step": 8260,
+ "text_loss": 0.19442199170589447
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.00013057527994741946,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13324759.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024567479267716408,
+ "skip_count": 0.0,
+ "step": 8262,
+ "text_loss": 0.5528824925422668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0001303667766799741,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13327554.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002819873159751296,
+ "skip_count": 1.0,
+ "step": 8264,
+ "text_loss": 0.4418395757675171
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.00013015841505667703,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13331838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030280952341854572,
+ "skip_count": 1.0,
+ "step": 8266,
+ "text_loss": 0.5263079404830933
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 38.81714117992369,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0185546875,
+ "learning_rate": 0.0001299501951573731,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13334968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001774887670762837,
+ "skip_count": 4.0,
+ "step": 8268,
+ "text_loss": 0.47985130548477173
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00012974211706185247,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13338052.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007027842104434967,
+ "skip_count": 1.0,
+ "step": 8270,
+ "text_loss": 0.6588287949562073
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00012953418084985107,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13341653.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026854060124605894,
+ "skip_count": 1.0,
+ "step": 8272,
+ "text_loss": 0.43156498670578003
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.00012932638660105038,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13345173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033325920812785625,
+ "skip_count": 0.0,
+ "step": 8274,
+ "text_loss": 0.1679086685180664
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.00012911873439507766,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13348635.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016183287370949984,
+ "skip_count": 0.0,
+ "step": 8276,
+ "text_loss": 0.5907418131828308
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.00012891122431150549,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13351120.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0049970983527600765,
+ "skip_count": 1.0,
+ "step": 8278,
+ "text_loss": 0.5437678694725037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 0.00012870385642985222,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13353774.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027123154141008854,
+ "skip_count": 0.0,
+ "step": 8280,
+ "text_loss": 0.5742796659469604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.00012849663082958158,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13358236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0062842960469424725,
+ "skip_count": 0.0,
+ "step": 8282,
+ "text_loss": 0.2340863049030304
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.00012828954759010265,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13360994.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006564505747519433,
+ "skip_count": 0.0,
+ "step": 8284,
+ "text_loss": 0.45432794094085693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0001280826067907705,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13363665.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001298630959354341,
+ "skip_count": 0.0,
+ "step": 8286,
+ "text_loss": 0.7439755201339722
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00012787580851088493,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13367412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00464112963527441,
+ "skip_count": 0.0,
+ "step": 8288,
+ "text_loss": 0.2854461669921875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.92045788083358,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0001276691528296916,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13370745.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006090773968026042,
+ "skip_count": 0.0,
+ "step": 8290,
+ "text_loss": 0.6663011312484741
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.00012746263982638123,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13373396.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038922233507037163,
+ "skip_count": 0.0,
+ "step": 8292,
+ "text_loss": 0.3858443796634674
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.93924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.00012725626958009007,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13376172.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016941255889832973,
+ "skip_count": 0.0,
+ "step": 8294,
+ "text_loss": 0.4758119285106659
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 38.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.0001270500421698994,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13379002.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001703770598396659,
+ "skip_count": 0.0,
+ "step": 8296,
+ "text_loss": 0.7464606165885925
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.00012684395767483626,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13382221.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001474690856412053,
+ "skip_count": 1.0,
+ "step": 8298,
+ "text_loss": 0.37309199571609497
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 38.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00012663801617387245,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13385276.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004561704583466053,
+ "skip_count": 3.0,
+ "step": 8300,
+ "text_loss": 0.43284836411476135
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 38.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.00012643221774592518,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 13388321.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005136100109666586,
+ "skip_count": 1.0,
+ "step": 8302,
+ "text_loss": 0.669730007648468
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.00012622656246985675,
+ "loss": 0.0101,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13391222.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028521555941551924,
+ "skip_count": 0.0,
+ "step": 8304,
+ "text_loss": 0.16773155331611633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.00012602105042447471,
+ "loss": 0.0087,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13395297.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033424890134483576,
+ "skip_count": 2.0,
+ "step": 8306,
+ "text_loss": 0.1650846153497696
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0001258156816885316,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13398482.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012481207959353924,
+ "skip_count": 0.0,
+ "step": 8308,
+ "text_loss": 0.37225499749183655
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 39.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.00012561045634072515,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13402199.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006243644282221794,
+ "skip_count": 3.0,
+ "step": 8310,
+ "text_loss": 0.16000206768512726
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00012540537445969807,
+ "loss": 0.0087,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13404950.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004267443902790546,
+ "skip_count": 2.0,
+ "step": 8312,
+ "text_loss": 0.400174081325531
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.00012520043612403815,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13407883.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005013707559555769,
+ "skip_count": 2.0,
+ "step": 8314,
+ "text_loss": 0.1331731230020523
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 39.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00012499564141227798,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13410563.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00463570561259985,
+ "skip_count": 0.0,
+ "step": 8316,
+ "text_loss": 0.5098661184310913
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 39.05165835045494,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 0.0001247909904028956,
+ "loss": 0.0078,
+ "macro_f1": 1.0,
+ "num_tokens": 13413730.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007066591177135706,
+ "skip_count": 1.0,
+ "step": 8318,
+ "text_loss": 0.8059925436973572
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 39.061050777810394,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00012458648317431348,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13416425.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004210594110190868,
+ "skip_count": 3.0,
+ "step": 8320,
+ "text_loss": 0.6559522151947021
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0001243821198048992,
+ "loss": 0.0045,
+ "macro_f1": 1.0,
+ "num_tokens": 13419851.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005613257177174091,
+ "skip_count": 2.0,
+ "step": 8322,
+ "text_loss": 0.2783811688423157
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.00012417790037296523,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13422588.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00233642989769578,
+ "skip_count": 1.0,
+ "step": 8324,
+ "text_loss": 0.7659147381782532
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.00012397382495676874,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13425275.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013295465614646673,
+ "skip_count": 0.0,
+ "step": 8326,
+ "text_loss": 0.5693745017051697
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 39.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.0001237698936345119,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 13428314.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005712272133678198,
+ "skip_count": 1.0,
+ "step": 8328,
+ "text_loss": 0.8581340909004211
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.00012356610648434153,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13431453.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015835616504773498,
+ "skip_count": 0.0,
+ "step": 8330,
+ "text_loss": 0.1395341008901596
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.117405341943055,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.00012336246358434928,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13434566.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012973316479474306,
+ "skip_count": 0.0,
+ "step": 8332,
+ "text_loss": 0.7125005125999451
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.126797769298506,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.00012315896501257145,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13438056.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005822008824907243,
+ "skip_count": 0.0,
+ "step": 8334,
+ "text_loss": 0.7730510234832764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.00012295561084698915,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13441390.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00547185679897666,
+ "skip_count": 1.0,
+ "step": 8336,
+ "text_loss": 0.3927873373031616
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.000122752401165528,
+ "loss": 0.0022,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13443864.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011191967641934752,
+ "skip_count": 0.0,
+ "step": 8338,
+ "text_loss": 0.3996548354625702
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.00012254933604605828,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13447070.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005196621641516685,
+ "skip_count": 0.0,
+ "step": 8340,
+ "text_loss": 0.5597847104072571
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.00012234641556639508,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13450522.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003857341594994068,
+ "skip_count": 2.0,
+ "step": 8342,
+ "text_loss": 0.14400488138198853
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.17375990607572,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00012214363980429793,
+ "loss": 0.0056,
+ "macro_f1": 1.0,
+ "num_tokens": 13453578.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006664265412837267,
+ "skip_count": 3.0,
+ "step": 8344,
+ "text_loss": 0.27675092220306396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0595703125,
+ "learning_rate": 0.00012194100883747078,
+ "loss": 0.0095,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13456480.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003549816319718957,
+ "skip_count": 0.0,
+ "step": 8346,
+ "text_loss": 0.21776801347732544
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.19254476078662,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.00012173852274356217,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 13459859.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00446992926299572,
+ "skip_count": 3.0,
+ "step": 8348,
+ "text_loss": 0.1828736811876297
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.00012153618160016527,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13463104.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024826989974826574,
+ "skip_count": 1.0,
+ "step": 8350,
+ "text_loss": 0.15649555623531342
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0186767578125,
+ "learning_rate": 0.0001213339854848175,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13467051.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021385846193879843,
+ "skip_count": 1.0,
+ "step": 8352,
+ "text_loss": 0.49281737208366394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.00012113193447500081,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13470411.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014382716035470366,
+ "skip_count": 1.0,
+ "step": 8354,
+ "text_loss": 0.5984349846839905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00012093002864814151,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13474666.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008536498062312603,
+ "skip_count": 1.0,
+ "step": 8356,
+ "text_loss": 0.2851131856441498
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.00012072826808161036,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13477754.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027286717668175697,
+ "skip_count": 0.0,
+ "step": 8358,
+ "text_loss": 0.5987376570701599
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.248899324919286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0001205266528527223,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13481151.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002780565759167075,
+ "skip_count": 1.0,
+ "step": 8360,
+ "text_loss": 0.1847199648618698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00012032518303873674,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13484050.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006186611135490239,
+ "skip_count": 0.0,
+ "step": 8362,
+ "text_loss": 0.6229772567749023
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 39.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.00012012385871685716,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13488551.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00956071075052023,
+ "skip_count": 5.0,
+ "step": 8364,
+ "text_loss": 0.2810790538787842
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.00011992267996423162,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13491420.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008410792797803879,
+ "skip_count": 2.0,
+ "step": 8366,
+ "text_loss": 0.20509617030620575
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.28646903434106,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00011972164685795212,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13494736.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00762166129425168,
+ "skip_count": 1.0,
+ "step": 8368,
+ "text_loss": 0.24739402532577515
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.295861461696504,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.00011952075947505486,
+ "loss": 0.0051,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 13498363.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010674391873180866,
+ "skip_count": 1.0,
+ "step": 8370,
+ "text_loss": 0.31931644678115845
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 39.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0001193200178925204,
+ "loss": 0.0036,
+ "macro_f1": 1.0,
+ "num_tokens": 13501029.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0041843741200864315,
+ "skip_count": 1.0,
+ "step": 8372,
+ "text_loss": 0.5103049278259277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00011911942218727312,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13503854.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006344785797409713,
+ "skip_count": 0.0,
+ "step": 8374,
+ "text_loss": 0.4914432764053345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00011891897243618183,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13508316.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003527739318087697,
+ "skip_count": 0.0,
+ "step": 8376,
+ "text_loss": 0.5317551493644714
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00011871866871605913,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13512603.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001071247854270041,
+ "skip_count": 0.0,
+ "step": 8378,
+ "text_loss": 0.6693558096885681
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.00011851851110366185,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13515928.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000924977008253336,
+ "skip_count": 1.0,
+ "step": 8380,
+ "text_loss": 0.8004939556121826
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.35221602582917,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.0001183184996756908,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13518548.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017637151759117842,
+ "skip_count": 0.0,
+ "step": 8382,
+ "text_loss": 0.5012105107307434
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 39.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.00011811863450879063,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13522155.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0011129514314234257,
+ "skip_count": 0.0,
+ "step": 8384,
+ "text_loss": 0.3866073489189148
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 39.371000880540066,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.00011791891567955009,
+ "loss": 0.0046,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 13525352.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.042801812291145325,
+ "skip_count": 4.0,
+ "step": 8386,
+ "text_loss": 0.18817944824695587
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018798828125,
+ "learning_rate": 0.00011771934326450173,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13528537.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006869474309496582,
+ "skip_count": 0.0,
+ "step": 8388,
+ "text_loss": 0.6407818794250488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.00011751991734012229,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13531650.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008001072565093637,
+ "skip_count": 0.0,
+ "step": 8390,
+ "text_loss": 0.5149344205856323
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.3991781626064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00011732063798283204,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13535071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006921148742549121,
+ "skip_count": 0.0,
+ "step": 8392,
+ "text_loss": 0.5906356573104858
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00011712150526899523,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13537741.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005221226718276739,
+ "skip_count": 2.0,
+ "step": 8394,
+ "text_loss": 0.3381146192550659
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 39.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.00011692251927491987,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 13541189.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0023983579594641924,
+ "skip_count": 1.0,
+ "step": 8396,
+ "text_loss": 0.7345486283302307
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.427355444672735,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.00011672368007685774,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 13545210.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005362956319004297,
+ "skip_count": 2.0,
+ "step": 8398,
+ "text_loss": 0.6522865295410156
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.00011652498775100445,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13548260.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002955642296001315,
+ "skip_count": 0.0,
+ "step": 8400,
+ "text_loss": 0.3200102150440216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00011632644237349927,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13551519.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001079231034964323,
+ "skip_count": 0.0,
+ "step": 8402,
+ "text_loss": 0.7251807451248169
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 39.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.00011612804402042509,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13555241.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013860360719263554,
+ "skip_count": 0.0,
+ "step": 8404,
+ "text_loss": 0.159539595246315
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 25.0,
+ "epoch": 39.46492515409451,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.054931640625,
+ "learning_rate": 0.00011592979276780857,
+ "loss": 0.0055,
+ "macro_f1": 0.9555556178092957,
+ "num_tokens": 13558389.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017025530338287354,
+ "skip_count": 5.0,
+ "step": 8406,
+ "text_loss": 0.5154430270195007
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.00011573168869162004,
+ "loss": 0.0037,
+ "macro_f1": 1.0,
+ "num_tokens": 13561237.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007349071092903614,
+ "skip_count": 2.0,
+ "step": 8408,
+ "text_loss": 0.20888492465019226
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00011553373186777327,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 13564080.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003303215140476823,
+ "skip_count": 2.0,
+ "step": 8410,
+ "text_loss": 0.21808166801929474
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.49310243616085,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.00011533592237212558,
+ "loss": 0.0035,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13566649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005856195464730263,
+ "skip_count": 1.0,
+ "step": 8412,
+ "text_loss": 0.28037169575691223
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.0001151382602804782,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13570015.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007515792385675013,
+ "skip_count": 0.0,
+ "step": 8414,
+ "text_loss": 0.8517835736274719
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00011494074566857549,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13573262.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0043421462178230286,
+ "skip_count": 0.0,
+ "step": 8416,
+ "text_loss": 0.27418580651283264
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.00011474337861210544,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 13576104.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0108594736084342,
+ "skip_count": 2.0,
+ "step": 8418,
+ "text_loss": 0.4724268317222595
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.53067214558262,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.00011454615918669948,
+ "loss": 0.008,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 13579138.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04178442806005478,
+ "skip_count": 0.0,
+ "step": 8420,
+ "text_loss": 0.4065103530883789
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.00011434908746793238,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13582818.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004756448790431023,
+ "skip_count": 2.0,
+ "step": 8422,
+ "text_loss": 0.2932167947292328
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00011415216353132252,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13586261.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033427432645112276,
+ "skip_count": 1.0,
+ "step": 8424,
+ "text_loss": 0.47670233249664307
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.0001139553874523313,
+ "loss": 0.003,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13589765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006597383879125118,
+ "skip_count": 1.0,
+ "step": 8426,
+ "text_loss": 0.31448885798454285
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.5682418550044,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.00011375875930636403,
+ "loss": 0.005,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 13592741.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011398134753108025,
+ "skip_count": 1.0,
+ "step": 8428,
+ "text_loss": 0.17429469525814056
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 39.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.00011356227916876877,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13595763.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038021153304725885,
+ "skip_count": 0.0,
+ "step": 8430,
+ "text_loss": 0.6043882966041565
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.00011336594711483712,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13598274.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00044314167462289333,
+ "skip_count": 0.0,
+ "step": 8432,
+ "text_loss": 0.3818575143814087
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.00011316976321980388,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13601510.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001956664025783539,
+ "skip_count": 0.0,
+ "step": 8434,
+ "text_loss": 0.48483794927597046
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0001129737275588471,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13604410.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005170237272977829,
+ "skip_count": 0.0,
+ "step": 8436,
+ "text_loss": 0.21759741008281708
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.61520399178163,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.00011277784020708803,
+ "loss": 0.0045,
+ "macro_f1": 1.0,
+ "num_tokens": 13607207.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002223948948085308,
+ "skip_count": 2.0,
+ "step": 8438,
+ "text_loss": 0.6877034306526184
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 0.00011258210123959089,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13610981.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017733481945469975,
+ "skip_count": 1.0,
+ "step": 8440,
+ "text_loss": 0.7250658273696899
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 39.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00011238651073136358,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 13614194.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00155889883171767,
+ "skip_count": 1.0,
+ "step": 8442,
+ "text_loss": 0.6742649078369141
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00011219106875735652,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13618011.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011234934208914638,
+ "skip_count": 0.0,
+ "step": 8444,
+ "text_loss": 0.8105526566505432
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 39.65277370120341,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 0.00011199577539246347,
+ "loss": 0.0055,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 13621852.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02346695400774479,
+ "skip_count": 1.0,
+ "step": 8446,
+ "text_loss": 0.22664032876491547
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0001118006307115213,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13624711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012819754891097546,
+ "skip_count": 2.0,
+ "step": 8448,
+ "text_loss": 0.31696105003356934
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.00011160563478930969,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13627561.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0060531035996973515,
+ "skip_count": 2.0,
+ "step": 8450,
+ "text_loss": 0.2935826778411865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00011141078770055152,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13630445.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004288572818040848,
+ "skip_count": 0.0,
+ "step": 8452,
+ "text_loss": 0.5720692873001099
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.00011121608951991252,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13633496.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005682424642145634,
+ "skip_count": 1.0,
+ "step": 8454,
+ "text_loss": 0.28466710448265076
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.699735837980626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00011102154032200146,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13635938.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009555552969686687,
+ "skip_count": 0.0,
+ "step": 8456,
+ "text_loss": 0.47744694352149963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00011082714018136985,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13638863.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023627313785254955,
+ "skip_count": 0.0,
+ "step": 8458,
+ "text_loss": 0.5212090611457825
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.71852069269152,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0189208984375,
+ "learning_rate": 0.00011063288917251235,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 13641874.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00791920255869627,
+ "skip_count": 2.0,
+ "step": 8460,
+ "text_loss": 0.31359919905662537
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 39.72791312004696,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00011043878736986607,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 13644970.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0033252311404794455,
+ "skip_count": 1.0,
+ "step": 8462,
+ "text_loss": 0.33621230721473694
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.73730554740241,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.00011024483484781144,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 13648103.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005567418877035379,
+ "skip_count": 2.0,
+ "step": 8464,
+ "text_loss": 0.48708856105804443
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.00011005103168067143,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13651085.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00047958645154722035,
+ "skip_count": 0.0,
+ "step": 8466,
+ "text_loss": 0.4151248633861542
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00010985737794271161,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13654175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009806647431105375,
+ "skip_count": 0.0,
+ "step": 8468,
+ "text_loss": 0.7322396039962769
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.00010966387370814057,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13657058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009820344857871532,
+ "skip_count": 0.0,
+ "step": 8470,
+ "text_loss": 0.6350769400596619
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 39.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00010947051905110945,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13660203.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.002065197564661503,
+ "skip_count": 0.0,
+ "step": 8472,
+ "text_loss": 0.6025850176811218
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00010927731404571211,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13664021.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009939799783751369,
+ "skip_count": 0.0,
+ "step": 8474,
+ "text_loss": 0.3040087819099426
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0001090842587659851,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13667055.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008282510680146515,
+ "skip_count": 0.0,
+ "step": 8476,
+ "text_loss": 0.7306531667709351
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0001088913532859076,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13669940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008349589770659804,
+ "skip_count": 0.0,
+ "step": 8478,
+ "text_loss": 0.32041916251182556
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.00010869859767940133,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13672955.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007435405277647078,
+ "skip_count": 0.0,
+ "step": 8480,
+ "text_loss": 0.5343614816665649
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.00010850599202033051,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13676173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002763360273092985,
+ "skip_count": 0.0,
+ "step": 8482,
+ "text_loss": 0.6071668267250061
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00010831353638250213,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13680121.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00202178000472486,
+ "skip_count": 0.0,
+ "step": 8484,
+ "text_loss": 0.42487844824790955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.8406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.00010812123083966535,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13683504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0056348275393247604,
+ "skip_count": 1.0,
+ "step": 8486,
+ "text_loss": 0.17678795754909515
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.00010792907546551229,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13686870.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003331703832373023,
+ "skip_count": 0.0,
+ "step": 8488,
+ "text_loss": 0.32238465547561646
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00010773707033367708,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13690429.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011620528530329466,
+ "skip_count": 0.0,
+ "step": 8490,
+ "text_loss": 0.4141998291015625
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 39.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.00010754521551773655,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13693747.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005236583761870861,
+ "skip_count": 0.0,
+ "step": 8492,
+ "text_loss": 0.557283878326416
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 39.878191957734074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.00010735351109120972,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13696837.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005507425405085087,
+ "skip_count": 6.0,
+ "step": 8494,
+ "text_loss": 0.7394861578941345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.887584385089525,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 0.00010716195712755821,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13700080.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008621517335996032,
+ "skip_count": 0.0,
+ "step": 8496,
+ "text_loss": 0.7079368233680725
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.00010697055370018572,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13704088.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004489862476475537,
+ "skip_count": 0.0,
+ "step": 8498,
+ "text_loss": 0.5672308206558228
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.00010677930088243847,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 13707391.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009171495214104652,
+ "skip_count": 2.0,
+ "step": 8500,
+ "text_loss": 0.6851600408554077
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.00010658819874760495,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13711238.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016714727971702814,
+ "skip_count": 1.0,
+ "step": 8502,
+ "text_loss": 0.7102733850479126
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00010639724736891576,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13714553.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012916292762383819,
+ "skip_count": 0.0,
+ "step": 8504,
+ "text_loss": 0.4234752953052521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0001062064468195439,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13718046.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005265420186333358,
+ "skip_count": 0.0,
+ "step": 8506,
+ "text_loss": 0.5576326251029968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0001060157971726045,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13720687.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023503501433879137,
+ "skip_count": 1.0,
+ "step": 8508,
+ "text_loss": 0.5259605646133423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.95333137657764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.00010582529850115469,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13723946.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007593657355755568,
+ "skip_count": 0.0,
+ "step": 8510,
+ "text_loss": 0.3795129954814911
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.00010563495087819419,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13727589.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005672222469002008,
+ "skip_count": 0.0,
+ "step": 8512,
+ "text_loss": 0.685897946357727
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 39.972116231288524,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00010544475437666445,
+ "loss": 0.0049,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 13730579.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.01708158478140831,
+ "skip_count": 2.0,
+ "step": 8514,
+ "text_loss": 0.8044925332069397
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.00010525470906944917,
+ "loss": 0.0113,
+ "macro_f1": 1.0,
+ "num_tokens": 13733563.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010253295302391052,
+ "skip_count": 2.0,
+ "step": 8516,
+ "text_loss": 0.3999447524547577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.00010506481502937398,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13736645.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004293019883334637,
+ "skip_count": 0.0,
+ "step": 8518,
+ "text_loss": 0.3128681778907776
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 40.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.00010487507232920674,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 13740080.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030790462624281645,
+ "skip_count": 1.0,
+ "step": 8520,
+ "text_loss": 0.39142900705337524
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.00939242735544,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.00010468548104165709,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13743085.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007342757890000939,
+ "skip_count": 0.0,
+ "step": 8522,
+ "text_loss": 0.7652465105056763
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 0.00010449604123937689,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13746513.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030496022664010525,
+ "skip_count": 0.0,
+ "step": 8524,
+ "text_loss": 0.6259746551513672
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 40.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00010430675299495973,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 13749391.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010060965083539486,
+ "skip_count": 1.0,
+ "step": 8526,
+ "text_loss": 0.2266668826341629
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.0001041176163809413,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 13752449.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002234962536022067,
+ "skip_count": 2.0,
+ "step": 8528,
+ "text_loss": 0.9742465019226074
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.00010392863146979903,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13755572.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003572004789020866,
+ "skip_count": 0.0,
+ "step": 8530,
+ "text_loss": 0.5757357478141785
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.00010373979833395242,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13759198.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011161680333316326,
+ "skip_count": 0.0,
+ "step": 8532,
+ "text_loss": 0.6268131136894226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00010355111704576236,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13761914.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002053353004157543,
+ "skip_count": 0.0,
+ "step": 8534,
+ "text_loss": 0.22388778626918793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.00010336258767753232,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13765371.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003634720342233777,
+ "skip_count": 2.0,
+ "step": 8536,
+ "text_loss": 0.5802993178367615
+ },
+ {
+ "acc_repeat": 0.800000011920929,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.084531846199,
+ "f1_execute": 0.9729729890823364,
+ "f1_repeat": 0.888888955116272,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00010317421030150692,
+ "loss": 0.0072,
+ "macro_f1": 0.9539539813995361,
+ "num_tokens": 13768276.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.053806692361831665,
+ "skip_count": 5.0,
+ "step": 8538,
+ "text_loss": 0.10888377577066422
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.09392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.00010298598498987266,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13772369.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00501362606883049,
+ "skip_count": 1.0,
+ "step": 8540,
+ "text_loss": 0.5794995427131653
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.00010279791181475795,
+ "loss": 0.0082,
+ "macro_f1": 1.0,
+ "num_tokens": 13776595.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002230882178992033,
+ "skip_count": 2.0,
+ "step": 8542,
+ "text_loss": 0.5503702163696289
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.00010260999084823264,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13779993.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012205395614728332,
+ "skip_count": 0.0,
+ "step": 8544,
+ "text_loss": 0.7248672842979431
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.00010242222216230856,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13782683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003966465883422643,
+ "skip_count": 0.0,
+ "step": 8546,
+ "text_loss": 0.7446619272232056
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00010223460582893889,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13785534.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004968565888702869,
+ "skip_count": 1.0,
+ "step": 8548,
+ "text_loss": 0.22457796335220337
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.00010204714192001863,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13788608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033054195810109377,
+ "skip_count": 2.0,
+ "step": 8550,
+ "text_loss": 0.418837308883667
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 0.00010185983050738434,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13791553.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001166256028227508,
+ "skip_count": 0.0,
+ "step": 8552,
+ "text_loss": 0.4060337543487549
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.00010167267166281402,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13795304.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003844029037281871,
+ "skip_count": 2.0,
+ "step": 8554,
+ "text_loss": 0.17412975430488586
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.00010148566545802718,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13798445.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033507589250802994,
+ "skip_count": 0.0,
+ "step": 8556,
+ "text_loss": 0.24744336307048798
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.00010129881196468527,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13801338.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004076482728123665,
+ "skip_count": 0.0,
+ "step": 8558,
+ "text_loss": 0.6542767882347107
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01806640625,
+ "learning_rate": 0.00010111211125439069,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13804157.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005654391716234386,
+ "skip_count": 0.0,
+ "step": 8560,
+ "text_loss": 0.527079701423645
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00010092556339868758,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13807411.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004915264435112476,
+ "skip_count": 1.0,
+ "step": 8562,
+ "text_loss": 0.721017599105835
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.00010073916846906139,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13810489.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005571382585912943,
+ "skip_count": 1.0,
+ "step": 8564,
+ "text_loss": 0.5802517533302307
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.21602582917523,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.00010055292653693903,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13813526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001321605988778174,
+ "skip_count": 0.0,
+ "step": 8566,
+ "text_loss": 0.5485247373580933
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.00010036683767368859,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13817225.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001876185997389257,
+ "skip_count": 0.0,
+ "step": 8568,
+ "text_loss": 0.08957820385694504
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.00010018090195061997,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13820667.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004593426361680031,
+ "skip_count": 0.0,
+ "step": 8570,
+ "text_loss": 0.24580086767673492
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 9.999511943898398e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13824505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022372701205313206,
+ "skip_count": 0.0,
+ "step": 8572,
+ "text_loss": 0.20976831018924713
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 9.980949020997276e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13827623.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030519715510308743,
+ "skip_count": 0.0,
+ "step": 8574,
+ "text_loss": 0.7638732194900513
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 9.962401433471985e-05,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13831013.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005036211106926203,
+ "skip_count": 1.0,
+ "step": 8576,
+ "text_loss": 0.3791790306568146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 9.943869188429989e-05,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13833611.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002071794355288148,
+ "skip_count": 2.0,
+ "step": 8578,
+ "text_loss": 0.5480846166610718
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 40.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 9.925352292972884e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13836678.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008119060657918453,
+ "skip_count": 0.0,
+ "step": 8580,
+ "text_loss": 0.21605457365512848
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 9.906850754196379e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13839255.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004017427563667297,
+ "skip_count": 2.0,
+ "step": 8582,
+ "text_loss": 0.4473285973072052
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 9.888364579190285e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13842034.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005163116846233606,
+ "skip_count": 1.0,
+ "step": 8584,
+ "text_loss": 0.21627424657344818
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 9.869893775038557e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13844648.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0044358340092003345,
+ "skip_count": 1.0,
+ "step": 8586,
+ "text_loss": 0.5660704970359802
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 9.851438348819247e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13847629.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00038135924842208624,
+ "skip_count": 1.0,
+ "step": 8588,
+ "text_loss": 0.6401235461235046
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 9.832998307604495e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13851409.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004005341790616512,
+ "skip_count": 1.0,
+ "step": 8590,
+ "text_loss": 0.43975043296813965
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 9.814573658460562e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13854031.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006872966885566711,
+ "skip_count": 2.0,
+ "step": 8592,
+ "text_loss": 0.6000451445579529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 9.796164408447811e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13856813.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019872859120368958,
+ "skip_count": 0.0,
+ "step": 8594,
+ "text_loss": 0.6026073098182678
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 9.777770564620698e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13859805.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013098123483359814,
+ "skip_count": 2.0,
+ "step": 8596,
+ "text_loss": 0.3294500708580017
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 40.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 9.759392134027783e-05,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 13863119.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001011171261779964,
+ "skip_count": 1.0,
+ "step": 8598,
+ "text_loss": 0.4078965187072754
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 9.741029123711708e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13866239.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003267963184043765,
+ "skip_count": 0.0,
+ "step": 8600,
+ "text_loss": 0.5064641833305359
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.385089521573235,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 9.722681540709228e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 13869647.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02431299351155758,
+ "skip_count": 2.0,
+ "step": 8602,
+ "text_loss": 0.2512950301170349
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 9.704349392051155e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13873128.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019577480852603912,
+ "skip_count": 1.0,
+ "step": 8604,
+ "text_loss": 0.425156831741333
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 9.686032684762408e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13876603.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001554530463181436,
+ "skip_count": 1.0,
+ "step": 8606,
+ "text_loss": 0.3596082329750061
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01519775390625,
+ "learning_rate": 9.667731425861975e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13879602.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027400986291468143,
+ "skip_count": 0.0,
+ "step": 8608,
+ "text_loss": 0.12101534754037857
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 9.649445622362957e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13882204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001957559958100319,
+ "skip_count": 2.0,
+ "step": 8610,
+ "text_loss": 0.382834255695343
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.43205165835045,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 9.631175281272491e-05,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 13886397.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009613300673663616,
+ "skip_count": 3.0,
+ "step": 8612,
+ "text_loss": 0.24718235433101654
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.441444085705896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 9.612920409591813e-05,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13889625.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015159029280766845,
+ "skip_count": 0.0,
+ "step": 8614,
+ "text_loss": 0.406452476978302
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 40.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 9.59468101431622e-05,
+ "loss": 0.0034,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13892518.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008069832809269428,
+ "skip_count": 3.0,
+ "step": 8616,
+ "text_loss": 0.19740329682826996
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0157470703125,
+ "learning_rate": 9.576457102435082e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13895822.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024340536911040545,
+ "skip_count": 0.0,
+ "step": 8618,
+ "text_loss": 0.44761306047439575
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 40.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 9.558248680931841e-05,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 13898829.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0053517078049480915,
+ "skip_count": 1.0,
+ "step": 8620,
+ "text_loss": 0.37335118651390076
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.47901379512768,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 9.540055756783994e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.9255813956260681,
+ "num_tokens": 13902122.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.03885587304830551,
+ "skip_count": 4.0,
+ "step": 8622,
+ "text_loss": 0.21311092376708984
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 9.521878336963108e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13904874.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007965708151459694,
+ "skip_count": 1.0,
+ "step": 8624,
+ "text_loss": 0.27229398488998413
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 9.5037164284348e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13907755.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019825168419629335,
+ "skip_count": 0.0,
+ "step": 8626,
+ "text_loss": 0.6535577178001404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.507191077194015,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 9.485570038158747e-05,
+ "loss": 0.0085,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 13910619.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017803344875574112,
+ "skip_count": 0.0,
+ "step": 8628,
+ "text_loss": 0.26617178320884705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 9.467439173088687e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13914098.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025836096610873938,
+ "skip_count": 0.0,
+ "step": 8630,
+ "text_loss": 0.44465285539627075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 9.44932384017238e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13917192.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004438584204763174,
+ "skip_count": 2.0,
+ "step": 8632,
+ "text_loss": 0.33622798323631287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 9.431224046351688e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13920067.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017312567681074142,
+ "skip_count": 2.0,
+ "step": 8634,
+ "text_loss": 0.31870952248573303
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 9.413139798562476e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13922887.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019389945082366467,
+ "skip_count": 0.0,
+ "step": 8636,
+ "text_loss": 0.18223261833190918
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.55415321397123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 9.395071103734648e-05,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13926545.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011485094437375665,
+ "skip_count": 0.0,
+ "step": 8638,
+ "text_loss": 0.48031774163246155
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 40.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 9.377017968792179e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13931171.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003448521951213479,
+ "skip_count": 0.0,
+ "step": 8640,
+ "text_loss": 0.7585139870643616
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 40.57293806868213,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 9.35898040065305e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 13934369.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017959754914045334,
+ "skip_count": 2.0,
+ "step": 8642,
+ "text_loss": 0.49708613753318787
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 9.3409584062293e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13938166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004092653747648001,
+ "skip_count": 1.0,
+ "step": 8644,
+ "text_loss": 0.20662656426429749
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 9.322951992426992e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13941922.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026206092443317175,
+ "skip_count": 0.0,
+ "step": 8646,
+ "text_loss": 0.4735889434814453
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 40.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 9.304961166146209e-05,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 13945569.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.005156307481229305,
+ "skip_count": 2.0,
+ "step": 8648,
+ "text_loss": 0.5630270838737488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 9.286985934281079e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13948357.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004913610871881247,
+ "skip_count": 1.0,
+ "step": 8650,
+ "text_loss": 0.4053497016429901
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0186767578125,
+ "learning_rate": 9.26902630371974e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13952543.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003946282435208559,
+ "skip_count": 2.0,
+ "step": 8652,
+ "text_loss": 0.40166863799095154
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.629292632814796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 9.251082281344358e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13955917.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009605551022104919,
+ "skip_count": 0.0,
+ "step": 8654,
+ "text_loss": 0.20477983355522156
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 40.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 9.233153874031102e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13960071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004408199340105057,
+ "skip_count": 3.0,
+ "step": 8656,
+ "text_loss": 0.3349814713001251
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 9.215241088650194e-05,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 13963125.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005541396792978048,
+ "skip_count": 2.0,
+ "step": 8658,
+ "text_loss": 0.6602919697761536
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 9.197343932065843e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13966130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001636760076507926,
+ "skip_count": 0.0,
+ "step": 8660,
+ "text_loss": 0.7704628109931946
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 9.179462411136263e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13969791.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006453761598095298,
+ "skip_count": 0.0,
+ "step": 8662,
+ "text_loss": 0.3898075520992279
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 40.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 9.161596532713695e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13972987.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005081792362034321,
+ "skip_count": 4.0,
+ "step": 8664,
+ "text_loss": 0.8477506041526794
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.685647196947464,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 9.143746303644374e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13976505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032063762191683054,
+ "skip_count": 0.0,
+ "step": 8666,
+ "text_loss": 0.23729658126831055
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 9.125911730768543e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13980061.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00043821477447636425,
+ "skip_count": 0.0,
+ "step": 8668,
+ "text_loss": 0.4233637750148773
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 9.108092820920438e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13983407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007779054809361696,
+ "skip_count": 2.0,
+ "step": 8670,
+ "text_loss": 0.5050316452980042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 9.090289580928307e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13986725.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018697676714509726,
+ "skip_count": 1.0,
+ "step": 8672,
+ "text_loss": 1.0568488836288452
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 9.072502017614382e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13990765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002077789744362235,
+ "skip_count": 0.0,
+ "step": 8674,
+ "text_loss": 0.48911142349243164
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 40.73260933372468,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 9.054730137794887e-05,
+ "loss": 0.0081,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 13994083.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.044373031705617905,
+ "skip_count": 3.0,
+ "step": 8676,
+ "text_loss": 0.3420281708240509
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 9.036973948280048e-05,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13997500.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015431724023073912,
+ "skip_count": 0.0,
+ "step": 8678,
+ "text_loss": 0.21514096856117249
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 9.019233455874049e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14000460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006088062655180693,
+ "skip_count": 1.0,
+ "step": 8680,
+ "text_loss": 0.43932875990867615
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 9.001508667375107e-05,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 14003537.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01006145216524601,
+ "skip_count": 3.0,
+ "step": 8682,
+ "text_loss": 0.2192728966474533
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.77017904314646,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 8.983799589575393e-05,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14005943.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001044525415636599,
+ "skip_count": 0.0,
+ "step": 8684,
+ "text_loss": 0.8686383962631226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 8.96610622926104e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14008954.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004876079503446817,
+ "skip_count": 2.0,
+ "step": 8686,
+ "text_loss": 0.2513524889945984
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 8.948428593212193e-05,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 14012268.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007909095846116543,
+ "skip_count": 2.0,
+ "step": 8688,
+ "text_loss": 0.17117907106876373
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 8.930766688202946e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14015192.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022194553166627884,
+ "skip_count": 0.0,
+ "step": 8690,
+ "text_loss": 0.637697160243988
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 40.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0162353515625,
+ "learning_rate": 8.913120521001383e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14018055.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0023777696769684553,
+ "skip_count": 0.0,
+ "step": 8692,
+ "text_loss": 0.39099860191345215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.81714117992369,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 8.895490098369535e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14021035.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002676652278751135,
+ "skip_count": 1.0,
+ "step": 8694,
+ "text_loss": 0.6112156510353088
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 8.877875427063431e-05,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14023759.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001040685223415494,
+ "skip_count": 0.0,
+ "step": 8696,
+ "text_loss": 0.3562681972980499
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 40.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 8.86027651383302e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14026090.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0011444527190178633,
+ "skip_count": 0.0,
+ "step": 8698,
+ "text_loss": 0.6152632236480713
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.84531846199002,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 8.842693365422266e-05,
+ "loss": 0.008,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 14029570.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.024327632039785385,
+ "skip_count": 3.0,
+ "step": 8700,
+ "text_loss": 0.2170596867799759
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 8.825125988569061e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14032418.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00048010432510636747,
+ "skip_count": 0.0,
+ "step": 8702,
+ "text_loss": 0.4421340525150299
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 8.807574390005241e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14035610.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010498231276869774,
+ "skip_count": 0.0,
+ "step": 8704,
+ "text_loss": 0.3656717538833618
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.873495744056356,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 8.790038576456627e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 14039354.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019302964210510254,
+ "skip_count": 1.0,
+ "step": 8706,
+ "text_loss": 0.6150856018066406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 8.772518554642972e-05,
+ "loss": 0.0029,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14042353.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004211598541587591,
+ "skip_count": 0.0,
+ "step": 8708,
+ "text_loss": 0.17178772389888763
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 8.755014331277972e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14045704.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007902922225184739,
+ "skip_count": 0.0,
+ "step": 8710,
+ "text_loss": 0.6289885640144348
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 8.737525913069277e-05,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 14048743.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007915202528238297,
+ "skip_count": 2.0,
+ "step": 8712,
+ "text_loss": 0.2778690457344055
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 40.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 8.720053306718506e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14052762.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027877227403223515,
+ "skip_count": 3.0,
+ "step": 8714,
+ "text_loss": 0.3615926504135132
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.92045788083358,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 8.702596518921175e-05,
+ "loss": 0.0086,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 14056645.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03460995852947235,
+ "skip_count": 1.0,
+ "step": 8716,
+ "text_loss": 0.19412031769752502
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 8.685155556366763e-05,
+ "loss": 0.0064,
+ "macro_f1": 1.0,
+ "num_tokens": 14059604.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0026834046002477407,
+ "skip_count": 2.0,
+ "step": 8718,
+ "text_loss": 0.4414670169353485
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 40.93924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 8.667730425738679e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14062170.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01547359861433506,
+ "skip_count": 4.0,
+ "step": 8720,
+ "text_loss": 0.2850716710090637
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 8.650321133714267e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14065526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020194994285702705,
+ "skip_count": 0.0,
+ "step": 8722,
+ "text_loss": 0.1776508241891861
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 8.632927686964798e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14068525.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037195945624262094,
+ "skip_count": 0.0,
+ "step": 8724,
+ "text_loss": 0.2786005735397339
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 40.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 8.615550092155477e-05,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 14071830.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008169961161911488,
+ "skip_count": 4.0,
+ "step": 8726,
+ "text_loss": 0.43228310346603394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 8.598188355945424e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14074977.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006407112814486027,
+ "skip_count": 1.0,
+ "step": 8728,
+ "text_loss": 0.24443474411964417
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 8.580842484987689e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14078104.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001878641895018518,
+ "skip_count": 1.0,
+ "step": 8730,
+ "text_loss": 0.4559098184108734
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 8.563512485929253e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14081934.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0056114462204277515,
+ "skip_count": 0.0,
+ "step": 8732,
+ "text_loss": 0.3063429594039917
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 8.546198365411007e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14085097.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001542840269394219,
+ "skip_count": 0.0,
+ "step": 8734,
+ "text_loss": 0.7624274492263794
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 8.528900130067741e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14088630.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002677374053746462,
+ "skip_count": 0.0,
+ "step": 8736,
+ "text_loss": 0.18395234644412994
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 8.511617786528175e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14091513.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004059800878167152,
+ "skip_count": 0.0,
+ "step": 8738,
+ "text_loss": 0.4567817449569702
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 41.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 8.494351341414947e-05,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 14094500.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0023724427446722984,
+ "skip_count": 1.0,
+ "step": 8740,
+ "text_loss": 0.6925744414329529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0155029296875,
+ "learning_rate": 8.477100801344573e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14097518.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013842503540217876,
+ "skip_count": 2.0,
+ "step": 8742,
+ "text_loss": 0.6574832201004028
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.05165835045494,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 8.459866172927505e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14101219.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003597316099330783,
+ "skip_count": 2.0,
+ "step": 8744,
+ "text_loss": 0.785912036895752
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 24.0,
+ "epoch": 41.061050777810394,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 8.442647462768082e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6225374937057495,
+ "num_tokens": 14104460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01929798349738121,
+ "skip_count": 5.0,
+ "step": 8746,
+ "text_loss": 0.2111714482307434
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 8.425444677464545e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14107404.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00048497592797502875,
+ "skip_count": 0.0,
+ "step": 8748,
+ "text_loss": 0.4764930307865143
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 8.408257823609033e-05,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 14109917.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007886217907071114,
+ "skip_count": 2.0,
+ "step": 8750,
+ "text_loss": 0.2771969735622406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 8.391086907787587e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14112649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006535434629768133,
+ "skip_count": 0.0,
+ "step": 8752,
+ "text_loss": 0.1550854742527008
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 8.373931936580114e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14116044.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002130605047568679,
+ "skip_count": 0.0,
+ "step": 8754,
+ "text_loss": 0.4055478870868683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 8.356792916560457e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14119097.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005611231899820268,
+ "skip_count": 0.0,
+ "step": 8756,
+ "text_loss": 0.47804903984069824
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 41.117405341943055,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 8.339669854296316e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14122079.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005650801584124565,
+ "skip_count": 0.0,
+ "step": 8758,
+ "text_loss": 0.1968296617269516
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.126797769298506,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 8.322562756349273e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14124910.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035948604345321655,
+ "skip_count": 1.0,
+ "step": 8760,
+ "text_loss": 0.4988253712654114
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 8.305471629274802e-05,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14127767.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012090947711840272,
+ "skip_count": 0.0,
+ "step": 8762,
+ "text_loss": 0.6330704689025879
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 8.288396479622262e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14130766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010853242129087448,
+ "skip_count": 1.0,
+ "step": 8764,
+ "text_loss": 0.43057000637054443
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 8.271337313934868e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14133804.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037055034190416336,
+ "skip_count": 2.0,
+ "step": 8766,
+ "text_loss": 0.31973564624786377
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 8.254294138749741e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14137164.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005338407587260008,
+ "skip_count": 0.0,
+ "step": 8768,
+ "text_loss": 0.5066531896591187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.17375990607572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 8.237266960597844e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14140119.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014707009540870786,
+ "skip_count": 1.0,
+ "step": 8770,
+ "text_loss": 0.553493857383728
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 8.220255786004033e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14143223.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002113121096044779,
+ "skip_count": 0.0,
+ "step": 8772,
+ "text_loss": 0.40016281604766846
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.19254476078662,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0179443359375,
+ "learning_rate": 8.203260621487019e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14146366.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002210963051766157,
+ "skip_count": 1.0,
+ "step": 8774,
+ "text_loss": 0.44022905826568604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 8.186281473559382e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14150009.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011857844656333327,
+ "skip_count": 0.0,
+ "step": 8776,
+ "text_loss": 0.572823703289032
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 8.169318348727544e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14153343.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020397785119712353,
+ "skip_count": 1.0,
+ "step": 8778,
+ "text_loss": 0.5724276900291443
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 8.152371253491841e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14156392.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001745635992847383,
+ "skip_count": 0.0,
+ "step": 8780,
+ "text_loss": 0.14162923395633698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 8.135440194346416e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14159616.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002799858106300235,
+ "skip_count": 0.0,
+ "step": 8782,
+ "text_loss": 0.18205340206623077
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 8.118525177779284e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14163531.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0029223538003861904,
+ "skip_count": 0.0,
+ "step": 8784,
+ "text_loss": 0.4107058644294739
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.248899324919286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 8.101626210272311e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14166776.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001209643087349832,
+ "skip_count": 0.0,
+ "step": 8786,
+ "text_loss": 0.6441596746444702
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 8.084743298301211e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14169586.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015196573222056031,
+ "skip_count": 0.0,
+ "step": 8788,
+ "text_loss": 0.35585930943489075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 8.067876448335549e-05,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14174180.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004388966190163046,
+ "skip_count": 0.0,
+ "step": 8790,
+ "text_loss": 0.31594613194465637
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 8.05102566683873e-05,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14177950.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0031201441306620836,
+ "skip_count": 0.0,
+ "step": 8792,
+ "text_loss": 0.3161006569862366
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.28646903434106,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 8.034190960268012e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14180642.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001848527928814292,
+ "skip_count": 0.0,
+ "step": 8794,
+ "text_loss": 0.47571417689323425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.295861461696504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 8.017372335074486e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14183743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0043064444325864315,
+ "skip_count": 1.0,
+ "step": 8796,
+ "text_loss": 0.5976942777633667
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 8.000569797703072e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14187742.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005383181851357222,
+ "skip_count": 2.0,
+ "step": 8798,
+ "text_loss": 0.2692606449127197
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 7.983783354592544e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14191211.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001401974936015904,
+ "skip_count": 0.0,
+ "step": 8800,
+ "text_loss": 0.38108205795288086
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 7.967013012175478e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14194992.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001168998540379107,
+ "skip_count": 0.0,
+ "step": 8802,
+ "text_loss": 0.5201764106750488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05322265625,
+ "learning_rate": 7.950258776878332e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14198059.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032015808392316103,
+ "skip_count": 2.0,
+ "step": 8804,
+ "text_loss": 0.6014752984046936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 7.933520655121351e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14202313.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009403078584000468,
+ "skip_count": 0.0,
+ "step": 8806,
+ "text_loss": 0.54194176197052
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.35221602582917,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 7.916798653318607e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14205534.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027781077660620213,
+ "skip_count": 1.0,
+ "step": 8808,
+ "text_loss": 0.7181227803230286
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 7.900092777878004e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14209357.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034586815163493156,
+ "skip_count": 1.0,
+ "step": 8810,
+ "text_loss": 0.21651209890842438
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 41.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 7.883403035201265e-05,
+ "loss": 0.0056,
+ "macro_f1": 1.0,
+ "num_tokens": 14212328.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01194343063980341,
+ "skip_count": 4.0,
+ "step": 8812,
+ "text_loss": 0.20523512363433838
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 41.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0157470703125,
+ "learning_rate": 7.866729431683938e-05,
+ "loss": 0.0038,
+ "macro_f1": 1.0,
+ "num_tokens": 14214979.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0045132869854569435,
+ "skip_count": 1.0,
+ "step": 8814,
+ "text_loss": 0.4066837728023529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0181884765625,
+ "learning_rate": 7.850071973715368e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14219030.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005109346006065607,
+ "skip_count": 2.0,
+ "step": 8816,
+ "text_loss": 0.12459450960159302
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.3991781626064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 7.833430667678737e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14222117.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036401136312633753,
+ "skip_count": 0.0,
+ "step": 8818,
+ "text_loss": 0.3759046494960785
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 41.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 7.816805519951008e-05,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 14225546.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006177824921905994,
+ "skip_count": 1.0,
+ "step": 8820,
+ "text_loss": 0.4031941592693329
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 41.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 7.800196536902987e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14228731.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009549650363624096,
+ "skip_count": 5.0,
+ "step": 8822,
+ "text_loss": 0.2895966172218323
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.427355444672735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 7.783603724899258e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14231796.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005532847251743078,
+ "skip_count": 2.0,
+ "step": 8824,
+ "text_loss": 0.32433390617370605
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 7.767027090298206e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14235869.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011165215400978923,
+ "skip_count": 0.0,
+ "step": 8826,
+ "text_loss": 0.41239091753959656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 7.750466639452059e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14238830.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007845646468922496,
+ "skip_count": 0.0,
+ "step": 8828,
+ "text_loss": 0.5113243460655212
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 7.733922378706787e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14241672.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029602700378745794,
+ "skip_count": 1.0,
+ "step": 8830,
+ "text_loss": 0.22004501521587372
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 41.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 7.717394314402199e-05,
+ "loss": 0.0037,
+ "macro_f1": 1.0,
+ "num_tokens": 14244522.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005297200754284859,
+ "skip_count": 1.0,
+ "step": 8832,
+ "text_loss": 0.6039504408836365
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 7.700882452871872e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14246964.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018059068825095892,
+ "skip_count": 2.0,
+ "step": 8834,
+ "text_loss": 0.46563026309013367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 7.684386800443177e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14249387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005659483838826418,
+ "skip_count": 2.0,
+ "step": 8836,
+ "text_loss": 0.31516948342323303
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.49310243616085,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 7.667907363437288e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14252438.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011170750483870506,
+ "skip_count": 1.0,
+ "step": 8838,
+ "text_loss": 0.22867503762245178
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 7.651444148169157e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14255490.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004106760956346989,
+ "skip_count": 2.0,
+ "step": 8840,
+ "text_loss": 0.5757828950881958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 7.634997160947499e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14258430.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008562540751881897,
+ "skip_count": 0.0,
+ "step": 8842,
+ "text_loss": 0.5166661143302917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 7.618566408074862e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14261275.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012901517329737544,
+ "skip_count": 0.0,
+ "step": 8844,
+ "text_loss": 0.7376981973648071
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.53067214558262,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 7.602151895847526e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14264698.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00267209205776453,
+ "skip_count": 0.0,
+ "step": 8846,
+ "text_loss": 0.5249470472335815
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 41.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 7.585753630555565e-05,
+ "loss": 0.009,
+ "macro_f1": 1.0,
+ "num_tokens": 14267887.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015334542840719223,
+ "skip_count": 7.0,
+ "step": 8848,
+ "text_loss": 1.1539889574050903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017578125,
+ "learning_rate": 7.569371618482818e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14271392.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010222389828413725,
+ "skip_count": 0.0,
+ "step": 8850,
+ "text_loss": 0.33968010544776917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 7.553005865906914e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14274658.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006116362637840211,
+ "skip_count": 0.0,
+ "step": 8852,
+ "text_loss": 0.7514221668243408
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.5682418550044,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 7.536656379099221e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14277763.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036474792286753654,
+ "skip_count": 0.0,
+ "step": 8854,
+ "text_loss": 0.3964846134185791
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 7.520323164324921e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14281165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005498840939253569,
+ "skip_count": 1.0,
+ "step": 8856,
+ "text_loss": 0.2235594391822815
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 41.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 7.504006227842919e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14284761.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006513409782201052,
+ "skip_count": 0.0,
+ "step": 8858,
+ "text_loss": 0.45196816325187683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 7.48770557590589e-05,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14287844.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013065916718915105,
+ "skip_count": 0.0,
+ "step": 8860,
+ "text_loss": 0.2188033014535904
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 7.471421214760287e-05,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14291280.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016644994029775262,
+ "skip_count": 0.0,
+ "step": 8862,
+ "text_loss": 0.7049906253814697
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.61520399178163,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 7.455153150646299e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14294330.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002664943691343069,
+ "skip_count": 0.0,
+ "step": 8864,
+ "text_loss": 0.2160239815711975
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 7.43890138979788e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14298355.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0035776710137724876,
+ "skip_count": 0.0,
+ "step": 8866,
+ "text_loss": 0.4922088384628296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 7.422665938442741e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14301452.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029914912302047014,
+ "skip_count": 2.0,
+ "step": 8868,
+ "text_loss": 0.5828475952148438
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 7.406446802802331e-05,
+ "loss": 0.0045,
+ "macro_f1": 1.0,
+ "num_tokens": 14304667.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0010031569981947541,
+ "skip_count": 2.0,
+ "step": 8870,
+ "text_loss": 0.657244861125946
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.65277370120341,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 7.390243989091849e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14307397.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007960405200719833,
+ "skip_count": 1.0,
+ "step": 8872,
+ "text_loss": 0.3147352635860443
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 7.37405750352026e-05,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 14310687.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007953251712024212,
+ "skip_count": 3.0,
+ "step": 8874,
+ "text_loss": 0.30315887928009033
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 7.357887352290227e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14314007.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012103051412850618,
+ "skip_count": 0.0,
+ "step": 8876,
+ "text_loss": 0.6356115341186523
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 7.341733541598217e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14316696.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017898730002343655,
+ "skip_count": 1.0,
+ "step": 8878,
+ "text_loss": 0.35877764225006104
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 7.325596077634383e-05,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14320172.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007144945557229221,
+ "skip_count": 0.0,
+ "step": 8880,
+ "text_loss": 0.7939266562461853
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.699735837980626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 7.309474966582635e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14323262.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001255290349945426,
+ "skip_count": 0.0,
+ "step": 8882,
+ "text_loss": 0.7115976810455322
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 7.293370214620616e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14326826.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028131126891821623,
+ "skip_count": 2.0,
+ "step": 8884,
+ "text_loss": 0.24073036015033722
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.71852069269152,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 7.277281827919691e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14329658.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024797592777758837,
+ "skip_count": 1.0,
+ "step": 8886,
+ "text_loss": 0.47276070713996887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 41.72791312004696,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 7.26120981264496e-05,
+ "loss": 0.0081,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 14333584.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023670634254813194,
+ "skip_count": 3.0,
+ "step": 8888,
+ "text_loss": 0.47537583112716675
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.73730554740241,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 7.245154174955254e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14336850.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009583478095009923,
+ "skip_count": 0.0,
+ "step": 8890,
+ "text_loss": 0.5258943438529968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 41.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 7.229114921003116e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14339940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006664840504527092,
+ "skip_count": 3.0,
+ "step": 8892,
+ "text_loss": 0.20986922085285187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 7.213092056934833e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14342737.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005362578085623682,
+ "skip_count": 0.0,
+ "step": 8894,
+ "text_loss": 0.5174402594566345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 7.197085588890383e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14345769.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006428950000554323,
+ "skip_count": 1.0,
+ "step": 8896,
+ "text_loss": 0.657136857509613
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 7.181095523003478e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14348563.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015549053205177188,
+ "skip_count": 0.0,
+ "step": 8898,
+ "text_loss": 0.49799686670303345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.78426768417963,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 7.165121865401535e-05,
+ "loss": 0.0068,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 14353134.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030110027641057968,
+ "skip_count": 2.0,
+ "step": 8900,
+ "text_loss": 0.3644331693649292
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 41.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 7.149164622205712e-05,
+ "loss": 0.0072,
+ "macro_f1": 1.0,
+ "num_tokens": 14356031.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0014812488807365298,
+ "skip_count": 1.0,
+ "step": 8902,
+ "text_loss": 0.46983054280281067
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 7.133223799530836e-05,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14358941.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001170543720945716,
+ "skip_count": 0.0,
+ "step": 8904,
+ "text_loss": 0.7030026316642761
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 41.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 7.117299403485466e-05,
+ "loss": 0.0085,
+ "macro_f1": 1.0,
+ "num_tokens": 14361807.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0011649372754618526,
+ "skip_count": 1.0,
+ "step": 8906,
+ "text_loss": 0.44989535212516785
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 7.101391440171856e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14365464.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028165180701762438,
+ "skip_count": 0.0,
+ "step": 8908,
+ "text_loss": 0.487165629863739
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 7.085499915685978e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14368149.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001956705003976822,
+ "skip_count": 2.0,
+ "step": 8910,
+ "text_loss": 0.3717629909515381
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.8406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 7.069624836117484e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14371440.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027164234779775143,
+ "skip_count": 1.0,
+ "step": 8912,
+ "text_loss": 0.3683965802192688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 7.053766207549734e-05,
+ "loss": 0.009,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14374965.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005999395158141851,
+ "skip_count": 2.0,
+ "step": 8914,
+ "text_loss": 0.6271854639053345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 7.037924036059789e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14378445.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000978486379608512,
+ "skip_count": 0.0,
+ "step": 8916,
+ "text_loss": 0.5927628874778748
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 7.022098327718401e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14382851.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012569266371428967,
+ "skip_count": 1.0,
+ "step": 8918,
+ "text_loss": 0.4092319905757904
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 41.878191957734074,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 7.006289088590007e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 14386959.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011032132431864738,
+ "skip_count": 2.0,
+ "step": 8920,
+ "text_loss": 0.6553854942321777
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.887584385089525,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 6.990496324732737e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14390031.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001376329455524683,
+ "skip_count": 0.0,
+ "step": 8922,
+ "text_loss": 0.7792862057685852
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 6.974720042198396e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14392966.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005924372002482414,
+ "skip_count": 2.0,
+ "step": 8924,
+ "text_loss": 0.4466548562049866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 6.958960247032515e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14395619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010054769925773144,
+ "skip_count": 2.0,
+ "step": 8926,
+ "text_loss": 0.24784758687019348
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 6.943216945274255e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14398891.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006864808965474367,
+ "skip_count": 0.0,
+ "step": 8928,
+ "text_loss": 0.5154114961624146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 6.927490142956489e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14402991.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000996887218207121,
+ "skip_count": 0.0,
+ "step": 8930,
+ "text_loss": 0.5888006091117859
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 6.911779846105753e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14406276.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0007863475475460291,
+ "skip_count": 0.0,
+ "step": 8932,
+ "text_loss": 0.6862632632255554
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 6.896086060742262e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14409005.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020060581155121326,
+ "skip_count": 1.0,
+ "step": 8934,
+ "text_loss": 0.8998132348060608
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.95333137657764,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 6.880408792879905e-05,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 14411902.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.008094016462564468,
+ "skip_count": 3.0,
+ "step": 8936,
+ "text_loss": 0.3411460518836975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 6.864748048526237e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14414683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004374993033707142,
+ "skip_count": 0.0,
+ "step": 8938,
+ "text_loss": 0.24222217500209808
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 6.84910383368249e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14417740.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003004335332661867,
+ "skip_count": 2.0,
+ "step": 8940,
+ "text_loss": 0.5524137020111084
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 6.83347615434356e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14420678.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007001105695962906,
+ "skip_count": 2.0,
+ "step": 8942,
+ "text_loss": 0.3124033212661743
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 6.817865016497993e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14424259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038414683658629656,
+ "skip_count": 0.0,
+ "step": 8944,
+ "text_loss": 0.509667694568634
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060791015625,
+ "learning_rate": 6.80227042612801e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14427084.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008573584258556366,
+ "skip_count": 0.0,
+ "step": 8946,
+ "text_loss": 0.2533438205718994
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.00939242735544,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 6.786692389209482e-05,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 14429690.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003758789971470833,
+ "skip_count": 2.0,
+ "step": 8948,
+ "text_loss": 0.14571085572242737
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 6.771130911711953e-05,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14432983.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005996126215904951,
+ "skip_count": 2.0,
+ "step": 8950,
+ "text_loss": 0.24994049966335297
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 6.755585999598613e-05,
+ "loss": 0.0032,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14435772.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012271527666598558,
+ "skip_count": 0.0,
+ "step": 8952,
+ "text_loss": 0.3705698549747467
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 42.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 6.740057658826293e-05,
+ "loss": 0.0081,
+ "macro_f1": 1.0,
+ "num_tokens": 14438912.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017618577694520354,
+ "skip_count": 1.0,
+ "step": 8954,
+ "text_loss": 0.6691124439239502
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 6.72454589534548e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14441959.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016956349136307836,
+ "skip_count": 1.0,
+ "step": 8956,
+ "text_loss": 0.45412346720695496
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 6.709050715100324e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14444804.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017321301624178886,
+ "skip_count": 2.0,
+ "step": 8958,
+ "text_loss": 0.2668265998363495
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 6.69357212402859e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14447390.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005267233122140169,
+ "skip_count": 2.0,
+ "step": 8960,
+ "text_loss": 0.35546016693115234
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 42.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.017578125,
+ "learning_rate": 6.67811012806172e-05,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14451286.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0045175012201070786,
+ "skip_count": 3.0,
+ "step": 8962,
+ "text_loss": 0.14669834077358246
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 6.662664733124768e-05,
+ "loss": 0.0064,
+ "macro_f1": 1.0,
+ "num_tokens": 14454335.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004905698820948601,
+ "skip_count": 3.0,
+ "step": 8964,
+ "text_loss": 0.28777357935905457
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 42.09392427355445,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 6.647235945136442e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 14457708.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.032136883586645126,
+ "skip_count": 1.0,
+ "step": 8966,
+ "text_loss": 0.2317836582660675
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 42.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 6.631823770009088e-05,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 14460721.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038611628115177155,
+ "skip_count": 1.0,
+ "step": 8968,
+ "text_loss": 0.28979742527008057
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 6.616428213648656e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14463467.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006560821202583611,
+ "skip_count": 0.0,
+ "step": 8970,
+ "text_loss": 0.3474387526512146
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 6.60104928195479e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14466586.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016879125032573938,
+ "skip_count": 0.0,
+ "step": 8972,
+ "text_loss": 0.5454491972923279
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 6.58568698082071e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14470125.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004945555119775236,
+ "skip_count": 0.0,
+ "step": 8974,
+ "text_loss": 0.4728975296020508
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 6.570341316133272e-05,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 14473887.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010141569189727306,
+ "skip_count": 3.0,
+ "step": 8976,
+ "text_loss": 0.24756617844104767
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 6.555012293772967e-05,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 14477046.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011950359679758549,
+ "skip_count": 2.0,
+ "step": 8978,
+ "text_loss": 0.25375646352767944
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 6.539699919613911e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14480638.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007824545609764755,
+ "skip_count": 0.0,
+ "step": 8980,
+ "text_loss": 0.6888379454612732
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 6.524404199523826e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14483723.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004318726249039173,
+ "skip_count": 1.0,
+ "step": 8982,
+ "text_loss": 0.3603152334690094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.17845611975345,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 6.509125139364058e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 14486876.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010652635246515274,
+ "skip_count": 1.0,
+ "step": 8984,
+ "text_loss": 0.43394285440444946
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 6.493862744989587e-05,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14489944.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010475299786776304,
+ "skip_count": 0.0,
+ "step": 8986,
+ "text_loss": 0.5952020287513733
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 6.478617022248984e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14493094.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004329503979533911,
+ "skip_count": 1.0,
+ "step": 8988,
+ "text_loss": 0.7284399271011353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 6.463387976984437e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14496944.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019588395953178406,
+ "skip_count": 1.0,
+ "step": 8990,
+ "text_loss": 0.8103306889533997
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.21602582917523,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 6.448175615031749e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14499997.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008046228438615799,
+ "skip_count": 1.0,
+ "step": 8992,
+ "text_loss": 0.14758773148059845
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 6.432979942220319e-05,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14503247.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0028899910394102335,
+ "skip_count": 0.0,
+ "step": 8994,
+ "text_loss": 0.2568151652812958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 6.417800964373161e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14506244.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0042211092077195644,
+ "skip_count": 2.0,
+ "step": 8996,
+ "text_loss": 0.3506850600242615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 6.402638687306872e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14510502.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003309462917968631,
+ "skip_count": 0.0,
+ "step": 8998,
+ "text_loss": 0.5852319598197937
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 42.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 6.387493116831699e-05,
+ "loss": 0.005,
+ "macro_f1": 1.0,
+ "num_tokens": 14513679.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015246274881064892,
+ "skip_count": 5.0,
+ "step": 9000,
+ "text_loss": 0.4266709089279175
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 42.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 6.372364258751434e-05,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 14516862.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005648075137287378,
+ "skip_count": 2.0,
+ "step": 9002,
+ "text_loss": 0.34153711795806885
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 42.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 6.357252118863482e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14519660.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005153972655534744,
+ "skip_count": 3.0,
+ "step": 9004,
+ "text_loss": 0.3911980092525482
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 6.342156702958851e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14522261.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001209715730510652,
+ "skip_count": 0.0,
+ "step": 9006,
+ "text_loss": 0.45400822162628174
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 6.327078016822124e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14525368.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00367624219506979,
+ "skip_count": 1.0,
+ "step": 9008,
+ "text_loss": 0.5327706336975098
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 6.31201606623149e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14528253.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018971028039231896,
+ "skip_count": 0.0,
+ "step": 9010,
+ "text_loss": 0.19216643273830414
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 6.296970856958712e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14531214.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003927265293896198,
+ "skip_count": 0.0,
+ "step": 9012,
+ "text_loss": 0.3931650221347809
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 6.281942394769142e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14535063.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00801338441669941,
+ "skip_count": 0.0,
+ "step": 9014,
+ "text_loss": 0.1605554074048996
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 6.266930685421717e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14538690.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013267790200188756,
+ "skip_count": 0.0,
+ "step": 9016,
+ "text_loss": 0.4797641932964325
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 6.251935734668957e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14542591.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013866537483409047,
+ "skip_count": 1.0,
+ "step": 9018,
+ "text_loss": 0.4539037346839905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 6.236957548256945e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14545259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001481749233789742,
+ "skip_count": 0.0,
+ "step": 9020,
+ "text_loss": 0.6693689227104187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 6.22199613192535e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14548362.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005995423533022404,
+ "skip_count": 1.0,
+ "step": 9022,
+ "text_loss": 0.6533607244491577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 42.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 6.207051491407428e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14551694.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015427720732986927,
+ "skip_count": 4.0,
+ "step": 9024,
+ "text_loss": 0.33537840843200684
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 6.192123632429986e-05,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14554614.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017432396998628974,
+ "skip_count": 0.0,
+ "step": 9026,
+ "text_loss": 0.9725127220153809
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.385089521573235,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 6.177212560713413e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14559474.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002909898292273283,
+ "skip_count": 2.0,
+ "step": 9028,
+ "text_loss": 0.16944198310375214
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 6.162318281971652e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14563046.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00274385092779994,
+ "skip_count": 0.0,
+ "step": 9030,
+ "text_loss": 0.43176764249801636
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 6.147440801912218e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14565829.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024230771232396364,
+ "skip_count": 0.0,
+ "step": 9032,
+ "text_loss": 0.5683854818344116
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 6.132580126236197e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14569016.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004686394706368446,
+ "skip_count": 1.0,
+ "step": 9034,
+ "text_loss": 0.5422781705856323
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 42.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 6.117736260638223e-05,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 14572558.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0010892068967223167,
+ "skip_count": 1.0,
+ "step": 9036,
+ "text_loss": 0.5740243196487427
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.43205165835045,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 6.102909210806495e-05,
+ "loss": 0.006,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 14575969.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0163960512727499,
+ "skip_count": 0.0,
+ "step": 9038,
+ "text_loss": 0.4803958535194397
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.441444085705896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 6.088098982422768e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14578746.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020733694545924664,
+ "skip_count": 0.0,
+ "step": 9040,
+ "text_loss": 0.30313390493392944
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.45083651306135,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 6.073305581162342e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 14581856.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.022739989683032036,
+ "skip_count": 2.0,
+ "step": 9042,
+ "text_loss": 0.5871608257293701
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 6.058529012694086e-05,
+ "loss": 0.0034,
+ "macro_f1": 1.0,
+ "num_tokens": 14584754.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012138293124735355,
+ "skip_count": 2.0,
+ "step": 9044,
+ "text_loss": 0.18492890894412994
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.053466796875,
+ "learning_rate": 6.0437692826803893e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14587867.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009839123813435435,
+ "skip_count": 0.0,
+ "step": 9046,
+ "text_loss": 0.5532476902008057
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 42.47901379512768,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11376953125,
+ "learning_rate": 6.029026396777237e-05,
+ "loss": 0.0082,
+ "macro_f1": 1.0,
+ "num_tokens": 14591521.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01392262615263462,
+ "skip_count": 5.0,
+ "step": 9048,
+ "text_loss": 0.20356278121471405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.48840622248312,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 6.0143003606341174e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 14595358.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018218200653791428,
+ "skip_count": 1.0,
+ "step": 9050,
+ "text_loss": 0.3070164620876312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 5.9995911798940764e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14598696.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002688709646463394,
+ "skip_count": 1.0,
+ "step": 9052,
+ "text_loss": 0.5637917518615723
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 5.984898860193694e-05,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14602301.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003135781968012452,
+ "skip_count": 0.0,
+ "step": 9054,
+ "text_loss": 0.345111608505249
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 5.9702234071631e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14606625.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002299862913787365,
+ "skip_count": 0.0,
+ "step": 9056,
+ "text_loss": 0.30707255005836487
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 5.9555648264259576e-05,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14610303.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0007164468406699598,
+ "skip_count": 0.0,
+ "step": 9058,
+ "text_loss": 0.56083083152771
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 5.940923123599462e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14613211.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00136603566352278,
+ "skip_count": 0.0,
+ "step": 9060,
+ "text_loss": 0.4455239474773407
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 5.926298304294336e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14615844.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001727075781673193,
+ "skip_count": 0.0,
+ "step": 9062,
+ "text_loss": 0.5928102731704712
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.55415321397123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 5.911690374114842e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14619190.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022300337441265583,
+ "skip_count": 0.0,
+ "step": 9064,
+ "text_loss": 0.9456163048744202
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 5.8970993386587676e-05,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14622304.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006507525686174631,
+ "skip_count": 2.0,
+ "step": 9066,
+ "text_loss": 0.1809750199317932
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.57293806868213,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 5.882525203517419e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14625386.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022866397630423307,
+ "skip_count": 0.0,
+ "step": 9068,
+ "text_loss": 0.1849939227104187
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 5.867967974275629e-05,
+ "loss": 0.0097,
+ "macro_f1": 1.0,
+ "num_tokens": 14628472.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0058460538275539875,
+ "skip_count": 2.0,
+ "step": 9070,
+ "text_loss": 0.2627561688423157
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 5.853427656511773e-05,
+ "loss": 0.0071,
+ "macro_f1": 1.0,
+ "num_tokens": 14631187.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0085217310115695,
+ "skip_count": 2.0,
+ "step": 9072,
+ "text_loss": 0.18039973080158234
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 42.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 5.838904255797717e-05,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 14633919.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007423012051731348,
+ "skip_count": 4.0,
+ "step": 9074,
+ "text_loss": 0.23746201395988464
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 5.8243977776988585e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14636674.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011181328445672989,
+ "skip_count": 0.0,
+ "step": 9076,
+ "text_loss": 0.38140806555747986
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 42.619900205459345,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 5.8099082277741024e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 14639506.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.03306882083415985,
+ "skip_count": 2.0,
+ "step": 9078,
+ "text_loss": 0.2627770006656647
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.629292632814796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 5.795435611575872e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14642955.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014759303303435445,
+ "skip_count": 0.0,
+ "step": 9080,
+ "text_loss": 0.47112786769866943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 5.78097993465011e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14646018.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003744201036170125,
+ "skip_count": 0.0,
+ "step": 9082,
+ "text_loss": 0.36873605847358704
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 5.7665412025362516e-05,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14649402.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002992798574268818,
+ "skip_count": 2.0,
+ "step": 9084,
+ "text_loss": 0.6350628137588501
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 5.752119420767243e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14652248.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005798593629151583,
+ "skip_count": 2.0,
+ "step": 9086,
+ "text_loss": 0.2512637972831726
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 5.7377145948695474e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14655060.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024162146728485823,
+ "skip_count": 0.0,
+ "step": 9088,
+ "text_loss": 0.4233066439628601
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 42.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 5.723326730363115e-05,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 14658873.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004826475866138935,
+ "skip_count": 4.0,
+ "step": 9090,
+ "text_loss": 0.45946353673934937
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.685647196947464,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 5.7089558327614036e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14661865.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020765739027410746,
+ "skip_count": 2.0,
+ "step": 9092,
+ "text_loss": 0.9425542950630188
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 5.694601907571356e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14666085.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012533976696431637,
+ "skip_count": 0.0,
+ "step": 9094,
+ "text_loss": 0.6307007670402527
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 42.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 5.680264960293446e-05,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 14668992.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013796845450997353,
+ "skip_count": 5.0,
+ "step": 9096,
+ "text_loss": 0.21720129251480103
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 5.665944996421612e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14672365.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004391494672745466,
+ "skip_count": 0.0,
+ "step": 9098,
+ "text_loss": 0.28794240951538086
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 5.651642021443287e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14676232.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006779583054594696,
+ "skip_count": 0.0,
+ "step": 9100,
+ "text_loss": 0.45190441608428955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 23.0,
+ "epoch": 42.73260933372468,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 5.637356040839398e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6289562582969666,
+ "num_tokens": 14679582.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02379363216459751,
+ "skip_count": 6.0,
+ "step": 9102,
+ "text_loss": 0.3395652770996094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 42.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 5.623087060084364e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14683438.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00344930961728096,
+ "skip_count": 4.0,
+ "step": 9104,
+ "text_loss": 0.4345538914203644
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 42.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 5.60883508464608e-05,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14686333.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005554547533392906,
+ "skip_count": 3.0,
+ "step": 9106,
+ "text_loss": 0.5202528238296509
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 5.594600119985932e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14690754.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004589532967656851,
+ "skip_count": 1.0,
+ "step": 9108,
+ "text_loss": 0.3040390610694885
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.77017904314646,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 5.580382171558784e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 14693793.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.029969461262226105,
+ "skip_count": 2.0,
+ "step": 9110,
+ "text_loss": 0.3644331693649292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 5.566181244812979e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14697290.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003387648146599531,
+ "skip_count": 0.0,
+ "step": 9112,
+ "text_loss": 0.5177932977676392
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 5.5519973451903404e-05,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14700597.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004790942650288343,
+ "skip_count": 1.0,
+ "step": 9114,
+ "text_loss": 0.2132686972618103
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 5.5378304781261715e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14703852.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007685191812925041,
+ "skip_count": 0.0,
+ "step": 9116,
+ "text_loss": 0.6690551042556763
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 5.523680649049234e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14707218.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0033531817607581615,
+ "skip_count": 0.0,
+ "step": 9118,
+ "text_loss": 0.26232191920280457
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.81714117992369,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 5.509547863381781e-05,
+ "loss": 0.0084,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 14710244.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.025616342201828957,
+ "skip_count": 0.0,
+ "step": 9120,
+ "text_loss": 0.2897983193397522
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 5.495432126539507e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14713495.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014400121290236712,
+ "skip_count": 0.0,
+ "step": 9122,
+ "text_loss": 0.4580271244049072
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 5.481333443931602e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14716703.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008548611658625305,
+ "skip_count": 0.0,
+ "step": 9124,
+ "text_loss": 0.5140601992607117
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.84531846199002,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 5.4672518209607e-05,
+ "loss": 0.0075,
+ "macro_f1": 0.9255813956260681,
+ "num_tokens": 14719443.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.02092800848186016,
+ "skip_count": 4.0,
+ "step": 9126,
+ "text_loss": 0.2842077314853668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 5.4531872630228965e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14722711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037711653858423233,
+ "skip_count": 0.0,
+ "step": 9128,
+ "text_loss": 0.3268158733844757
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 5.4391397755077784e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14725635.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005959369707852602,
+ "skip_count": 0.0,
+ "step": 9130,
+ "text_loss": 0.44725099205970764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0181884765625,
+ "learning_rate": 5.425109363798358e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14728945.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011272960109636188,
+ "skip_count": 0.0,
+ "step": 9132,
+ "text_loss": 0.45580998063087463
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0167236328125,
+ "learning_rate": 5.411096033271118e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14732271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015554855344817042,
+ "skip_count": 0.0,
+ "step": 9134,
+ "text_loss": 0.16767354309558868
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 5.3970997892959894e-05,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 14735462.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.007287262007594109,
+ "skip_count": 5.0,
+ "step": 9136,
+ "text_loss": 0.8925374746322632
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 5.383120637236366e-05,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14739288.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004336730111390352,
+ "skip_count": 0.0,
+ "step": 9138,
+ "text_loss": 0.29503148794174194
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 5.369158582449074e-05,
+ "loss": 0.0032,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14742058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004528806544840336,
+ "skip_count": 0.0,
+ "step": 9140,
+ "text_loss": 0.16937516629695892
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.92045788083358,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 5.3552136302844e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14745628.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005676734144799411,
+ "skip_count": 0.0,
+ "step": 9142,
+ "text_loss": 0.48764488101005554
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 5.3412857860860917e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14748482.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017468055011704564,
+ "skip_count": 0.0,
+ "step": 9144,
+ "text_loss": 0.46164339780807495
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.93924273554447,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 5.327375055191314e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 14751091.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007167307659983635,
+ "skip_count": 1.0,
+ "step": 9146,
+ "text_loss": 0.37566086649894714
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 5.3134814429306896e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14753850.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003801940008997917,
+ "skip_count": 2.0,
+ "step": 9148,
+ "text_loss": 0.17589576542377472
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 5.299604954628268e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14756779.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00396628538146615,
+ "skip_count": 1.0,
+ "step": 9150,
+ "text_loss": 0.4118746817111969
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 42.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 5.2857455956015544e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14759574.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.003950111567974091,
+ "skip_count": 0.0,
+ "step": 9152,
+ "text_loss": 0.5839328169822693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 5.271903371161479e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14762802.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006622051005251706,
+ "skip_count": 1.0,
+ "step": 9154,
+ "text_loss": 0.40162989497184753
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 5.2580782866124054e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14766136.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003140404587611556,
+ "skip_count": 0.0,
+ "step": 9156,
+ "text_loss": 0.2028028815984726
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 5.244270347252139e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14769306.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035792726557701826,
+ "skip_count": 1.0,
+ "step": 9158,
+ "text_loss": 0.5611430406570435
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 5.2304795583719034e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14771928.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007276696152985096,
+ "skip_count": 2.0,
+ "step": 9160,
+ "text_loss": 0.1382172554731369
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 5.2167059252563485e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14775047.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003121814923360944,
+ "skip_count": 0.0,
+ "step": 9162,
+ "text_loss": 0.6130381226539612
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 31.0,
+ "epoch": 43.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 5.2029494531835695e-05,
+ "loss": 0.0071,
+ "macro_f1": 1.0,
+ "num_tokens": 14777746.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.006029475014656782,
+ "skip_count": 1.0,
+ "step": 9164,
+ "text_loss": 0.5901363492012024
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 43.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 5.189210147425061e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14780813.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034428017679601908,
+ "skip_count": 5.0,
+ "step": 9166,
+ "text_loss": 0.5909968018531799
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 5.1754880132457494e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14785178.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025068193208426237,
+ "skip_count": 2.0,
+ "step": 9168,
+ "text_loss": 0.20257101953029633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.05165835045494,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 5.161783055904001e-05,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14788307.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003352245781570673,
+ "skip_count": 0.0,
+ "step": 9170,
+ "text_loss": 0.20024186372756958
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 43.061050777810394,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 5.1480952806515654e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14791053.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0009423785959370434,
+ "skip_count": 0.0,
+ "step": 9172,
+ "text_loss": 0.6944412589073181
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 5.13442469273363e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14794259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016676477389410138,
+ "skip_count": 0.0,
+ "step": 9174,
+ "text_loss": 0.10889370739459991
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 5.1207712973887875e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14797345.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005842766724526882,
+ "skip_count": 2.0,
+ "step": 9176,
+ "text_loss": 0.17763052880764008
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 5.107135099849042e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14800819.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004951528972014785,
+ "skip_count": 0.0,
+ "step": 9178,
+ "text_loss": 0.43891432881355286
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 5.093516105339818e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14803924.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031010014936327934,
+ "skip_count": 1.0,
+ "step": 9180,
+ "text_loss": 0.39177098870277405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 5.079914319079931e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14807083.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00047361713950522244,
+ "skip_count": 0.0,
+ "step": 9182,
+ "text_loss": 0.39144888520240784
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.117405341943055,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 5.066329746281617e-05,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14810263.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018734827172011137,
+ "skip_count": 0.0,
+ "step": 9184,
+ "text_loss": 0.531446099281311
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.126797769298506,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 5.052762392150506e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14813761.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00503428652882576,
+ "skip_count": 0.0,
+ "step": 9186,
+ "text_loss": 0.19398775696754456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 5.039212261885634e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14817708.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010842647170647979,
+ "skip_count": 0.0,
+ "step": 9188,
+ "text_loss": 0.5365647077560425
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 43.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0172119140625,
+ "learning_rate": 5.025679360679442e-05,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 14820912.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.004775309935212135,
+ "skip_count": 2.0,
+ "step": 9190,
+ "text_loss": 0.6473321318626404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 5.012163693717747e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14824115.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004022061824798584,
+ "skip_count": 0.0,
+ "step": 9192,
+ "text_loss": 0.24432586133480072
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 4.9986652661798025e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14827404.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00231996551156044,
+ "skip_count": 1.0,
+ "step": 9194,
+ "text_loss": 0.7459486722946167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.17375990607572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 4.98518408323822e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14830077.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000999651150777936,
+ "skip_count": 0.0,
+ "step": 9196,
+ "text_loss": 0.5136345624923706
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 4.971720150059012e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14833231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033226648811250925,
+ "skip_count": 2.0,
+ "step": 9198,
+ "text_loss": 0.1597593128681183
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.19254476078662,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 4.958273471801583e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14836534.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00400200579315424,
+ "skip_count": 0.0,
+ "step": 9200,
+ "text_loss": 0.16248664259910583
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 4.94484405361873e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14840301.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038636941462755203,
+ "skip_count": 0.0,
+ "step": 9202,
+ "text_loss": 0.20964740216732025
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 4.9314319006566296e-05,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14844094.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00593461561948061,
+ "skip_count": 2.0,
+ "step": 9204,
+ "text_loss": 0.43311986327171326
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0166015625,
+ "learning_rate": 4.918037018054844e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14847148.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007939442875795066,
+ "skip_count": 0.0,
+ "step": 9206,
+ "text_loss": 0.8805840015411377
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 4.904659410946311e-05,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 14851556.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0058822291903197765,
+ "skip_count": 4.0,
+ "step": 9208,
+ "text_loss": 0.2123873233795166
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 4.891299084457362e-05,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14855208.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024413811042904854,
+ "skip_count": 0.0,
+ "step": 9210,
+ "text_loss": 0.4408712685108185
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.248899324919286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 4.8779560437076983e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14858433.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007487752009183168,
+ "skip_count": 1.0,
+ "step": 9212,
+ "text_loss": 0.7417129874229431
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 4.864630293810401e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14861739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007972145453095436,
+ "skip_count": 2.0,
+ "step": 9214,
+ "text_loss": 0.3347324728965759
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 4.851321839871908e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14865220.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006238576490432024,
+ "skip_count": 1.0,
+ "step": 9216,
+ "text_loss": 0.49660998582839966
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 4.838030686992062e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14868179.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003592922119423747,
+ "skip_count": 0.0,
+ "step": 9218,
+ "text_loss": 0.316535621881485
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 43.28646903434106,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 4.824756840264055e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14870950.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012321153655648232,
+ "skip_count": 3.0,
+ "step": 9220,
+ "text_loss": 0.270915150642395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.295861461696504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 4.8115003047744466e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14873749.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008396002231165767,
+ "skip_count": 0.0,
+ "step": 9222,
+ "text_loss": 0.4190096855163574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0169677734375,
+ "learning_rate": 4.798261085603162e-05,
+ "loss": 0.0034,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14877349.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002983161248266697,
+ "skip_count": 1.0,
+ "step": 9224,
+ "text_loss": 0.8203139901161194
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 4.785039187823503e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14881192.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003951616585254669,
+ "skip_count": 2.0,
+ "step": 9226,
+ "text_loss": 0.36447709798812866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 4.771834616502119e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14884608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001604852732270956,
+ "skip_count": 0.0,
+ "step": 9228,
+ "text_loss": 0.733951985836029
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.333431171118285,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 4.758647376699032e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 14887963.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.041028670966625214,
+ "skip_count": 2.0,
+ "step": 9230,
+ "text_loss": 0.1800784021615982
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 4.7454774734676074e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14890769.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027380166575312614,
+ "skip_count": 0.0,
+ "step": 9232,
+ "text_loss": 0.6017972230911255
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.35221602582917,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 4.732324911854591e-05,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14894162.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018064725445583463,
+ "skip_count": 2.0,
+ "step": 9234,
+ "text_loss": 0.5853637456893921
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 43.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 4.7191896969000617e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14897248.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005479716695845127,
+ "skip_count": 0.0,
+ "step": 9236,
+ "text_loss": 0.6206526756286621
+ },
+ {
+ "acc_repeat": 0.75,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 43.371000880540066,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.8571428656578064,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 4.706071833637454e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.9446290731430054,
+ "num_tokens": 14900186.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.013435420580208302,
+ "skip_count": 3.0,
+ "step": 9238,
+ "text_loss": 0.46402135491371155
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 43.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 4.692971327093559e-05,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 14903080.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007366253528743982,
+ "skip_count": 4.0,
+ "step": 9240,
+ "text_loss": 0.6870771646499634
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 4.6798881822885276e-05,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 14906837.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004979560151696205,
+ "skip_count": 2.0,
+ "step": 9242,
+ "text_loss": 0.46396589279174805
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.3991781626064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 4.666822404235838e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14909541.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00023516178771387786,
+ "skip_count": 0.0,
+ "step": 9244,
+ "text_loss": 0.5960518717765808
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 43.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 4.6537739979423174e-05,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 14912820.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0014796241885051131,
+ "skip_count": 1.0,
+ "step": 9246,
+ "text_loss": 0.48075684905052185
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 4.640742968408146e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14916283.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001386807532981038,
+ "skip_count": 0.0,
+ "step": 9248,
+ "text_loss": 0.3950015902519226
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 43.427355444672735,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.037109375,
+ "learning_rate": 4.627729320626833e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 14918958.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.020335515961050987,
+ "skip_count": 4.0,
+ "step": 9250,
+ "text_loss": 0.6995832324028015
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 4.6147330595852354e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14921888.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005387732293456793,
+ "skip_count": 2.0,
+ "step": 9252,
+ "text_loss": 0.2771800756454468
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 4.601754190263552e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14925135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001703745685517788,
+ "skip_count": 1.0,
+ "step": 9254,
+ "text_loss": 0.7100088596343994
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 4.5887927176352875e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14929198.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0058114733546972275,
+ "skip_count": 2.0,
+ "step": 9256,
+ "text_loss": 0.21729083359241486
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 4.5758486466673244e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14932685.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026105218566954136,
+ "skip_count": 0.0,
+ "step": 9258,
+ "text_loss": 0.20695121586322784
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 4.5629219823198564e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14937901.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006947176996618509,
+ "skip_count": 2.0,
+ "step": 9260,
+ "text_loss": 0.15886647999286652
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 4.550012729546393e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14941406.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011366386897861958,
+ "skip_count": 0.0,
+ "step": 9262,
+ "text_loss": 0.49892309308052063
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 43.49310243616085,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 4.537120893293789e-05,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 14944200.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002686526160687208,
+ "skip_count": 1.0,
+ "step": 9264,
+ "text_loss": 0.6201852560043335
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 4.5242464785022256e-05,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14947592.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007816873257979751,
+ "skip_count": 0.0,
+ "step": 9266,
+ "text_loss": 0.49434536695480347
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 4.5113894901051944e-05,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14950382.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013167982688173652,
+ "skip_count": 0.0,
+ "step": 9268,
+ "text_loss": 0.696306586265564
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 43.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 4.498549933029511e-05,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14953424.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006240467075258493,
+ "skip_count": 3.0,
+ "step": 9270,
+ "text_loss": 0.14193731546401978
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.53067214558262,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 4.485727812195339e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14956937.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006212725769728422,
+ "skip_count": 2.0,
+ "step": 9272,
+ "text_loss": 0.40858668088912964
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 4.472923132516132e-05,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14960398.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003120801877230406,
+ "skip_count": 2.0,
+ "step": 9274,
+ "text_loss": 0.4740981459617615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 4.46013589889866e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14963037.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027343074325472116,
+ "skip_count": 0.0,
+ "step": 9276,
+ "text_loss": 0.1420614868402481
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 4.4473661162430176e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14965604.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006372901843860745,
+ "skip_count": 0.0,
+ "step": 9278,
+ "text_loss": 0.4628531336784363
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.5682418550044,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 4.4346137894426155e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14968803.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0062922025099396706,
+ "skip_count": 2.0,
+ "step": 9280,
+ "text_loss": 0.29813849925994873
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 4.421878923384159e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14972557.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006071912590414286,
+ "skip_count": 2.0,
+ "step": 9282,
+ "text_loss": 0.19581027328968048
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 43.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 4.40916152294768e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14975358.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001606325968168676,
+ "skip_count": 0.0,
+ "step": 9284,
+ "text_loss": 0.6929896473884583
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 4.3964615930065124e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14978045.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002845643786713481,
+ "skip_count": 1.0,
+ "step": 9286,
+ "text_loss": 0.49997636675834656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 4.3837791384272744e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14981606.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005257320590317249,
+ "skip_count": 1.0,
+ "step": 9288,
+ "text_loss": 0.3391074538230896
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.61520399178163,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 4.3711141640699395e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 14984404.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.02914038859307766,
+ "skip_count": 2.0,
+ "step": 9290,
+ "text_loss": 0.29165980219841003
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 4.3584666747877254e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14987280.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005831835325807333,
+ "skip_count": 1.0,
+ "step": 9292,
+ "text_loss": 0.5312305688858032
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 4.345836675427184e-05,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14990071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035566375590860844,
+ "skip_count": 0.0,
+ "step": 9294,
+ "text_loss": 0.25595441460609436
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 4.333224170828149e-05,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14993809.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026552488561719656,
+ "skip_count": 0.0,
+ "step": 9296,
+ "text_loss": 0.18538808822631836
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 43.65277370120341,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 4.3206291658237586e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14996794.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010047328658401966,
+ "skip_count": 4.0,
+ "step": 9298,
+ "text_loss": 0.37891554832458496
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 4.308051665240442e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15000911.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030308531131595373,
+ "skip_count": 0.0,
+ "step": 9300,
+ "text_loss": 0.20204831659793854
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 4.295491673897922e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15004106.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003695673542097211,
+ "skip_count": 1.0,
+ "step": 9302,
+ "text_loss": 0.84013831615448
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 4.282949196609215e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15007482.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000820459274109453,
+ "skip_count": 0.0,
+ "step": 9304,
+ "text_loss": 0.4521652162075043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 4.2704242381806144e-05,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15010579.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006170184817165136,
+ "skip_count": 1.0,
+ "step": 9306,
+ "text_loss": 0.22438007593154907
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 43.699735837980626,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 4.25791680341171e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 15013835.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021745599806308746,
+ "skip_count": 4.0,
+ "step": 9308,
+ "text_loss": 0.5847432613372803
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 4.245426897095372e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15017268.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022570823784917593,
+ "skip_count": 1.0,
+ "step": 9310,
+ "text_loss": 0.345931738615036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.71852069269152,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 4.232954524017763e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15020095.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009895693510770798,
+ "skip_count": 0.0,
+ "step": 9312,
+ "text_loss": 0.5374923944473267
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.72791312004696,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 4.220499688958307e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15022763.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005146807990968227,
+ "skip_count": 0.0,
+ "step": 9314,
+ "text_loss": 0.7208939790725708
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.73730554740241,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 4.208062396689738e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15025926.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00369556387886405,
+ "skip_count": 1.0,
+ "step": 9316,
+ "text_loss": 0.36686572432518005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 4.1956426519780435e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15029120.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00971714872866869,
+ "skip_count": 2.0,
+ "step": 9318,
+ "text_loss": 0.20697914063930511
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 43.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 4.183240459582488e-05,
+ "loss": 0.0036,
+ "macro_f1": 1.0,
+ "num_tokens": 15032000.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002361048012971878,
+ "skip_count": 1.0,
+ "step": 9320,
+ "text_loss": 0.6737313866615295
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 4.1708558242556207e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15034831.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001238204538822174,
+ "skip_count": 0.0,
+ "step": 9322,
+ "text_loss": 0.823642373085022
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 4.1584887507432556e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15037487.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005211949814110994,
+ "skip_count": 1.0,
+ "step": 9324,
+ "text_loss": 0.3821350634098053
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 4.146139243784475e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15040167.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007513152435421944,
+ "skip_count": 0.0,
+ "step": 9326,
+ "text_loss": 0.18124167621135712
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 4.133807308111637e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15043777.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029832208529114723,
+ "skip_count": 0.0,
+ "step": 9328,
+ "text_loss": 0.47313618659973145
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 4.1214929484503615e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15046622.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009155526757240295,
+ "skip_count": 1.0,
+ "step": 9330,
+ "text_loss": 0.20556017756462097
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 4.1091961695195304e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15049543.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003529169363901019,
+ "skip_count": 0.0,
+ "step": 9332,
+ "text_loss": 0.18752245604991913
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 4.0969169760313005e-05,
+ "loss": 0.0078,
+ "macro_f1": 1.0,
+ "num_tokens": 15052924.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002136822324246168,
+ "skip_count": 2.0,
+ "step": 9334,
+ "text_loss": 0.85563725233078
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053466796875,
+ "learning_rate": 4.084655372691076e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15056579.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003167972667142749,
+ "skip_count": 2.0,
+ "step": 9336,
+ "text_loss": 0.45709627866744995
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 43.8406222483123,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 4.07241136419752e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 15059739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03742539510130882,
+ "skip_count": 2.0,
+ "step": 9338,
+ "text_loss": 0.19531641900539398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 4.06018495524258e-05,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15062795.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002699678996577859,
+ "skip_count": 0.0,
+ "step": 9340,
+ "text_loss": 0.31032654643058777
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 4.047976150511423e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15066591.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026099481619894505,
+ "skip_count": 0.0,
+ "step": 9342,
+ "text_loss": 0.4676157832145691
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 4.035784954682486e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15069509.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006772278342396021,
+ "skip_count": 1.0,
+ "step": 9344,
+ "text_loss": 0.23385995626449585
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 43.878191957734074,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 4.0236113724274713e-05,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15072898.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0005968905170448124,
+ "skip_count": 0.0,
+ "step": 9346,
+ "text_loss": 0.6250094175338745
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.887584385089525,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 4.011455408411302e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15075547.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012884319759905338,
+ "skip_count": 2.0,
+ "step": 9348,
+ "text_loss": 0.23720405995845795
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 3.9993170672921794e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15078902.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018171088304370642,
+ "skip_count": 0.0,
+ "step": 9350,
+ "text_loss": 0.23975110054016113
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 43.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 3.9871963537215284e-05,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 15082292.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001974726328626275,
+ "skip_count": 1.0,
+ "step": 9352,
+ "text_loss": 0.354034423828125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 3.975093272344038e-05,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15085288.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014760299818590283,
+ "skip_count": 0.0,
+ "step": 9354,
+ "text_loss": 0.6398947834968567
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 43.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 3.963007827797627e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15089089.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004467889666557312,
+ "skip_count": 3.0,
+ "step": 9356,
+ "text_loss": 0.26422595977783203
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 3.950940024713462e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15092178.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0048953029327094555,
+ "skip_count": 1.0,
+ "step": 9358,
+ "text_loss": 0.7519236207008362
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 43.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 3.9388898677159446e-05,
+ "loss": 0.0065,
+ "macro_f1": 1.0,
+ "num_tokens": 15094825.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004229324869811535,
+ "skip_count": 1.0,
+ "step": 9360,
+ "text_loss": 0.522379457950592
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 43.95333137657764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 3.9268573614227146e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15098119.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028480603359639645,
+ "skip_count": 3.0,
+ "step": 9362,
+ "text_loss": 0.47443902492523193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 3.914842510444666e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15101362.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024998984299600124,
+ "skip_count": 1.0,
+ "step": 9364,
+ "text_loss": 0.6255060434341431
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0186767578125,
+ "learning_rate": 3.9028453193859006e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15104544.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008692052215337753,
+ "skip_count": 1.0,
+ "step": 9366,
+ "text_loss": 0.26974618434906006
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 3.890865792843768e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15107619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002779777627438307,
+ "skip_count": 2.0,
+ "step": 9368,
+ "text_loss": 0.4157184064388275
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 3.878903935408845e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15111352.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010220289696007967,
+ "skip_count": 0.0,
+ "step": 9370,
+ "text_loss": 0.5674155950546265
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 3.866959751664939e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15114088.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004387985449284315,
+ "skip_count": 1.0,
+ "step": 9372,
+ "text_loss": 0.3638002276420593
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.00939242735544,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 3.8550332461890824e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15117271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005855522467754781,
+ "skip_count": 0.0,
+ "step": 9374,
+ "text_loss": 0.6257871389389038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 3.843124423551536e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15119936.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026496360078454018,
+ "skip_count": 0.0,
+ "step": 9376,
+ "text_loss": 0.7019506096839905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 3.8312332883157774e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15123407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024072150699794292,
+ "skip_count": 0.0,
+ "step": 9378,
+ "text_loss": 0.45380696654319763
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 3.819359845038517e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15126742.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00031929166289046407,
+ "skip_count": 0.0,
+ "step": 9380,
+ "text_loss": 0.5322204828262329
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 3.807504098269682e-05,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15130854.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00177620945032686,
+ "skip_count": 0.0,
+ "step": 9382,
+ "text_loss": 0.5220870971679688
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 44.05635456413267,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 3.7956660525524156e-05,
+ "loss": 0.0071,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 15135054.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013358182273805141,
+ "skip_count": 2.0,
+ "step": 9384,
+ "text_loss": 0.39796701073646545
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 3.783845712423067e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15139179.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030253338627517223,
+ "skip_count": 0.0,
+ "step": 9386,
+ "text_loss": 0.13592341542243958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 3.772043082411236e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15142436.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008311813580803573,
+ "skip_count": 0.0,
+ "step": 9388,
+ "text_loss": 0.7804215550422668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 3.760258167039704e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15146071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012432600371539593,
+ "skip_count": 1.0,
+ "step": 9390,
+ "text_loss": 0.37692421674728394
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8571428656578064,
+ "avg_layers": 23.0,
+ "epoch": 44.09392427355445,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9230769276618958,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 3.748490970824464e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.9662289023399353,
+ "num_tokens": 15149020.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03158312290906906,
+ "skip_count": 7.0,
+ "step": 9392,
+ "text_loss": 0.6111845374107361
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0166015625,
+ "learning_rate": 3.7367414982747374e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15151887.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000898235070053488,
+ "skip_count": 0.0,
+ "step": 9394,
+ "text_loss": 0.42988476157188416
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 3.7250097538929384e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15155395.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024584042839705944,
+ "skip_count": 1.0,
+ "step": 9396,
+ "text_loss": 0.4083070456981659
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 3.713295742174694e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15158275.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012269694125279784,
+ "skip_count": 0.0,
+ "step": 9398,
+ "text_loss": 0.529385507106781
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 3.701599467608835e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15161533.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002610012423247099,
+ "skip_count": 1.0,
+ "step": 9400,
+ "text_loss": 0.1785552203655243
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 44.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 3.6899209346773986e-05,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15164799.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0012146600056439638,
+ "skip_count": 0.0,
+ "step": 9402,
+ "text_loss": 0.9209059476852417
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 3.678260147855628e-05,
+ "loss": 0.0028,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15168111.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001716976286843419,
+ "skip_count": 1.0,
+ "step": 9404,
+ "text_loss": 0.5762659907341003
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 3.6666171116119474e-05,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 15171285.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005656248424202204,
+ "skip_count": 2.0,
+ "step": 9406,
+ "text_loss": 0.3065127432346344
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0186767578125,
+ "learning_rate": 3.6549918304079946e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15174838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002362997969612479,
+ "skip_count": 2.0,
+ "step": 9408,
+ "text_loss": 0.5256759524345398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 3.643384308698594e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15177713.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002327109221369028,
+ "skip_count": 1.0,
+ "step": 9410,
+ "text_loss": 0.27613985538482666
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 44.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 3.6317945509317716e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15180863.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008501979522407055,
+ "skip_count": 0.0,
+ "step": 9412,
+ "text_loss": 0.3379829525947571
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 3.6202225615487525e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15184531.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004115676507353783,
+ "skip_count": 0.0,
+ "step": 9414,
+ "text_loss": 0.24313601851463318
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 3.6086683449839454e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15187699.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017425924306735396,
+ "skip_count": 0.0,
+ "step": 9416,
+ "text_loss": 0.47485142946243286
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 44.21602582917523,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 3.597131905664935e-05,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 15190528.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0031498887110501528,
+ "skip_count": 1.0,
+ "step": 9418,
+ "text_loss": 0.5356660485267639
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 3.585613248012515e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15194165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006833057850599289,
+ "skip_count": 1.0,
+ "step": 9420,
+ "text_loss": 0.21593274176120758
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 3.574112376440658e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15197612.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013788710348308086,
+ "skip_count": 1.0,
+ "step": 9422,
+ "text_loss": 0.5275097489356995
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 3.5626292953565175e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15201103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021296890918165445,
+ "skip_count": 0.0,
+ "step": 9424,
+ "text_loss": 0.3420610725879669
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 3.551164009160429e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15204007.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025281559210270643,
+ "skip_count": 0.0,
+ "step": 9426,
+ "text_loss": 0.4756413996219635
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 3.539716522245917e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15208066.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008577071712352335,
+ "skip_count": 0.0,
+ "step": 9428,
+ "text_loss": 0.7672523260116577
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 44.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 3.528286838999672e-05,
+ "loss": 0.0032,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15211118.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002977409167215228,
+ "skip_count": 0.0,
+ "step": 9430,
+ "text_loss": 0.5010796785354614
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 44.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 3.5168749638015806e-05,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15214245.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0009552660631015897,
+ "skip_count": 0.0,
+ "step": 9432,
+ "text_loss": 0.6633321642875671
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 3.505480901024677e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15217449.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005598205607384443,
+ "skip_count": 2.0,
+ "step": 9434,
+ "text_loss": 0.545702338218689
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 44.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 3.494104655035213e-05,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15220391.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0154950562864542,
+ "skip_count": 4.0,
+ "step": 9436,
+ "text_loss": 0.211164191365242
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 3.4827462301925735e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15224061.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001531782210804522,
+ "skip_count": 0.0,
+ "step": 9438,
+ "text_loss": 0.49369096755981445
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 3.471405630849328e-05,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15227586.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004152537789195776,
+ "skip_count": 1.0,
+ "step": 9440,
+ "text_loss": 0.1624782234430313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046875,
+ "learning_rate": 3.4600828613512156e-05,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15230713.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026113570202142,
+ "skip_count": 0.0,
+ "step": 9442,
+ "text_loss": 0.1921689808368683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 44.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 3.44877792603715e-05,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15233925.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008077848702669144,
+ "skip_count": 3.0,
+ "step": 9444,
+ "text_loss": 0.32417818903923035
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 3.437490829239193e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15236684.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005273211863823235,
+ "skip_count": 0.0,
+ "step": 9446,
+ "text_loss": 0.3497772812843323
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 3.4262215752825895e-05,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15239866.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015295564662665129,
+ "skip_count": 0.0,
+ "step": 9448,
+ "text_loss": 0.7613807320594788
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 44.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 3.414970168485737e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15243615.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0039047773461788893,
+ "skip_count": 0.0,
+ "step": 9450,
+ "text_loss": 0.3325706720352173
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.375697094217784,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 3.403736613160191e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 15246714.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0300968699157238,
+ "skip_count": 2.0,
+ "step": 9452,
+ "text_loss": 0.3441869020462036
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 44.385089521573235,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 3.392520913610681e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15249520.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037529836408793926,
+ "skip_count": 0.0,
+ "step": 9454,
+ "text_loss": 0.5083104968070984
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 3.381323074135073e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15252527.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019368440844118595,
+ "skip_count": 2.0,
+ "step": 9456,
+ "text_loss": 0.49744489789009094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 3.3701430990244085e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15255330.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033424650318920612,
+ "skip_count": 1.0,
+ "step": 9458,
+ "text_loss": 0.5603348016738892
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 3.35898099256286e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15257961.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006928095244802535,
+ "skip_count": 0.0,
+ "step": 9460,
+ "text_loss": 0.5270714163780212
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 3.347836759027789e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15261137.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030718250200152397,
+ "skip_count": 2.0,
+ "step": 9462,
+ "text_loss": 0.11651179939508438
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.43205165835045,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 3.33671040268968e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 15264234.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03508305177092552,
+ "skip_count": 2.0,
+ "step": 9464,
+ "text_loss": 0.14562347531318665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.441444085705896,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 3.3256019278121717e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 15267047.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008365205489099026,
+ "skip_count": 1.0,
+ "step": 9466,
+ "text_loss": 0.8550931215286255
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 3.3145113386520485e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15270442.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036910634953528643,
+ "skip_count": 0.0,
+ "step": 9468,
+ "text_loss": 0.24741731584072113
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 3.30343863945925e-05,
+ "loss": 0.0095,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15273845.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014966290909796953,
+ "skip_count": 0.0,
+ "step": 9470,
+ "text_loss": 0.5137372612953186
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 3.2923838344768534e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15277940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028104602824896574,
+ "skip_count": 0.0,
+ "step": 9472,
+ "text_loss": 0.5737728476524353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.47901379512768,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 3.281346927941087e-05,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15281640.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007870957255363464,
+ "skip_count": 2.0,
+ "step": 9474,
+ "text_loss": 0.27684518694877625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 3.270327924081301e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15284877.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006224945653229952,
+ "skip_count": 0.0,
+ "step": 9476,
+ "text_loss": 0.35599255561828613
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 3.259326827120013e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15287945.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001179040758870542,
+ "skip_count": 0.0,
+ "step": 9478,
+ "text_loss": 0.26802319288253784
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 3.2483436412728553e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15290754.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001992281526327133,
+ "skip_count": 0.0,
+ "step": 9480,
+ "text_loss": 0.40124714374542236
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 3.2373783707486057e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15294841.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012830843916162848,
+ "skip_count": 0.0,
+ "step": 9482,
+ "text_loss": 0.6739225387573242
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 3.226431019749171e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15298397.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003624147269874811,
+ "skip_count": 2.0,
+ "step": 9484,
+ "text_loss": 0.5250326991081238
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.016357421875,
+ "learning_rate": 3.2155015924696105e-05,
+ "loss": 0.0031,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15301499.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019682408310472965,
+ "skip_count": 0.0,
+ "step": 9486,
+ "text_loss": 0.5574567317962646
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 3.204590093098098e-05,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15304531.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002245094161480665,
+ "skip_count": 0.0,
+ "step": 9488,
+ "text_loss": 0.4065501093864441
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.55415321397123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 3.1936965258159366e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15307826.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002919224789366126,
+ "skip_count": 1.0,
+ "step": 9490,
+ "text_loss": 0.5183609127998352
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 3.1828208947975615e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15311420.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004961747210472822,
+ "skip_count": 1.0,
+ "step": 9492,
+ "text_loss": 0.1962234377861023
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.57293806868213,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 3.171963204210537e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15314196.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026044815313071012,
+ "skip_count": 0.0,
+ "step": 9494,
+ "text_loss": 0.223251610994339
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 3.161123458215553e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15317174.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029661289881914854,
+ "skip_count": 0.0,
+ "step": 9496,
+ "text_loss": 0.32970958948135376
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 3.150301660966415e-05,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15320343.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011696632718667388,
+ "skip_count": 0.0,
+ "step": 9498,
+ "text_loss": 0.8590811491012573
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 3.13949781661006e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15324138.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015035583637654781,
+ "skip_count": 0.0,
+ "step": 9500,
+ "text_loss": 0.6658036708831787
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 3.1287119292865375e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15328395.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001930502592585981,
+ "skip_count": 0.0,
+ "step": 9502,
+ "text_loss": 0.4104210138320923
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 3.117944003129025e-05,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15332196.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010025398805737495,
+ "skip_count": 0.0,
+ "step": 9504,
+ "text_loss": 0.7272399663925171
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 44.629292632814796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 3.107194042263806e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15335253.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004520092159509659,
+ "skip_count": 0.0,
+ "step": 9506,
+ "text_loss": 0.29173022508621216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 3.096462050810284e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15338129.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009707154240459204,
+ "skip_count": 0.0,
+ "step": 9508,
+ "text_loss": 0.6530287861824036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 3.0857480328809916e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15341487.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008689566748216748,
+ "skip_count": 0.0,
+ "step": 9510,
+ "text_loss": 0.36988505721092224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 3.0750519925815565e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15344460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022587007842957973,
+ "skip_count": 0.0,
+ "step": 9512,
+ "text_loss": 0.2447768598794937
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 3.064373934010711e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15348135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001986770424991846,
+ "skip_count": 0.0,
+ "step": 9514,
+ "text_loss": 0.43159469962120056
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 3.053713861260321e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15351073.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003514432755764574,
+ "skip_count": 0.0,
+ "step": 9516,
+ "text_loss": 0.3638324737548828
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.685647196947464,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 3.043071778415335e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15353633.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003395392093807459,
+ "skip_count": 0.0,
+ "step": 9518,
+ "text_loss": 0.5728140473365784
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 3.03244768955383e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15357322.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016641782131046057,
+ "skip_count": 0.0,
+ "step": 9520,
+ "text_loss": 0.666814386844635
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0177001953125,
+ "learning_rate": 3.021841598746966e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15360771.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024721708614379168,
+ "skip_count": 0.0,
+ "step": 9522,
+ "text_loss": 0.7148030400276184
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 3.01125351005902e-05,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15364281.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004133665468543768,
+ "skip_count": 0.0,
+ "step": 9524,
+ "text_loss": 0.2985752820968628
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 3.0006834275473737e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15367354.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003016186412423849,
+ "skip_count": 1.0,
+ "step": 9526,
+ "text_loss": 0.22689883410930634
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 44.73260933372468,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01531982421875,
+ "learning_rate": 2.9901313552624932e-05,
+ "loss": 0.003,
+ "macro_f1": 1.0,
+ "num_tokens": 15371027.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015333639457821846,
+ "skip_count": 7.0,
+ "step": 9528,
+ "text_loss": 0.8308720588684082
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 2.97959729724796e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15373948.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001420815708115697,
+ "skip_count": 0.0,
+ "step": 9530,
+ "text_loss": 0.5439777970314026
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 2.9690812575404456e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15377366.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007130459416657686,
+ "skip_count": 0.0,
+ "step": 9532,
+ "text_loss": 0.45405295491218567
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.76078661579102,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08349609375,
+ "learning_rate": 2.95858324016971e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 15380115.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04256885498762131,
+ "skip_count": 0.0,
+ "step": 9534,
+ "text_loss": 0.39998912811279297
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 44.77017904314646,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 2.9481032491586178e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15383205.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004944019019603729,
+ "skip_count": 4.0,
+ "step": 9536,
+ "text_loss": 0.1882237195968628
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 2.937641288523124e-05,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15386619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007820523343980312,
+ "skip_count": 1.0,
+ "step": 9538,
+ "text_loss": 0.26401394605636597
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 2.9271973622722603e-05,
+ "loss": 0.0026,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15389135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010751578956842422,
+ "skip_count": 0.0,
+ "step": 9540,
+ "text_loss": 0.39813846349716187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 2.9167714744081643e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15392150.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031554463785141706,
+ "skip_count": 2.0,
+ "step": 9542,
+ "text_loss": 0.669784665107727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 2.9063636289260677e-05,
+ "loss": 0.0037,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15394974.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00287301791831851,
+ "skip_count": 1.0,
+ "step": 9544,
+ "text_loss": 0.176493301987648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.81714117992369,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 2.8959738298142635e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15398432.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011708475649356842,
+ "skip_count": 0.0,
+ "step": 9546,
+ "text_loss": 0.8762983083724976
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 2.885602081054145e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15401121.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003167103510349989,
+ "skip_count": 1.0,
+ "step": 9548,
+ "text_loss": 0.2538717985153198
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 44.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 2.8752483866201885e-05,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 15404105.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007552143186330795,
+ "skip_count": 5.0,
+ "step": 9550,
+ "text_loss": 0.37045153975486755
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 2.8649127504799423e-05,
+ "loss": 0.0046,
+ "macro_f1": 1.0,
+ "num_tokens": 15407232.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007718692068010569,
+ "skip_count": 2.0,
+ "step": 9552,
+ "text_loss": 0.15780900418758392
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 2.8545951765940547e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15410425.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003527951193973422,
+ "skip_count": 0.0,
+ "step": 9554,
+ "text_loss": 0.5931823253631592
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 2.8442956689162193e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15413724.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00146177364513278,
+ "skip_count": 0.0,
+ "step": 9556,
+ "text_loss": 0.691118061542511
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 2.8340142313932448e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15416776.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010256811510771513,
+ "skip_count": 0.0,
+ "step": 9558,
+ "text_loss": 0.40814271569252014
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 2.823750867964997e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15419815.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0047921910881996155,
+ "skip_count": 0.0,
+ "step": 9560,
+ "text_loss": 0.28953713178634644
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 2.8135055825644072e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15422806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002010057680308819,
+ "skip_count": 1.0,
+ "step": 9562,
+ "text_loss": 0.8377944231033325
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 2.803278379117491e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15425405.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005009239539504051,
+ "skip_count": 1.0,
+ "step": 9564,
+ "text_loss": 0.5936337113380432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 2.793069261543335e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15428233.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007967893034219742,
+ "skip_count": 2.0,
+ "step": 9566,
+ "text_loss": 0.49891290068626404
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.92045788083358,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 2.7828782337540882e-05,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 15431095.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00638923142105341,
+ "skip_count": 4.0,
+ "step": 9568,
+ "text_loss": 0.30928006768226624
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 44.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 2.7727052996549763e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15434933.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0060427505522966385,
+ "skip_count": 3.0,
+ "step": 9570,
+ "text_loss": 0.21274788677692413
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.93924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 2.762550463144281e-05,
+ "loss": 0.0031,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15437655.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012480237055569887,
+ "skip_count": 0.0,
+ "step": 9572,
+ "text_loss": 0.31049492955207825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 2.7524137281133567e-05,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15440643.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005919245071709156,
+ "skip_count": 0.0,
+ "step": 9574,
+ "text_loss": 0.16459886729717255
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 2.7422950984466233e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15443532.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0061412835493683815,
+ "skip_count": 2.0,
+ "step": 9576,
+ "text_loss": 0.7102797031402588
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 2.7321945780215573e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15447027.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001149018993601203,
+ "skip_count": 0.0,
+ "step": 9578,
+ "text_loss": 0.22778025269508362
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 2.722112170708696e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15450173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002216014079749584,
+ "skip_count": 0.0,
+ "step": 9580,
+ "text_loss": 0.21447396278381348
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 2.7120478803716264e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15452838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00498749827966094,
+ "skip_count": 0.0,
+ "step": 9582,
+ "text_loss": 0.1664455235004425
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 2.7020017108670246e-05,
+ "loss": 0.0064,
+ "macro_f1": 1.0,
+ "num_tokens": 15455928.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005886784754693508,
+ "skip_count": 3.0,
+ "step": 9584,
+ "text_loss": 0.3929266631603241
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 2.691973666044589e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15459447.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029895263724029064,
+ "skip_count": 1.0,
+ "step": 9586,
+ "text_loss": 0.27535343170166016
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 2.681963749747085e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15462340.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038893253076821566,
+ "skip_count": 0.0,
+ "step": 9588,
+ "text_loss": 0.6950465440750122
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 2.671971965810338e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15465432.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016947018448263407,
+ "skip_count": 0.0,
+ "step": 9590,
+ "text_loss": 0.41451266407966614
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 2.6619983180632134e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15468300.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011597154662013054,
+ "skip_count": 0.0,
+ "step": 9592,
+ "text_loss": 0.5846080780029297
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 2.6520428103276316e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15471084.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005555236246436834,
+ "skip_count": 2.0,
+ "step": 9594,
+ "text_loss": 0.4151473939418793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.05165835045494,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 2.6421054464185633e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15474348.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015279205981642008,
+ "skip_count": 0.0,
+ "step": 9596,
+ "text_loss": 0.28742483258247375
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.061050777810394,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 2.6321862301440234e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15477493.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019169533625245094,
+ "skip_count": 0.0,
+ "step": 9598,
+ "text_loss": 0.338019460439682
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 2.6222851653050773e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15480257.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015131557593122125,
+ "skip_count": 1.0,
+ "step": 9600,
+ "text_loss": 0.5982558727264404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 2.612402255695828e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15482838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026768618263304234,
+ "skip_count": 0.0,
+ "step": 9602,
+ "text_loss": 0.32012176513671875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 2.6025375051034306e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15485746.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002152341417968273,
+ "skip_count": 0.0,
+ "step": 9604,
+ "text_loss": 0.16942192614078522
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 45.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 2.5926909173080658e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15488669.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003325721947476268,
+ "skip_count": 3.0,
+ "step": 9606,
+ "text_loss": 0.47950080037117004
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 2.582862496082977e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15491512.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023114588111639023,
+ "skip_count": 1.0,
+ "step": 9608,
+ "text_loss": 0.3907585144042969
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.117405341943055,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 2.5730522451944292e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15494479.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003140041371807456,
+ "skip_count": 2.0,
+ "step": 9610,
+ "text_loss": 0.198005810379982
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.126797769298506,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 2.5632601684017264e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15497900.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015117402654141188,
+ "skip_count": 0.0,
+ "step": 9612,
+ "text_loss": 0.874154269695282
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 2.5534862694572114e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15501817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00551232136785984,
+ "skip_count": 2.0,
+ "step": 9614,
+ "text_loss": 0.1933375597000122
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 2.543730552106266e-05,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15504872.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001090583624318242,
+ "skip_count": 0.0,
+ "step": 9616,
+ "text_loss": 0.4030717611312866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 2.533993020087294e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15507727.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007001800462603569,
+ "skip_count": 0.0,
+ "step": 9618,
+ "text_loss": 0.4812186062335968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 2.5242736771317333e-05,
+ "loss": 0.0025,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15510689.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016861478798091412,
+ "skip_count": 0.0,
+ "step": 9620,
+ "text_loss": 0.4578339457511902
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.17375990607572,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05517578125,
+ "learning_rate": 2.514572526964065e-05,
+ "loss": 0.0068,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 15513419.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.050852373242378235,
+ "skip_count": 3.0,
+ "step": 9622,
+ "text_loss": 0.4038950204849243
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 2.5048895733017772e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15516289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015001936117187142,
+ "skip_count": 0.0,
+ "step": 9624,
+ "text_loss": 0.8331962823867798
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.19254476078662,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 2.4952248198554073e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15519476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009114370332099497,
+ "skip_count": 1.0,
+ "step": 9626,
+ "text_loss": 0.4997985363006592
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017822265625,
+ "learning_rate": 2.4855782703284925e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15523363.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011186953634023666,
+ "skip_count": 0.0,
+ "step": 9628,
+ "text_loss": 0.2572024464607239
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 45.211329615497505,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 2.4759499284176145e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 15526289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019600817933678627,
+ "skip_count": 4.0,
+ "step": 9630,
+ "text_loss": 0.6323924660682678
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 45.22072204285295,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 2.466339797812378e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 15530260.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.02459629252552986,
+ "skip_count": 1.0,
+ "step": 9632,
+ "text_loss": 0.1824527233839035
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 45.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 2.4567478821954038e-05,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 15533916.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009077859111130238,
+ "skip_count": 2.0,
+ "step": 9634,
+ "text_loss": 0.4518069326877594
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 2.4471741852423235e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15536958.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002355317585170269,
+ "skip_count": 0.0,
+ "step": 9636,
+ "text_loss": 0.8873519897460938
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.248899324919286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 2.437618710621803e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15540544.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001198371173813939,
+ "skip_count": 0.0,
+ "step": 9638,
+ "text_loss": 0.4845949709415436
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 2.4280814619955128e-05,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15543355.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009287866414524615,
+ "skip_count": 0.0,
+ "step": 9640,
+ "text_loss": 0.5979563593864441
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 2.4185624430181464e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15547215.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028763876762241125,
+ "skip_count": 0.0,
+ "step": 9642,
+ "text_loss": 0.16279318928718567
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 2.4090616573374135e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15550412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013361044693738222,
+ "skip_count": 0.0,
+ "step": 9644,
+ "text_loss": 0.2864333987236023
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 45.28646903434106,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 2.3995791085940244e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15553660.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0019316677935421467,
+ "skip_count": 0.0,
+ "step": 9646,
+ "text_loss": 0.6333117485046387
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.295861461696504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 2.390114800421722e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15556287.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011288017267361283,
+ "skip_count": 1.0,
+ "step": 9648,
+ "text_loss": 0.6050677299499512
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 2.380668736447239e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15559246.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014249378582462668,
+ "skip_count": 0.0,
+ "step": 9650,
+ "text_loss": 0.9484158754348755
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 45.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 2.371240920290324e-05,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 15562251.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00741320988163352,
+ "skip_count": 4.0,
+ "step": 9652,
+ "text_loss": 0.24387991428375244
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 2.361831355563726e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15565704.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.000942508690059185,
+ "skip_count": 0.0,
+ "step": 9654,
+ "text_loss": 0.6523539423942566
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 45.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 2.352440045873233e-05,
+ "loss": 0.0091,
+ "macro_f1": 1.0,
+ "num_tokens": 15568797.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0064352210611104965,
+ "skip_count": 4.0,
+ "step": 9656,
+ "text_loss": 0.3206343650817871
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 2.3430669948175943e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15571855.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013390982057899237,
+ "skip_count": 0.0,
+ "step": 9658,
+ "text_loss": 0.8397402763366699
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.35221602582917,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 2.3337122059885806e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15575379.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012212366564199328,
+ "skip_count": 0.0,
+ "step": 9660,
+ "text_loss": 0.5116108655929565
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 2.324375682970975e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15578108.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003829900873824954,
+ "skip_count": 0.0,
+ "step": 9662,
+ "text_loss": 0.1423535794019699
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 45.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 2.3150574293425376e-05,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 15581830.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012756838463246822,
+ "skip_count": 1.0,
+ "step": 9664,
+ "text_loss": 0.24676625430583954
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 2.3057574486740507e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15584872.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020642473828047514,
+ "skip_count": 0.0,
+ "step": 9666,
+ "text_loss": 0.4851650893688202
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 2.2964757445292806e-05,
+ "loss": 0.0029,
+ "macro_f1": 1.0,
+ "num_tokens": 15588000.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.007441115565598011,
+ "skip_count": 3.0,
+ "step": 9668,
+ "text_loss": 0.6416954398155212
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.3991781626064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017333984375,
+ "learning_rate": 2.287212320464993e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15591065.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015504831681028008,
+ "skip_count": 0.0,
+ "step": 9670,
+ "text_loss": 0.5852687358856201
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 45.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 2.2779671800309433e-05,
+ "loss": 0.0046,
+ "macro_f1": 1.0,
+ "num_tokens": 15594631.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005648284684866667,
+ "skip_count": 2.0,
+ "step": 9672,
+ "text_loss": 0.7172279357910156
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 2.2687403267699024e-05,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 15598664.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003756999270990491,
+ "skip_count": 2.0,
+ "step": 9674,
+ "text_loss": 0.18986566364765167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.427355444672735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 2.259531764217604e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15601616.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002155672525987029,
+ "skip_count": 0.0,
+ "step": 9676,
+ "text_loss": 0.4410690367221832
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 2.250341495902797e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15604291.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0020037787035107613,
+ "skip_count": 0.0,
+ "step": 9678,
+ "text_loss": 0.5565816164016724
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 2.241169525347203e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15607203.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014305647928267717,
+ "skip_count": 0.0,
+ "step": 9680,
+ "text_loss": 0.4879189729690552
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 2.2320158560655447e-05,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 15610475.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016029199585318565,
+ "skip_count": 3.0,
+ "step": 9682,
+ "text_loss": 0.36342933773994446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 2.2228804915655153e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15613810.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023584216833114624,
+ "skip_count": 0.0,
+ "step": 9684,
+ "text_loss": 0.18480375409126282
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 2.2137634353478043e-05,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15617854.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004325680434703827,
+ "skip_count": 1.0,
+ "step": 9686,
+ "text_loss": 0.5345974564552307
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 45.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 2.2046646909060996e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15620874.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.006946994923055172,
+ "skip_count": 0.0,
+ "step": 9688,
+ "text_loss": 0.29016008973121643
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.49310243616085,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 2.195584261727046e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15623875.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034732038620859385,
+ "skip_count": 1.0,
+ "step": 9690,
+ "text_loss": 0.2831312119960785
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 2.1865221512902766e-05,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15626371.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002495788736268878,
+ "skip_count": 1.0,
+ "step": 9692,
+ "text_loss": 0.6090453267097473
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 45.511887290871734,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 2.1774783630684246e-05,
+ "loss": 0.0076,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 15630129.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.017551302909851074,
+ "skip_count": 1.0,
+ "step": 9694,
+ "text_loss": 0.5127915740013123
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 2.168452900527068e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15633179.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004413482965901494,
+ "skip_count": 0.0,
+ "step": 9696,
+ "text_loss": 0.5901434421539307
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.53067214558262,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 2.159445767124796e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15636508.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005992567166686058,
+ "skip_count": 1.0,
+ "step": 9698,
+ "text_loss": 0.8493689298629761
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 2.1504569663131523e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15639371.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0033268092665821314,
+ "skip_count": 0.0,
+ "step": 9700,
+ "text_loss": 0.2814267873764038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 2.1414865015366548e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15643025.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004418607335537672,
+ "skip_count": 0.0,
+ "step": 9702,
+ "text_loss": 0.2619725167751312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 45.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 2.1325343762328197e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15646996.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0050115580670535564,
+ "skip_count": 4.0,
+ "step": 9704,
+ "text_loss": 0.8204038143157959
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.5682418550044,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 2.123600593832109e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15650194.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018730501178652048,
+ "skip_count": 1.0,
+ "step": 9706,
+ "text_loss": 0.694500744342804
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 2.1146851577579673e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15653743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016657712403684855,
+ "skip_count": 0.0,
+ "step": 9708,
+ "text_loss": 0.8211735486984253
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 2.1057880714268064e-05,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15657325.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029736643191426992,
+ "skip_count": 0.0,
+ "step": 9710,
+ "text_loss": 0.2846751809120178
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 45.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 2.0969093382479987e-05,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 15660522.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01233653537929058,
+ "skip_count": 4.0,
+ "step": 9712,
+ "text_loss": 0.23991759121418
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 2.0880489616239062e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15663254.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012792183551937342,
+ "skip_count": 0.0,
+ "step": 9714,
+ "text_loss": 0.6943771243095398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.61520399178163,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 2.0792069449498297e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15666283.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033134319819509983,
+ "skip_count": 0.0,
+ "step": 9716,
+ "text_loss": 0.4161235988140106
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 45.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 2.0703832916140476e-05,
+ "loss": 0.0034,
+ "macro_f1": 1.0,
+ "num_tokens": 15669774.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006201022770255804,
+ "skip_count": 1.0,
+ "step": 9718,
+ "text_loss": 0.42691144347190857
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 2.061578004997805e-05,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15672943.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033355073537677526,
+ "skip_count": 1.0,
+ "step": 9720,
+ "text_loss": 0.9724727869033813
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 2.0527910884753033e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15677847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019593657925724983,
+ "skip_count": 0.0,
+ "step": 9722,
+ "text_loss": 0.417218416929245
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.65277370120341,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 2.0440225454137097e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15681460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007862947881221771,
+ "skip_count": 2.0,
+ "step": 9724,
+ "text_loss": 0.24983589351177216
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 2.0352723791731364e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15685496.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004811233840882778,
+ "skip_count": 0.0,
+ "step": 9726,
+ "text_loss": 0.32930606603622437
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.8571428656578064,
+ "avg_layers": 22.0,
+ "epoch": 45.671558555914295,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.9230769276618958,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 2.0265405931066626e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.633273720741272,
+ "num_tokens": 15688661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02648334763944149,
+ "skip_count": 7.0,
+ "step": 9728,
+ "text_loss": 0.42316386103630066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 45.68095098326974,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 2.0178271905603395e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 15692778.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04439396783709526,
+ "skip_count": 3.0,
+ "step": 9730,
+ "text_loss": 0.32248371839523315
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 2.0091321748731517e-05,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15695821.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020437403582036495,
+ "skip_count": 2.0,
+ "step": 9732,
+ "text_loss": 0.5959160923957825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.699735837980626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 2.000455549377045e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15699324.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002844796108547598,
+ "skip_count": 0.0,
+ "step": 9734,
+ "text_loss": 0.45465928316116333
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 1.9917973173969204e-05,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15702044.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003548701060935855,
+ "skip_count": 0.0,
+ "step": 9736,
+ "text_loss": 0.7129027843475342
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 23.0,
+ "epoch": 45.71852069269152,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 1.9831574822506248e-05,
+ "loss": 0.0089,
+ "macro_f1": 0.6289562582969666,
+ "num_tokens": 15705474.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023800918832421303,
+ "skip_count": 6.0,
+ "step": 9738,
+ "text_loss": 0.28479668498039246
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.72791312004696,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 1.9745360472489648e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15708323.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01043168269097805,
+ "skip_count": 2.0,
+ "step": 9740,
+ "text_loss": 0.4760739803314209
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.73730554740241,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 1.9659330156956867e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15711390.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006430295296013355,
+ "skip_count": 2.0,
+ "step": 9742,
+ "text_loss": 0.13933971524238586
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 45.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 1.957348390887487e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15714077.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005738302133977413,
+ "skip_count": 3.0,
+ "step": 9744,
+ "text_loss": 0.49661460518836975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 1.948782176114017e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15716818.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011776578612625599,
+ "skip_count": 0.0,
+ "step": 9746,
+ "text_loss": 0.36066678166389465
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 1.9402343746578567e-05,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15720756.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005322427023202181,
+ "skip_count": 0.0,
+ "step": 9748,
+ "text_loss": 0.5549091696739197
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 1.931704989794547e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15724516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001399765140376985,
+ "skip_count": 0.0,
+ "step": 9750,
+ "text_loss": 0.21269696950912476
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 1.9231940247925572e-05,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15727142.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018337799701839685,
+ "skip_count": 1.0,
+ "step": 9752,
+ "text_loss": 0.18105024099349976
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 1.914701482913317e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15730023.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010057559702545404,
+ "skip_count": 0.0,
+ "step": 9754,
+ "text_loss": 0.477859228849411
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 45.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 1.906227367411173e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15733108.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002486895304173231,
+ "skip_count": 3.0,
+ "step": 9756,
+ "text_loss": 0.4802452027797699
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 1.8977716815334335e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15736130.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004353616386651993,
+ "skip_count": 0.0,
+ "step": 9758,
+ "text_loss": 0.5479429960250854
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 1.8893344285203228e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15738691.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031500225886702538,
+ "skip_count": 1.0,
+ "step": 9760,
+ "text_loss": 0.6871381402015686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 1.8809156116050164e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15741682.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023419202771037817,
+ "skip_count": 0.0,
+ "step": 9762,
+ "text_loss": 0.6725277900695801
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.8406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 1.8725152340136163e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15745314.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018769606249406934,
+ "skip_count": 0.0,
+ "step": 9764,
+ "text_loss": 0.4549144506454468
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 1.864133298965176e-05,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 15747982.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030958254355937243,
+ "skip_count": 2.0,
+ "step": 9766,
+ "text_loss": 0.4970727264881134
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 1.8557698096716534e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15750453.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020812496077269316,
+ "skip_count": 1.0,
+ "step": 9768,
+ "text_loss": 0.7540801167488098
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 1.847424769337963e-05,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15753857.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031040434259921312,
+ "skip_count": 0.0,
+ "step": 9770,
+ "text_loss": 0.5154248476028442
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.878191957734074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 1.8390981811619356e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15756742.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002128311200067401,
+ "skip_count": 0.0,
+ "step": 9772,
+ "text_loss": 0.7327702045440674
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.887584385089525,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 1.8307900483343354e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15759833.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003279880853369832,
+ "skip_count": 1.0,
+ "step": 9774,
+ "text_loss": 0.2673797607421875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 1.8225003740388545e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15762768.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004170822445303202,
+ "skip_count": 0.0,
+ "step": 9776,
+ "text_loss": 0.1820847988128662
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8888888955116272,
+ "avg_layers": 21.0,
+ "epoch": 45.90636923980041,
+ "f1_execute": 0.9729729890823364,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9411765336990356,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 1.8142291614521132e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.9713832139968872,
+ "num_tokens": 15766965.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.022715313360095024,
+ "skip_count": 9.0,
+ "step": 9778,
+ "text_loss": 0.5590897798538208
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 1.8059764137436596e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15770199.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007280370220541954,
+ "skip_count": 1.0,
+ "step": 9780,
+ "text_loss": 0.28117987513542175
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 1.7977421340759582e-05,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15773367.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003529706271365285,
+ "skip_count": 0.0,
+ "step": 9782,
+ "text_loss": 0.18752245604991913
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 1.7895263256044013e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15776976.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025916248559951782,
+ "skip_count": 1.0,
+ "step": 9784,
+ "text_loss": 0.6330561637878418
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 1.781328991477299e-05,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15780848.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0049234069883823395,
+ "skip_count": 1.0,
+ "step": 9786,
+ "text_loss": 0.15685316920280457
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 45.95333137657764,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 1.7731501348358882e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 15783808.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.011918511241674423,
+ "skip_count": 1.0,
+ "step": 9788,
+ "text_loss": 0.23963648080825806
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 1.7649897588143226e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15787421.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018508053617551923,
+ "skip_count": 0.0,
+ "step": 9790,
+ "text_loss": 0.49311593174934387
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 1.7568478665396736e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15790274.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006157457246445119,
+ "skip_count": 0.0,
+ "step": 9792,
+ "text_loss": 0.4567435085773468
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 45.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 1.7487244611319285e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15794462.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.0031584864482283592,
+ "skip_count": 0.0,
+ "step": 9794,
+ "text_loss": 0.4325876832008362
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 1.740619545703992e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15797775.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028455168940126896,
+ "skip_count": 0.0,
+ "step": 9796,
+ "text_loss": 0.1487245261669159
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 46.0,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 1.7325331233616847e-05,
+ "loss": 0.0078,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 15801092.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02560117095708847,
+ "skip_count": 4.0,
+ "step": 9798,
+ "text_loss": 0.5299228429794312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 46.00939242735544,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 1.7244651972037284e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 15804049.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010446238331496716,
+ "skip_count": 3.0,
+ "step": 9800,
+ "text_loss": 0.6591248512268066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 1.7164157703217886e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15807683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017791346181184053,
+ "skip_count": 0.0,
+ "step": 9802,
+ "text_loss": 0.45421653985977173
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 1.7083848458004035e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15810743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008831496234051883,
+ "skip_count": 0.0,
+ "step": 9804,
+ "text_loss": 0.5535439848899841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 1.7003724267170394e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15813880.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002800740534439683,
+ "skip_count": 0.0,
+ "step": 9806,
+ "text_loss": 0.5228974223136902
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 46.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 1.6923785161420845e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15816808.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006823428440839052,
+ "skip_count": 3.0,
+ "step": 9808,
+ "text_loss": 0.48018959164619446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 1.6844031171388052e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15819803.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004808149300515652,
+ "skip_count": 0.0,
+ "step": 9810,
+ "text_loss": 0.31094294786453247
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 1.6764462327633955e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15822861.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026099751703441143,
+ "skip_count": 0.0,
+ "step": 9812,
+ "text_loss": 0.5534207224845886
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 1.668507866064939e-05,
+ "loss": 0.0072,
+ "macro_f1": 1.0,
+ "num_tokens": 15825960.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008356450125575066,
+ "skip_count": 2.0,
+ "step": 9814,
+ "text_loss": 0.40162262320518494
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 1.660588020085452e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15828906.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006548966746777296,
+ "skip_count": 2.0,
+ "step": 9816,
+ "text_loss": 0.2071811705827713
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.09392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 1.652686697859823e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15831935.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007895465241745114,
+ "skip_count": 0.0,
+ "step": 9818,
+ "text_loss": 0.6879562735557556
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 1.6448039024158534e-05,
+ "loss": 0.0037,
+ "macro_f1": 1.0,
+ "num_tokens": 15835745.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00370208453387022,
+ "skip_count": 2.0,
+ "step": 9820,
+ "text_loss": 0.6139163970947266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 1.6369396367742483e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15838373.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002627170644700527,
+ "skip_count": 0.0,
+ "step": 9822,
+ "text_loss": 0.3881947100162506
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.018798828125,
+ "learning_rate": 1.6290939039486084e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15841156.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005191941745579243,
+ "skip_count": 2.0,
+ "step": 9824,
+ "text_loss": 0.6564247608184814
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 46.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 1.621266706945429e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15843877.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003889352548867464,
+ "skip_count": 0.0,
+ "step": 9826,
+ "text_loss": 0.7128682136535645
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 46.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 1.6134580487641047e-05,
+ "loss": 0.0031,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15846880.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00674893194809556,
+ "skip_count": 4.0,
+ "step": 9828,
+ "text_loss": 0.30893367528915405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 1.6056679323969425e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15850130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009898045100271702,
+ "skip_count": 0.0,
+ "step": 9830,
+ "text_loss": 0.6550688743591309
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 46.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 1.5978963608291154e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15853578.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0046016750857234,
+ "skip_count": 0.0,
+ "step": 9832,
+ "text_loss": 0.43872204422950745
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 1.5901433370387132e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15857939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004589201882481575,
+ "skip_count": 1.0,
+ "step": 9834,
+ "text_loss": 0.41940808296203613
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 1.5824088639967094e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15860584.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018899316200986505,
+ "skip_count": 1.0,
+ "step": 9836,
+ "text_loss": 0.5105440616607666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 1.5746929446669556e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15864386.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006366848247125745,
+ "skip_count": 0.0,
+ "step": 9838,
+ "text_loss": 0.5686481595039368
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.017333984375,
+ "learning_rate": 1.5669955820062254e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15869103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0043256948702037334,
+ "skip_count": 1.0,
+ "step": 9840,
+ "text_loss": 0.16309607028961182
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 1.5593167789641483e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15872384.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00406000716611743,
+ "skip_count": 1.0,
+ "step": 9842,
+ "text_loss": 0.21662485599517822
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 46.21602582917523,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 1.551656538483259e-05,
+ "loss": 0.0076,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 15875261.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.020087692886590958,
+ "skip_count": 2.0,
+ "step": 9844,
+ "text_loss": 0.6189377903938293
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 1.5440148634989826e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15878132.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005302145145833492,
+ "skip_count": 0.0,
+ "step": 9846,
+ "text_loss": 0.34496018290519714
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 1.536391756939609e-05,
+ "loss": 0.0091,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15881381.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008405420929193497,
+ "skip_count": 2.0,
+ "step": 9848,
+ "text_loss": 0.2865080237388611
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 1.528787221726341e-05,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15884621.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016017532907426357,
+ "skip_count": 0.0,
+ "step": 9850,
+ "text_loss": 0.6104921102523804
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 1.5212012607732528e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15888157.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015318389050662518,
+ "skip_count": 0.0,
+ "step": 9852,
+ "text_loss": 0.2622036933898926
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 1.5136338769872915e-05,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 15891080.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006494096480309963,
+ "skip_count": 4.0,
+ "step": 9854,
+ "text_loss": 0.23415961861610413
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 1.5060850732682928e-05,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 15895486.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.007511078380048275,
+ "skip_count": 3.0,
+ "step": 9856,
+ "text_loss": 0.7389219999313354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 1.4985548525089709e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15898747.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004874013364315033,
+ "skip_count": 2.0,
+ "step": 9858,
+ "text_loss": 0.6853085160255432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 1.4910432175949285e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15902157.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009244410903193057,
+ "skip_count": 0.0,
+ "step": 9860,
+ "text_loss": 0.8172202110290527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 46.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 1.4835501714046296e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15905012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00456853536888957,
+ "skip_count": 3.0,
+ "step": 9862,
+ "text_loss": 0.7527797818183899
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 1.4760757168094275e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15908302.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009686833946034312,
+ "skip_count": 0.0,
+ "step": 9864,
+ "text_loss": 0.5548131465911865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 1.4686198566735531e-05,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15911923.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008255072170868516,
+ "skip_count": 0.0,
+ "step": 9866,
+ "text_loss": 0.5995872020721436
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 1.4611825938540935e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15914858.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002459712326526642,
+ "skip_count": 0.0,
+ "step": 9868,
+ "text_loss": 0.6777655482292175
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017578125,
+ "learning_rate": 1.4537639312010298e-05,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15918091.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014664786867797375,
+ "skip_count": 0.0,
+ "step": 9870,
+ "text_loss": 0.42750120162963867
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 46.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 1.4463638715572103e-05,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 15920943.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005549794062972069,
+ "skip_count": 1.0,
+ "step": 9872,
+ "text_loss": 0.27477580308914185
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 1.4389824177583388e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15924212.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007967505604028702,
+ "skip_count": 2.0,
+ "step": 9874,
+ "text_loss": 0.3174900412559509
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 1.4316195726330139e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15929143.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014913028571754694,
+ "skip_count": 2.0,
+ "step": 9876,
+ "text_loss": 0.40919792652130127
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 1.4242753390026953e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15931702.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003994424478150904,
+ "skip_count": 0.0,
+ "step": 9878,
+ "text_loss": 0.35346853733062744
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.385089521573235,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 1.4169497196816983e-05,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 15935225.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008424114435911179,
+ "skip_count": 3.0,
+ "step": 9880,
+ "text_loss": 0.230825275182724
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 1.4096427174772164e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15938630.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004314251709729433,
+ "skip_count": 1.0,
+ "step": 9882,
+ "text_loss": 0.8749642968177795
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 1.4023543351893043e-05,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15941779.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008999531855806708,
+ "skip_count": 0.0,
+ "step": 9884,
+ "text_loss": 0.6549318432807922
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 1.3950845756108943e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15944779.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010829231468960643,
+ "skip_count": 0.0,
+ "step": 9886,
+ "text_loss": 0.5681273341178894
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 1.3878334415277583e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15947757.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038863453082740307,
+ "skip_count": 1.0,
+ "step": 9888,
+ "text_loss": 0.4282133877277374
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 46.43205165835045,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017822265625,
+ "learning_rate": 1.3806009357185512e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15952223.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0006428947090171278,
+ "skip_count": 0.0,
+ "step": 9890,
+ "text_loss": 0.4455379247665405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.441444085705896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 1.3733870609547838e-05,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15955968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00048406270798295736,
+ "skip_count": 0.0,
+ "step": 9892,
+ "text_loss": 0.37554407119750977
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 1.3661918200008228e-05,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15959376.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004503594245761633,
+ "skip_count": 1.0,
+ "step": 9894,
+ "text_loss": 0.22027169167995453
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 1.3590152156139012e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15962882.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011738749453797936,
+ "skip_count": 0.0,
+ "step": 9896,
+ "text_loss": 0.4203954041004181
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 1.3518572505440973e-05,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 15965816.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00806320272386074,
+ "skip_count": 2.0,
+ "step": 9898,
+ "text_loss": 0.18884631991386414
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.47901379512768,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 1.3447179275343779e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15968840.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004962162580341101,
+ "skip_count": 1.0,
+ "step": 9900,
+ "text_loss": 0.22457796335220337
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 1.3375972493205268e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15972768.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025535912718623877,
+ "skip_count": 0.0,
+ "step": 9902,
+ "text_loss": 0.14859545230865479
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 1.3304952186312114e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15975380.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002036662772297859,
+ "skip_count": 0.0,
+ "step": 9904,
+ "text_loss": 0.5820382833480835
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 1.3234118381879378e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15978335.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0055219330824911594,
+ "skip_count": 2.0,
+ "step": 9906,
+ "text_loss": 0.29671815037727356
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 1.316347110705074e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15982003.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005196230486035347,
+ "skip_count": 0.0,
+ "step": 9908,
+ "text_loss": 0.5204919576644897
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 1.3093010388898319e-05,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 15984937.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0032779101748019457,
+ "skip_count": 2.0,
+ "step": 9910,
+ "text_loss": 0.6803483366966248
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 1.3022736254422851e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15988992.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002347869798541069,
+ "skip_count": 0.0,
+ "step": 9912,
+ "text_loss": 0.5335546731948853
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 1.2952648730553462e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15992828.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011128517799079418,
+ "skip_count": 0.0,
+ "step": 9914,
+ "text_loss": 0.686739981174469
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.55415321397123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 1.288274784414789e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15995984.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031158174388110638,
+ "skip_count": 0.0,
+ "step": 9916,
+ "text_loss": 0.16102474927902222
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.059814453125,
+ "learning_rate": 1.2813033621992264e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15999606.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029228583443909883,
+ "skip_count": 1.0,
+ "step": 9918,
+ "text_loss": 0.6022558212280273
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.57293806868213,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 1.274350609080116e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16002456.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031404250767081976,
+ "skip_count": 2.0,
+ "step": 9920,
+ "text_loss": 0.7529577016830444
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 1.2674165277217653e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16005547.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038669302593916655,
+ "skip_count": 0.0,
+ "step": 9922,
+ "text_loss": 0.47488540410995483
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 1.2605011207813378e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16009520.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004838052671402693,
+ "skip_count": 0.0,
+ "step": 9924,
+ "text_loss": 0.5252779722213745
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 46.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 1.2536043909088191e-05,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16012730.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017430823063477874,
+ "skip_count": 0.0,
+ "step": 9926,
+ "text_loss": 0.40845534205436707
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0167236328125,
+ "learning_rate": 1.2467263407470619e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16015940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010244545992463827,
+ "skip_count": 0.0,
+ "step": 9928,
+ "text_loss": 0.8465730547904968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 1.2398669729317357e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16018851.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007380630704574287,
+ "skip_count": 0.0,
+ "step": 9930,
+ "text_loss": 0.37603214383125305
+ },
+ {
+ "acc_repeat": 0.800000011920929,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.629292632814796,
+ "f1_execute": 0.9729729890823364,
+ "f1_repeat": 0.888888955116272,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 1.2330262900913657e-05,
+ "loss": 0.0087,
+ "macro_f1": 0.9539539813995361,
+ "num_tokens": 16022351.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.053848277777433395,
+ "skip_count": 5.0,
+ "step": 9932,
+ "text_loss": 0.2047014981508255
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 46.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 1.2262042948473163e-05,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16024902.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0020845322869718075,
+ "skip_count": 0.0,
+ "step": 9934,
+ "text_loss": 0.6269918084144592
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 1.2194009898137903e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16028056.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008686805376783013,
+ "skip_count": 0.0,
+ "step": 9936,
+ "text_loss": 0.4100899398326874
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 46.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 1.212616377597825e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16032111.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004883588291704655,
+ "skip_count": 3.0,
+ "step": 9938,
+ "text_loss": 0.3921346664428711
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 1.2058504607993015e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16035872.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005067490856163204,
+ "skip_count": 0.0,
+ "step": 9940,
+ "text_loss": 0.44368258118629456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06787109375,
+ "learning_rate": 1.1991032420109238e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16038923.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005819452460855246,
+ "skip_count": 2.0,
+ "step": 9942,
+ "text_loss": 0.27500197291374207
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.685647196947464,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 1.1923747238182403e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 16041803.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.035794492810964584,
+ "skip_count": 3.0,
+ "step": 9944,
+ "text_loss": 0.5083543062210083
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 1.1856649087996384e-05,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 16045258.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002845201175659895,
+ "skip_count": 2.0,
+ "step": 9946,
+ "text_loss": 0.6859534382820129
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 1.1789737995263228e-05,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16048618.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007575460476800799,
+ "skip_count": 0.0,
+ "step": 9948,
+ "text_loss": 0.4512535333633423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 1.1723013985623477e-05,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16051595.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002697878750041127,
+ "skip_count": 1.0,
+ "step": 9950,
+ "text_loss": 0.3572070300579071
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 1.16564770846459e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16054494.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0062429774552583694,
+ "skip_count": 1.0,
+ "step": 9952,
+ "text_loss": 0.5479834079742432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.73260933372468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 1.1590127317827492e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16057555.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009302232647314668,
+ "skip_count": 0.0,
+ "step": 9954,
+ "text_loss": 0.44800761342048645
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 1.1523964710593637e-05,
+ "loss": 0.0032,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16061072.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002112898975610733,
+ "skip_count": 0.0,
+ "step": 9956,
+ "text_loss": 0.3274081349372864
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 1.1457989288297942e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16064165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00028447998920455575,
+ "skip_count": 0.0,
+ "step": 9958,
+ "text_loss": 0.5712385773658752
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 1.1392201076222352e-05,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 16067293.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009599249809980392,
+ "skip_count": 2.0,
+ "step": 9960,
+ "text_loss": 0.26818037033081055
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.77017904314646,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055908203125,
+ "learning_rate": 1.132660009957709e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16069852.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005338563583791256,
+ "skip_count": 0.0,
+ "step": 9962,
+ "text_loss": 0.6658869981765747
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 1.1261186383500487e-05,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16072633.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001175224082544446,
+ "skip_count": 1.0,
+ "step": 9964,
+ "text_loss": 0.4461731016635895
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 46.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 1.1195959953059221e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16076065.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0036650802940130234,
+ "skip_count": 0.0,
+ "step": 9966,
+ "text_loss": 0.6107141971588135
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 1.113092083324818e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16079309.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005924097262322903,
+ "skip_count": 2.0,
+ "step": 9968,
+ "text_loss": 0.5104627013206482
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 46.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 1.1066069048990545e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16082180.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.010777595452964306,
+ "skip_count": 0.0,
+ "step": 9970,
+ "text_loss": 0.5205907225608826
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.81714117992369,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056640625,
+ "learning_rate": 1.100140462513749e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16084654.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019593914039433002,
+ "skip_count": 0.0,
+ "step": 9972,
+ "text_loss": 0.36411789059638977
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 46.82653360727913,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 1.0936927586468693e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 16087736.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0233579371124506,
+ "skip_count": 4.0,
+ "step": 9974,
+ "text_loss": 0.267604261636734
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 1.0872637957691833e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16090838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00034629934816621244,
+ "skip_count": 0.0,
+ "step": 9976,
+ "text_loss": 0.576068103313446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 1.0808535763442761e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16094084.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004253332444932312,
+ "skip_count": 0.0,
+ "step": 9978,
+ "text_loss": 0.5883988738059998
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 1.0744621028285662e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16097432.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005800648941658437,
+ "skip_count": 0.0,
+ "step": 9980,
+ "text_loss": 0.3358926475048065
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 46.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 1.068089377671272e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16100711.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015245937975123525,
+ "skip_count": 0.0,
+ "step": 9982,
+ "text_loss": 0.6802405714988708
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 1.061735403314429e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16103952.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002281307242810726,
+ "skip_count": 1.0,
+ "step": 9984,
+ "text_loss": 0.3086298406124115
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 1.055400182192906e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16107101.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007910717977210879,
+ "skip_count": 0.0,
+ "step": 9986,
+ "text_loss": 0.7036139965057373
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 46.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 1.0490837167343559e-05,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 16110316.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030006880406290293,
+ "skip_count": 1.0,
+ "step": 9988,
+ "text_loss": 0.4638058841228485
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 1.04278600935927e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16113206.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006434856331907213,
+ "skip_count": 0.0,
+ "step": 9990,
+ "text_loss": 0.6155068874359131
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052734375,
+ "learning_rate": 1.0365070624809403e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16116098.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007891099085099995,
+ "skip_count": 0.0,
+ "step": 9992,
+ "text_loss": 0.4537872076034546
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 46.92045788083358,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 1.0302468785054641e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 16119344.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.011918486095964909,
+ "skip_count": 1.0,
+ "step": 9994,
+ "text_loss": 0.18828579783439636
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 1.0240054598317672e-05,
+ "loss": 0.0046,
+ "macro_f1": 1.0,
+ "num_tokens": 16122615.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016306765377521515,
+ "skip_count": 2.0,
+ "step": 9996,
+ "text_loss": 0.2876183092594147
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.93924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 1.0177828088515694e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16125506.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00393108231946826,
+ "skip_count": 1.0,
+ "step": 9998,
+ "text_loss": 0.6387818455696106
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 1.011578927949397e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16128499.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001175055862404406,
+ "skip_count": 0.0,
+ "step": 10000,
+ "text_loss": 0.4085952639579773
+ }
+ ],
+ "logging_steps": 2,
+ "max_steps": 10650,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 50,
+ "save_steps": 1000,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.738398356854296e+17,
+ "train_batch_size": 1,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-10000/training_args.bin b/checkpoint-10000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a3d3ae372faf14539639f54454aa52b6ee730c4a
--- /dev/null
+++ b/checkpoint-10000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8
+size 5880
diff --git a/checkpoint-10650/chat_template.jinja b/checkpoint-10650/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0
--- /dev/null
+++ b/checkpoint-10650/chat_template.jinja
@@ -0,0 +1,93 @@
+{{- bos_token }}
+{%- if custom_tools is defined %}
+ {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+ {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+ {%- if strftime_now is defined %}
+ {%- set date_string = strftime_now("%d %b %Y") %}
+ {%- else %}
+ {%- set date_string = "26 Jul 2024" %}
+ {%- endif %}
+{%- endif %}
+{%- if not tools is defined %}
+ {%- set tools = none %}
+{%- endif %}
+
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+ {%- set system_message = messages[0]['content']|trim %}
+ {%- set messages = messages[1:] %}
+{%- else %}
+ {%- set system_message = "" %}
+{%- endif %}
+
+{#- System message #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if tools is not none %}
+ {{- "Environment: ipython\n" }}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+ {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+ {{- "Do not use variables.\n\n" }}
+ {%- for t in tools %}
+ {{- t | tojson(indent=4) }}
+ {{- "\n\n" }}
+ {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+ {#- Extract the first user message so we can plug it in here #}
+ {%- if messages | length != 0 %}
+ {%- set first_user_message = messages[0]['content']|trim %}
+ {%- set messages = messages[1:] %}
+ {%- else %}
+ {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+ {{- "Given the following functions, please respond with a JSON for a function call " }}
+ {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+ {{- "Do not use variables.\n\n" }}
+ {%- for t in tools %}
+ {{- t | tojson(indent=4) }}
+ {{- "\n\n" }}
+ {%- endfor %}
+ {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+
+{%- for message in messages %}
+ {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+ {%- elif 'tool_calls' in message %}
+ {%- if not message.tool_calls|length == 1 %}
+ {{- raise_exception("This model only supports single tool-calls at once!") }}
+ {%- endif %}
+ {%- set tool_call = message.tool_calls[0].function %}
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+ {{- '{"name": "' + tool_call.name + '", ' }}
+ {{- '"parameters": ' }}
+ {{- tool_call.arguments | tojson }}
+ {{- "}" }}
+ {{- "<|eot_id|>" }}
+ {%- elif message.role == "tool" or message.role == "ipython" %}
+ {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+ {%- if message.content is mapping or message.content is iterable %}
+ {{- message.content | tojson }}
+ {%- else %}
+ {{- message.content }}
+ {%- endif %}
+ {{- "<|eot_id|>" }}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}
diff --git a/checkpoint-10650/config.json b/checkpoint-10650/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3552bd1c531626bd125241ad5dfcd7fb677462cd
--- /dev/null
+++ b/checkpoint-10650/config.json
@@ -0,0 +1,39 @@
+{
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 128000,
+ "eos_token_id": [
+ 128001,
+ 128008,
+ 128009
+ ],
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 3072,
+ "initializer_range": 0.02,
+ "intermediate_size": 8192,
+ "max_position_embeddings": 131072,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 8,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "factor": 32.0,
+ "high_freq_factor": 4.0,
+ "low_freq_factor": 1.0,
+ "original_max_position_embeddings": 8192,
+ "rope_type": "llama3"
+ },
+ "rope_theta": 500000.0,
+ "tie_word_embeddings": true,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.55.2",
+ "use_cache": true,
+ "vocab_size": 128256
+}
diff --git a/checkpoint-10650/generation_config.json b/checkpoint-10650/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b513e54e3195b917260c9a8a04c9f3683f19de35
--- /dev/null
+++ b/checkpoint-10650/generation_config.json
@@ -0,0 +1,12 @@
+{
+ "bos_token_id": 128000,
+ "do_sample": true,
+ "eos_token_id": [
+ 128001,
+ 128008,
+ 128009
+ ],
+ "temperature": 0.6,
+ "top_p": 0.9,
+ "transformers_version": "4.55.2"
+}
diff --git a/checkpoint-10650/model-00001-of-00002.safetensors b/checkpoint-10650/model-00001-of-00002.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..08a01e1ba553cdcb2222f034a209861d7b54e284
--- /dev/null
+++ b/checkpoint-10650/model-00001-of-00002.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13cbd6d16e927a0c5bad54102514e6e18b4a47b3a6eb911e39d678d328d19f55
+size 4965799096
diff --git a/checkpoint-10650/model-00002-of-00002.safetensors b/checkpoint-10650/model-00002-of-00002.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ee28c0dc703eb09f36a601d56c971edb4d4406e3
--- /dev/null
+++ b/checkpoint-10650/model-00002-of-00002.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdb777b656b02eca5ed71a1eec21997465d360e38b5311d83b9de66d34fc2ff9
+size 1481790520
diff --git a/checkpoint-10650/model.safetensors.index.json b/checkpoint-10650/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..21bb567761d75ade0c0eef6495c450697dd3ff18
--- /dev/null
+++ b/checkpoint-10650/model.safetensors.index.json
@@ -0,0 +1,374 @@
+{
+ "metadata": {
+ "total_parameters": 3223774292,
+ "total_size": 6447548584
+ },
+ "weight_map": {
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.norm.weight": "model-00002-of-00002.safetensors",
+ "model.routers.0.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.0.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.0.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.0.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.1.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.1.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.1.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.1.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.10.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.10.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.10.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.10.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.11.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.11.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.11.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.11.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.12.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.12.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.12.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.12.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.13.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.13.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.13.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.13.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.14.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.14.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.14.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.14.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.15.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.15.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.15.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.15.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.16.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.16.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.16.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.16.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.17.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.17.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.17.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.17.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.18.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.18.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.18.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.18.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.19.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.19.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.19.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.19.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.2.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.2.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.2.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.2.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.20.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.20.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.20.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.20.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.21.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.21.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.21.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.21.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.22.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.22.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.22.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.22.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.23.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.23.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.23.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.23.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.24.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.24.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.24.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.24.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.25.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.25.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.25.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.25.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.26.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.26.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.26.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.26.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.27.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.27.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.27.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.27.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.3.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.3.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.3.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.3.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.4.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.4.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.4.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.4.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.5.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.5.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.5.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.5.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.6.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.6.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.6.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.6.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.7.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.7.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.7.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.7.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.8.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.8.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.8.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.8.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.9.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.9.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.9.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.9.linear2.weight": "model-00002-of-00002.safetensors"
+ }
+}
diff --git a/checkpoint-10650/optimizer.pt b/checkpoint-10650/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..03fc5bdc59b880ca8ba9833a5e2f8651d4e107f8
--- /dev/null
+++ b/checkpoint-10650/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0dd7a38e033ea8dacacb991c36d2d46a2a9f889893d9c26efeebbe35465e69be
+size 44191162
diff --git a/checkpoint-10650/rng_state.pth b/checkpoint-10650/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..cc552cae08c3af1de204610a293370696f1faaaa
--- /dev/null
+++ b/checkpoint-10650/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ed7c9f18c0606d7eb2f3d6cfe5e71f033f5f69b5a6aa170ad2ff926625abd40
+size 14244
diff --git a/checkpoint-10650/scheduler.pt b/checkpoint-10650/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..83c64bebb094a313f6a10e493f9f55c8793e86c7
--- /dev/null
+++ b/checkpoint-10650/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5faea1e19b92c75b4859c50baf8e943951af0ec0dc6f6201e9523b77f93deb7
+size 1064
diff --git a/checkpoint-10650/special_tokens_map.json b/checkpoint-10650/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..165b36bc2293dda9a2fb3c0daf6577d9eba9df7a
--- /dev/null
+++ b/checkpoint-10650/special_tokens_map.json
@@ -0,0 +1,17 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<|finetune_right_pad_id|>"
+}
diff --git a/checkpoint-10650/tokenizer.json b/checkpoint-10650/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-10650/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-10650/tokenizer_config.json b/checkpoint-10650/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c68051fe3c4d23234a59316bc52d21f6e3a4182c
--- /dev/null
+++ b/checkpoint-10650/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|finetune_right_pad_id|>",
+ "tokenizer_class": "PreTrainedTokenizerFast"
+}
diff --git a/checkpoint-10650/trainer_state.json b/checkpoint-10650/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3d8c0f461c7295ab49302e1afeca1ee6024cd459
--- /dev/null
+++ b/checkpoint-10650/trainer_state.json
@@ -0,0 +1,101209 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 50.0,
+ "eval_steps": 500,
+ "global_step": 10650,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 25.0,
+ "epoch": 0.009392427355444672,
+ "f1_execute": 0.6976743936538696,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 2.25,
+ "learning_rate": 2e-06,
+ "loss": 0.4974,
+ "macro_f1": 0.23255813121795654,
+ "num_tokens": 3175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.4339469373226166,
+ "skip_count": 0.0,
+ "step": 2,
+ "text_loss": 0.3330848515033722
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 23.0,
+ "epoch": 0.018784854710889344,
+ "f1_execute": 0.7272726893424988,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.8359375,
+ "learning_rate": 6e-06,
+ "loss": 0.4988,
+ "macro_f1": 0.24242423474788666,
+ "num_tokens": 5816.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.4511934816837311,
+ "skip_count": 1.0,
+ "step": 4,
+ "text_loss": 0.4571273922920227
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.02817728206633402,
+ "f1_execute": 0.6666666865348816,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 2.234375,
+ "learning_rate": 1e-05,
+ "loss": 0.5113,
+ "macro_f1": 0.222222238779068,
+ "num_tokens": 9739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.49306994676589966,
+ "skip_count": 0.0,
+ "step": 6,
+ "text_loss": 0.41060560941696167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.03756970942177869,
+ "f1_execute": 0.5641025900840759,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.7265625,
+ "learning_rate": 1.4e-05,
+ "loss": 0.4766,
+ "macro_f1": 0.18803420662879944,
+ "num_tokens": 12869.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.48872503638267517,
+ "skip_count": 2.0,
+ "step": 8,
+ "text_loss": 0.36678561568260193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.046962136777223364,
+ "f1_execute": 0.6976743936538696,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.78125,
+ "learning_rate": 1.8e-05,
+ "loss": 0.4806,
+ "macro_f1": 0.23255813121795654,
+ "num_tokens": 15845.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.45077216625213623,
+ "skip_count": 0.0,
+ "step": 10,
+ "text_loss": 0.5597779154777527
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 26.0,
+ "epoch": 0.05635456413266804,
+ "f1_execute": 0.7179487347602844,
+ "f1_repeat": 0.2857142984867096,
+ "f1_skip": 0.20000000298023224,
+ "grad_norm": 1.5390625,
+ "learning_rate": 2.2e-05,
+ "loss": 0.4557,
+ "macro_f1": 0.40122103691101074,
+ "num_tokens": 19353.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.4130440056324005,
+ "skip_count": 3.0,
+ "step": 12,
+ "text_loss": 0.2056603729724884
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.06574699148811271,
+ "f1_execute": 0.6976743936538696,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 2.4375,
+ "learning_rate": 2.6e-05,
+ "loss": 0.5129,
+ "macro_f1": 0.23255813121795654,
+ "num_tokens": 22675.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.4582902193069458,
+ "skip_count": 0.0,
+ "step": 14,
+ "text_loss": 0.32989829778671265
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 0.07513941884355738,
+ "f1_execute": 0.6829268336296082,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.2222222238779068,
+ "grad_norm": 1.7421875,
+ "learning_rate": 3e-05,
+ "loss": 0.4729,
+ "macro_f1": 0.3017163574695587,
+ "num_tokens": 26022.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.42910993099212646,
+ "skip_count": 1.0,
+ "step": 16,
+ "text_loss": 0.1353905349969864
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.08453184619900206,
+ "f1_execute": 0.7555555105209351,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.4765625,
+ "learning_rate": 3.4000000000000007e-05,
+ "loss": 0.4274,
+ "macro_f1": 0.2518518567085266,
+ "num_tokens": 29251.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.3990713059902191,
+ "skip_count": 0.0,
+ "step": 18,
+ "text_loss": 0.3806765377521515
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 26.0,
+ "epoch": 0.09392427355444673,
+ "f1_execute": 0.6829268336296082,
+ "f1_repeat": 0.2857142984867096,
+ "f1_skip": 0.0,
+ "grad_norm": 1.3125,
+ "learning_rate": 3.8e-05,
+ "loss": 0.4261,
+ "macro_f1": 0.3228803873062134,
+ "num_tokens": 32545.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.40146592259407043,
+ "skip_count": 0.0,
+ "step": 20,
+ "text_loss": 0.25648367404937744
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 26.0,
+ "epoch": 0.1033167009098914,
+ "f1_execute": 0.7272727489471436,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.625,
+ "learning_rate": 4.2000000000000004e-05,
+ "loss": 0.404,
+ "macro_f1": 0.24242424964904785,
+ "num_tokens": 36560.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.372715026140213,
+ "skip_count": 0.0,
+ "step": 22,
+ "text_loss": 0.2799522578716278
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.11270912826533608,
+ "f1_execute": 0.7555555105209351,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.6328125,
+ "learning_rate": 4.6e-05,
+ "loss": 0.4218,
+ "macro_f1": 0.2518518567085266,
+ "num_tokens": 39597.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.4504941403865814,
+ "skip_count": 0.0,
+ "step": 24,
+ "text_loss": 0.6635695695877075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.12210155562078075,
+ "f1_execute": 0.8085106015205383,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.7109375,
+ "learning_rate": 5e-05,
+ "loss": 0.3886,
+ "macro_f1": 0.26950353384017944,
+ "num_tokens": 43080.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.3498791456222534,
+ "skip_count": 0.0,
+ "step": 26,
+ "text_loss": 0.7035041451454163
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.13149398297622542,
+ "f1_execute": 0.8085106015205383,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.34375,
+ "learning_rate": 5.4e-05,
+ "loss": 0.3724,
+ "macro_f1": 0.26950353384017944,
+ "num_tokens": 46406.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.31265875697135925,
+ "skip_count": 0.0,
+ "step": 28,
+ "text_loss": 0.6388277411460876
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.1408864103316701,
+ "f1_execute": 0.8571428060531616,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.2578125,
+ "learning_rate": 5.800000000000001e-05,
+ "loss": 0.341,
+ "macro_f1": 0.2857142686843872,
+ "num_tokens": 49966.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.3200918138027191,
+ "skip_count": 2.0,
+ "step": 30,
+ "text_loss": 0.17372547090053558
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 25.0,
+ "epoch": 0.15027883768711475,
+ "f1_execute": 0.8571428060531616,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.4140625,
+ "learning_rate": 6.2e-05,
+ "loss": 0.3207,
+ "macro_f1": 0.2857142686843872,
+ "num_tokens": 53378.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.32304447889328003,
+ "skip_count": 1.0,
+ "step": 32,
+ "text_loss": 0.18196581304073334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 25.0,
+ "epoch": 0.15967126504255943,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.46875,
+ "learning_rate": 6.6e-05,
+ "loss": 0.3304,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 56933.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.24814388155937195,
+ "skip_count": 0.0,
+ "step": 34,
+ "text_loss": 0.28823015093803406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 26.0,
+ "epoch": 0.16906369239800412,
+ "f1_execute": 0.9019607901573181,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.1171875,
+ "learning_rate": 7.000000000000001e-05,
+ "loss": 0.2778,
+ "macro_f1": 0.3006536066532135,
+ "num_tokens": 60744.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.22411039471626282,
+ "skip_count": 0.0,
+ "step": 36,
+ "text_loss": 0.5260357856750488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.17845611975344877,
+ "f1_execute": 0.8571428656578064,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.484375,
+ "learning_rate": 7.4e-05,
+ "loss": 0.2738,
+ "macro_f1": 0.2857142984867096,
+ "num_tokens": 64900.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.44355395436286926,
+ "skip_count": 0.0,
+ "step": 38,
+ "text_loss": 0.5382097363471985
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 26.0,
+ "epoch": 0.18784854710889345,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.3828125,
+ "learning_rate": 7.8e-05,
+ "loss": 0.2137,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 68000.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.202330082654953,
+ "skip_count": 0.0,
+ "step": 40,
+ "text_loss": 0.5946118831634521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 25.0,
+ "epoch": 0.19724097446433814,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.78125,
+ "learning_rate": 8.2e-05,
+ "loss": 0.21,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 70529.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.18023855984210968,
+ "skip_count": 0.0,
+ "step": 42,
+ "text_loss": 0.5550904273986816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.2066334018197828,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.609375,
+ "learning_rate": 8.599999999999999e-05,
+ "loss": 0.1918,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 73427.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.2101590931415558,
+ "skip_count": 0.0,
+ "step": 44,
+ "text_loss": 0.4636923372745514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.21602582917522747,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.53125,
+ "learning_rate": 8.999999999999999e-05,
+ "loss": 0.1881,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 76472.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.11800424009561539,
+ "skip_count": 0.0,
+ "step": 46,
+ "text_loss": 0.4187001883983612
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.22541825653067216,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.953125,
+ "learning_rate": 9.400000000000001e-05,
+ "loss": 0.1446,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 79124.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11632519960403442,
+ "skip_count": 0.0,
+ "step": 48,
+ "text_loss": 0.2253919243812561
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 0.2348106838861168,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.58984375,
+ "learning_rate": 9.800000000000001e-05,
+ "loss": 0.1543,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 81980.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09669367223978043,
+ "skip_count": 0.0,
+ "step": 50,
+ "text_loss": 0.6053179502487183
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 26.0,
+ "epoch": 0.2442031112415615,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.8515625,
+ "learning_rate": 0.000102,
+ "loss": 0.1393,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 85236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.12471720576286316,
+ "skip_count": 0.0,
+ "step": 52,
+ "text_loss": 0.6027331948280334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.2535955385970062,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.421875,
+ "learning_rate": 0.000106,
+ "loss": 0.1473,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 88238.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.1376056969165802,
+ "skip_count": 2.0,
+ "step": 54,
+ "text_loss": 0.2861751616001129
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.26298796595245083,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.35546875,
+ "learning_rate": 0.00011,
+ "loss": 0.1082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 91056.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07449393719434738,
+ "skip_count": 0.0,
+ "step": 56,
+ "text_loss": 0.48106974363327026
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 26.0,
+ "epoch": 0.2723803933078955,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.271484375,
+ "learning_rate": 0.000114,
+ "loss": 0.1123,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 94987.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07064720243215561,
+ "skip_count": 0.0,
+ "step": 58,
+ "text_loss": 0.3554874658584595
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.2817728206633402,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.5390625,
+ "learning_rate": 0.000118,
+ "loss": 0.1234,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 97909.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.16835889220237732,
+ "skip_count": 2.0,
+ "step": 60,
+ "text_loss": 0.5475804805755615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.29116524801878485,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2353515625,
+ "learning_rate": 0.000122,
+ "loss": 0.1224,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 101043.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06127442046999931,
+ "skip_count": 0.0,
+ "step": 62,
+ "text_loss": 0.5966938734054565
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.3005576753742295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.212890625,
+ "learning_rate": 0.000126,
+ "loss": 0.0931,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 104103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.047825805842876434,
+ "skip_count": 0.0,
+ "step": 64,
+ "text_loss": 0.5480486750602722
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.3099501027296742,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.00013000000000000002,
+ "loss": 0.1088,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 107009.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.275174081325531,
+ "skip_count": 4.0,
+ "step": 66,
+ "text_loss": 0.41714492440223694
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.31934253008511887,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1923828125,
+ "learning_rate": 0.000134,
+ "loss": 0.1123,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 110486.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.029025178402662277,
+ "skip_count": 0.0,
+ "step": 68,
+ "text_loss": 0.6775627732276917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.3287349574405635,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.314453125,
+ "learning_rate": 0.00013800000000000002,
+ "loss": 0.1049,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 113878.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.10141710191965103,
+ "skip_count": 1.0,
+ "step": 70,
+ "text_loss": 0.6678873896598816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.33812738479600823,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.248046875,
+ "learning_rate": 0.00014199999999999998,
+ "loss": 0.1119,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 116989.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08002066612243652,
+ "skip_count": 1.0,
+ "step": 72,
+ "text_loss": 0.405692994594574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.3475198121514529,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1787109375,
+ "learning_rate": 0.000146,
+ "loss": 0.0944,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 119883.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.1867009848356247,
+ "skip_count": 3.0,
+ "step": 74,
+ "text_loss": 0.44616150856018066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.35691223950689754,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.333984375,
+ "learning_rate": 0.00015,
+ "loss": 0.1003,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 123325.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07042168825864792,
+ "skip_count": 2.0,
+ "step": 76,
+ "text_loss": 0.11340200901031494
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.36630466686234225,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.26171875,
+ "learning_rate": 0.000154,
+ "loss": 0.1066,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 126131.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.11535373330116272,
+ "skip_count": 2.0,
+ "step": 78,
+ "text_loss": 0.3269135355949402
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.3756970942177869,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.255859375,
+ "learning_rate": 0.000158,
+ "loss": 0.0891,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 130349.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09497501701116562,
+ "skip_count": 1.0,
+ "step": 80,
+ "text_loss": 0.15273472666740417
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.38508952157323156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 0.000162,
+ "loss": 0.0929,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 133607.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030639523640275,
+ "skip_count": 0.0,
+ "step": 82,
+ "text_loss": 0.282884806394577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.3944819489286763,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
+ "learning_rate": 0.00016600000000000002,
+ "loss": 0.1254,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 136694.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07906441390514374,
+ "skip_count": 1.0,
+ "step": 84,
+ "text_loss": 0.459094375371933
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.40387437628412093,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.212890625,
+ "learning_rate": 0.00017,
+ "loss": 0.1071,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 139966.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.1124570444226265,
+ "skip_count": 2.0,
+ "step": 86,
+ "text_loss": 0.29985448718070984
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.4132668036395656,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25390625,
+ "learning_rate": 0.000174,
+ "loss": 0.1031,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 142788.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.1966402679681778,
+ "skip_count": 0.0,
+ "step": 88,
+ "text_loss": 0.6435291767120361
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.4226592309950103,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.349609375,
+ "learning_rate": 0.000178,
+ "loss": 0.0963,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 146192.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0325632207095623,
+ "skip_count": 0.0,
+ "step": 90,
+ "text_loss": 0.35170626640319824
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.43205165835045495,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2265625,
+ "learning_rate": 0.000182,
+ "loss": 0.1073,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 149792.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.15115146338939667,
+ "skip_count": 1.0,
+ "step": 92,
+ "text_loss": 0.83159339427948
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.4414440857058996,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.205078125,
+ "learning_rate": 0.000186,
+ "loss": 0.1073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 152766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.043313540518283844,
+ "skip_count": 0.0,
+ "step": 94,
+ "text_loss": 0.49707934260368347
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.4508365130613443,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.00019,
+ "loss": 0.0947,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 156112.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.032021280378103256,
+ "skip_count": 0.0,
+ "step": 96,
+ "text_loss": 0.27608928084373474
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.46022894041678897,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2099609375,
+ "learning_rate": 0.000194,
+ "loss": 0.0846,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 159454.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.24473154544830322,
+ "skip_count": 2.0,
+ "step": 98,
+ "text_loss": 0.6026689410209656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.4696213677722336,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.271484375,
+ "learning_rate": 0.00019800000000000002,
+ "loss": 0.1028,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 163661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.11468276381492615,
+ "skip_count": 2.0,
+ "step": 100,
+ "text_loss": 0.46733155846595764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.47901379512767833,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
+ "learning_rate": 0.000202,
+ "loss": 0.1089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 167134.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021144939586520195,
+ "skip_count": 0.0,
+ "step": 102,
+ "text_loss": 0.6362994909286499
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.488406222483123,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 0.000206,
+ "loss": 0.0621,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 170433.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06594710797071457,
+ "skip_count": 1.0,
+ "step": 104,
+ "text_loss": 0.4515477120876312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.49779864983856764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1591796875,
+ "learning_rate": 0.00021,
+ "loss": 0.0929,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 173387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.032923027873039246,
+ "skip_count": 0.0,
+ "step": 106,
+ "text_loss": 0.6638453006744385
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.5071910771940124,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.240234375,
+ "learning_rate": 0.000214,
+ "loss": 0.0883,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 176170.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08034781366586685,
+ "skip_count": 0.0,
+ "step": 108,
+ "text_loss": 1.186936855316162
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.516583504549457,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.267578125,
+ "learning_rate": 0.000218,
+ "loss": 0.0794,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 179877.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07814185321331024,
+ "skip_count": 1.0,
+ "step": 110,
+ "text_loss": 0.5488709211349487
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.5259759319049017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2353515625,
+ "learning_rate": 0.000222,
+ "loss": 0.0946,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 182726.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01884695515036583,
+ "skip_count": 0.0,
+ "step": 112,
+ "text_loss": 0.5195863842964172
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.5353683592603463,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.19921875,
+ "learning_rate": 0.00022600000000000002,
+ "loss": 0.0974,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 185624.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09657823294401169,
+ "skip_count": 2.0,
+ "step": 114,
+ "text_loss": 0.43858134746551514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.544760786615791,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.3046875,
+ "learning_rate": 0.00023,
+ "loss": 0.0753,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 188155.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01463601179420948,
+ "skip_count": 0.0,
+ "step": 116,
+ "text_loss": 0.392981618642807
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.5541532139712357,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.439453125,
+ "learning_rate": 0.00023400000000000002,
+ "loss": 0.0843,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 190970.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03859659656882286,
+ "skip_count": 0.0,
+ "step": 118,
+ "text_loss": 0.309179425239563
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.5635456413266804,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2255859375,
+ "learning_rate": 0.00023799999999999998,
+ "loss": 0.053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 193988.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019092386588454247,
+ "skip_count": 0.0,
+ "step": 120,
+ "text_loss": 0.48543134331703186
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.572938068682125,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.35546875,
+ "learning_rate": 0.000242,
+ "loss": 0.1203,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 196475.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0619138665497303,
+ "skip_count": 1.0,
+ "step": 122,
+ "text_loss": 0.4615364074707031
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.5823304960375697,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1875,
+ "learning_rate": 0.000246,
+ "loss": 0.1002,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 200045.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09752107411623001,
+ "skip_count": 0.0,
+ "step": 124,
+ "text_loss": 0.15802054107189178
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.5917229233930144,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 0.00025,
+ "loss": 0.0773,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 203214.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02896115928888321,
+ "skip_count": 0.0,
+ "step": 126,
+ "text_loss": 0.4543360471725464
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.601115350748459,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.4296875,
+ "learning_rate": 0.000254,
+ "loss": 0.0973,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 206168.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011423567309975624,
+ "skip_count": 0.0,
+ "step": 128,
+ "text_loss": 0.4730179011821747
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6105077781039038,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.365234375,
+ "learning_rate": 0.00025800000000000004,
+ "loss": 0.099,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 209907.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01957600563764572,
+ "skip_count": 0.0,
+ "step": 130,
+ "text_loss": 0.45122358202934265
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6199002054593484,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2060546875,
+ "learning_rate": 0.000262,
+ "loss": 0.0868,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 213521.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04882373288273811,
+ "skip_count": 1.0,
+ "step": 132,
+ "text_loss": 0.4341491758823395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6292926328147931,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1708984375,
+ "learning_rate": 0.000266,
+ "loss": 0.0834,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 216484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016083380207419395,
+ "skip_count": 0.0,
+ "step": 134,
+ "text_loss": 0.46990111470222473
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6386850601702377,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.220703125,
+ "learning_rate": 0.00027,
+ "loss": 0.0863,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 219398.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01733536459505558,
+ "skip_count": 0.0,
+ "step": 136,
+ "text_loss": 0.4455361068248749
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6480774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
+ "learning_rate": 0.00027400000000000005,
+ "loss": 0.0997,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 222430.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01332803163677454,
+ "skip_count": 0.0,
+ "step": 138,
+ "text_loss": 0.47699397802352905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.657469914881127,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.333984375,
+ "learning_rate": 0.00027800000000000004,
+ "loss": 0.0922,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 225458.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.14924728870391846,
+ "skip_count": 2.0,
+ "step": 140,
+ "text_loss": 0.5858222842216492
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6668623422365718,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25,
+ "learning_rate": 0.00028199999999999997,
+ "loss": 0.0798,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 229365.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.1860177218914032,
+ "skip_count": 2.0,
+ "step": 142,
+ "text_loss": 0.5003137588500977
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6762547695920165,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.00028599999999999996,
+ "loss": 0.054,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 231787.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.16498211026191711,
+ "skip_count": 1.0,
+ "step": 144,
+ "text_loss": 0.5026470422744751
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6856471969474611,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.306640625,
+ "learning_rate": 0.00029,
+ "loss": 0.0936,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 235014.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11801310628652573,
+ "skip_count": 1.0,
+ "step": 146,
+ "text_loss": 0.611888587474823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.6950396243029058,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
+ "learning_rate": 0.000294,
+ "loss": 0.0878,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 238210.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02422776259481907,
+ "skip_count": 0.0,
+ "step": 148,
+ "text_loss": 0.2876914143562317
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7044320516583504,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 0.000298,
+ "loss": 0.0858,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 241582.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07282499223947525,
+ "skip_count": 2.0,
+ "step": 150,
+ "text_loss": 0.3919292390346527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7138244790137951,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.37890625,
+ "learning_rate": 0.000302,
+ "loss": 0.0797,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 244621.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.20659038424491882,
+ "skip_count": 1.0,
+ "step": 152,
+ "text_loss": 0.4294498860836029
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7232169063692399,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1787109375,
+ "learning_rate": 0.000306,
+ "loss": 0.072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 247833.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02428400330245495,
+ "skip_count": 0.0,
+ "step": 154,
+ "text_loss": 0.5930765867233276
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7326093337246845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1533203125,
+ "learning_rate": 0.00031,
+ "loss": 0.0772,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 251349.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0167869683355093,
+ "skip_count": 0.0,
+ "step": 156,
+ "text_loss": 0.41063904762268066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7420017610801292,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1572265625,
+ "learning_rate": 0.000314,
+ "loss": 0.0821,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 254886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02531604655086994,
+ "skip_count": 0.0,
+ "step": 158,
+ "text_loss": 0.6739020347595215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7513941884355738,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.201171875,
+ "learning_rate": 0.00031800000000000003,
+ "loss": 0.09,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 258260.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017772775143384933,
+ "skip_count": 0.0,
+ "step": 160,
+ "text_loss": 0.46873849630355835
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7607866157910185,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.224609375,
+ "learning_rate": 0.000322,
+ "loss": 0.0893,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 261846.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.034902360290288925,
+ "skip_count": 1.0,
+ "step": 162,
+ "text_loss": 0.3727971017360687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7701790431464631,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.212890625,
+ "learning_rate": 0.000326,
+ "loss": 0.076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 264348.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013553355820477009,
+ "skip_count": 0.0,
+ "step": 164,
+ "text_loss": 0.5798237323760986
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7795714705019078,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.408203125,
+ "learning_rate": 0.00033,
+ "loss": 0.0926,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 267479.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.13571743667125702,
+ "skip_count": 1.0,
+ "step": 166,
+ "text_loss": 0.8084776997566223
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7889638978573525,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 0.00033400000000000004,
+ "loss": 0.0817,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 270268.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.19884146749973297,
+ "skip_count": 0.0,
+ "step": 168,
+ "text_loss": 0.7366134524345398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.7983563252127972,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.267578125,
+ "learning_rate": 0.00033800000000000003,
+ "loss": 0.1022,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 273518.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.15469175577163696,
+ "skip_count": 1.0,
+ "step": 170,
+ "text_loss": 0.27204006910324097
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8077487525682419,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.17578125,
+ "learning_rate": 0.000342,
+ "loss": 0.0865,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 277210.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08603330701589584,
+ "skip_count": 2.0,
+ "step": 172,
+ "text_loss": 0.7137667536735535
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8171411799236865,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.189453125,
+ "learning_rate": 0.000346,
+ "loss": 0.0902,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 280389.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.17851492762565613,
+ "skip_count": 4.0,
+ "step": 174,
+ "text_loss": 0.5148105621337891
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8265336072791312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1494140625,
+ "learning_rate": 0.00035,
+ "loss": 0.0853,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 283501.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021331604570150375,
+ "skip_count": 0.0,
+ "step": 176,
+ "text_loss": 0.301013320684433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8359260346345758,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2158203125,
+ "learning_rate": 0.000354,
+ "loss": 0.0911,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 287154.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.057273946702480316,
+ "skip_count": 2.0,
+ "step": 178,
+ "text_loss": 0.4740981459617615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8453184619900206,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.240234375,
+ "learning_rate": 0.000358,
+ "loss": 0.0904,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 289929.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04116598889231682,
+ "skip_count": 1.0,
+ "step": 180,
+ "text_loss": 0.4838573932647705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8547108893454652,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.14453125,
+ "learning_rate": 0.000362,
+ "loss": 0.0991,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 294293.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.027111956849694252,
+ "skip_count": 0.0,
+ "step": 182,
+ "text_loss": 0.7495553493499756
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8641033167009099,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.158203125,
+ "learning_rate": 0.000366,
+ "loss": 0.1038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 297730.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019166452810168266,
+ "skip_count": 0.0,
+ "step": 184,
+ "text_loss": 0.534831166267395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 0.8734957440563546,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2236328125,
+ "learning_rate": 0.00037,
+ "loss": 0.0784,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 300593.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.2349659502506256,
+ "skip_count": 2.0,
+ "step": 186,
+ "text_loss": 0.3549048602581024
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8828881714117992,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2041015625,
+ "learning_rate": 0.000374,
+ "loss": 0.0827,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 303456.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.22502389550209045,
+ "skip_count": 2.0,
+ "step": 188,
+ "text_loss": 0.8837642073631287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.8922805987672439,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.271484375,
+ "learning_rate": 0.000378,
+ "loss": 0.1085,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 306241.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.12291611731052399,
+ "skip_count": 0.0,
+ "step": 190,
+ "text_loss": 0.73353511095047
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9016730261226886,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15625,
+ "learning_rate": 0.000382,
+ "loss": 0.0969,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 310606.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.055988848209381104,
+ "skip_count": 1.0,
+ "step": 192,
+ "text_loss": 0.6261917352676392
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9110654534781333,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.34375,
+ "learning_rate": 0.000386,
+ "loss": 0.1055,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 313564.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.12363404780626297,
+ "skip_count": 3.0,
+ "step": 194,
+ "text_loss": 0.2790874242782593
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9204578808335779,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.27734375,
+ "learning_rate": 0.00039000000000000005,
+ "loss": 0.0964,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 316958.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.2718356251716614,
+ "skip_count": 2.0,
+ "step": 196,
+ "text_loss": 0.14428086578845978
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9298503081890226,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2021484375,
+ "learning_rate": 0.00039400000000000004,
+ "loss": 0.0917,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 320103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07188102602958679,
+ "skip_count": 2.0,
+ "step": 198,
+ "text_loss": 0.27155816555023193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9392427355444672,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.201171875,
+ "learning_rate": 0.000398,
+ "loss": 0.0809,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 323566.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.18038256466388702,
+ "skip_count": 1.0,
+ "step": 200,
+ "text_loss": 0.8453494310379028
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9486351628999119,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.000402,
+ "loss": 0.0801,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 326385.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014639763161540031,
+ "skip_count": 0.0,
+ "step": 202,
+ "text_loss": 0.5733131766319275
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9580275902553567,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.21875,
+ "learning_rate": 0.00040600000000000006,
+ "loss": 0.104,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 329266.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015269627794623375,
+ "skip_count": 0.0,
+ "step": 204,
+ "text_loss": 0.7355639934539795
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9674200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.27734375,
+ "learning_rate": 0.00041,
+ "loss": 0.0833,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 332984.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018046971410512924,
+ "skip_count": 0.0,
+ "step": 206,
+ "text_loss": 0.587641179561615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.976812444966246,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.185546875,
+ "learning_rate": 0.000414,
+ "loss": 0.0588,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 335739.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.12791286408901215,
+ "skip_count": 0.0,
+ "step": 208,
+ "text_loss": 0.6538406610488892
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9862048723216906,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.24609375,
+ "learning_rate": 0.00041799999999999997,
+ "loss": 0.0732,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 338966.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.050490595400333405,
+ "skip_count": 1.0,
+ "step": 210,
+ "text_loss": 0.4188295602798462
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 0.9955972996771353,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.271484375,
+ "learning_rate": 0.000422,
+ "loss": 0.0588,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 342063.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.11652113497257233,
+ "skip_count": 3.0,
+ "step": 212,
+ "text_loss": 0.21822240948677063
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.0046962136777224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2060546875,
+ "learning_rate": 0.000426,
+ "loss": 0.0621,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 344887.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023898238316178322,
+ "skip_count": 0.0,
+ "step": 214,
+ "text_loss": 0.24692800641059875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.014088641033167,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.3671875,
+ "learning_rate": 0.00043,
+ "loss": 0.1005,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 348700.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06414655596017838,
+ "skip_count": 0.0,
+ "step": 216,
+ "text_loss": 0.4744548797607422
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.0234810683886117,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1962890625,
+ "learning_rate": 0.00043400000000000003,
+ "loss": 0.0753,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 351507.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11702914535999298,
+ "skip_count": 1.0,
+ "step": 218,
+ "text_loss": 0.5614864826202393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.0328734957440564,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.189453125,
+ "learning_rate": 0.000438,
+ "loss": 0.0792,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 354484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014991643838584423,
+ "skip_count": 0.0,
+ "step": 220,
+ "text_loss": 0.47209832072257996
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.042265923099501,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.251953125,
+ "learning_rate": 0.000442,
+ "loss": 0.106,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 357954.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04747112840414047,
+ "skip_count": 1.0,
+ "step": 222,
+ "text_loss": 0.2968728244304657
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.0516583504549457,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.40234375,
+ "learning_rate": 0.000446,
+ "loss": 0.0853,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 360547.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06754162162542343,
+ "skip_count": 2.0,
+ "step": 224,
+ "text_loss": 0.2364148646593094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.0610507778103904,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2412109375,
+ "learning_rate": 0.00045000000000000004,
+ "loss": 0.1016,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 364529.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07830183953046799,
+ "skip_count": 1.0,
+ "step": 226,
+ "text_loss": 0.4787476360797882
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.070443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1953125,
+ "learning_rate": 0.00045400000000000003,
+ "loss": 0.0792,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 367683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015735948458313942,
+ "skip_count": 0.0,
+ "step": 228,
+ "text_loss": 0.37148505449295044
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.0798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25,
+ "learning_rate": 0.000458,
+ "loss": 0.0995,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 371402.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013354359194636345,
+ "skip_count": 0.0,
+ "step": 230,
+ "text_loss": 0.7464763522148132
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.0892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1494140625,
+ "learning_rate": 0.000462,
+ "loss": 0.0731,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 374587.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013763721100986004,
+ "skip_count": 0.0,
+ "step": 232,
+ "text_loss": 0.8754443526268005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.098620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.3984375,
+ "learning_rate": 0.00046600000000000005,
+ "loss": 0.0861,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 377513.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010075435042381287,
+ "skip_count": 0.0,
+ "step": 234,
+ "text_loss": 0.31534913182258606
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1080129145876136,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.17578125,
+ "learning_rate": 0.00047,
+ "loss": 0.0791,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 380736.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.059825167059898376,
+ "skip_count": 1.0,
+ "step": 236,
+ "text_loss": 0.5936337113380432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1174053419430585,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.267578125,
+ "learning_rate": 0.000474,
+ "loss": 0.0514,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 383236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09134846180677414,
+ "skip_count": 2.0,
+ "step": 238,
+ "text_loss": 0.5976157784461975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1267977692985032,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.208984375,
+ "learning_rate": 0.00047799999999999996,
+ "loss": 0.0858,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 385778.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11989791691303253,
+ "skip_count": 1.0,
+ "step": 240,
+ "text_loss": 0.3554210960865021
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1361901966539478,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.171875,
+ "learning_rate": 0.000482,
+ "loss": 0.0734,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 388777.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013591105118393898,
+ "skip_count": 0.0,
+ "step": 242,
+ "text_loss": 0.4829460382461548
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1455826240093925,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12060546875,
+ "learning_rate": 0.000486,
+ "loss": 0.0625,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 391797.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0920003354549408,
+ "skip_count": 2.0,
+ "step": 244,
+ "text_loss": 0.3085818886756897
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1549750513648371,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.00049,
+ "loss": 0.0501,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 396485.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0129330949857831,
+ "skip_count": 0.0,
+ "step": 246,
+ "text_loss": 0.42803969979286194
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1643674787202818,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.296875,
+ "learning_rate": 0.000494,
+ "loss": 0.0945,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 399923.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.10677755624055862,
+ "skip_count": 3.0,
+ "step": 248,
+ "text_loss": 0.2908555567264557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1737599060757264,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.203125,
+ "learning_rate": 0.000498,
+ "loss": 0.0812,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 403647.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.1504337340593338,
+ "skip_count": 3.0,
+ "step": 250,
+ "text_loss": 0.333095908164978
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.183152333431171,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.22265625,
+ "learning_rate": 0.0005020000000000001,
+ "loss": 0.0828,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 409147.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06503184884786606,
+ "skip_count": 2.0,
+ "step": 252,
+ "text_loss": 0.16117942333221436
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.1925447607866158,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.287109375,
+ "learning_rate": 0.000506,
+ "loss": 0.0995,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 412072.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016280122101306915,
+ "skip_count": 0.0,
+ "step": 254,
+ "text_loss": 0.4217492640018463
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2019371881420604,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.21484375,
+ "learning_rate": 0.00051,
+ "loss": 0.0803,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 415052.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.2117508500814438,
+ "skip_count": 1.0,
+ "step": 256,
+ "text_loss": 0.5795308947563171
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.211329615497505,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2421875,
+ "learning_rate": 0.000514,
+ "loss": 0.0668,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 418099.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.15002092719078064,
+ "skip_count": 0.0,
+ "step": 258,
+ "text_loss": 0.4840938448905945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2207220428529497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1533203125,
+ "learning_rate": 0.000518,
+ "loss": 0.0538,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 422526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012834074907004833,
+ "skip_count": 0.0,
+ "step": 260,
+ "text_loss": 0.36141225695610046
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2301144702083944,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.000522,
+ "loss": 0.085,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 425765.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.23808011412620544,
+ "skip_count": 2.0,
+ "step": 262,
+ "text_loss": 0.27572691440582275
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2395068975638392,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.17578125,
+ "learning_rate": 0.000526,
+ "loss": 0.0708,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 429048.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.055687375366687775,
+ "skip_count": 1.0,
+ "step": 264,
+ "text_loss": 0.37020301818847656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.248899324919284,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2080078125,
+ "learning_rate": 0.0005300000000000001,
+ "loss": 0.0839,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 431784.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0872957780957222,
+ "skip_count": 1.0,
+ "step": 266,
+ "text_loss": 0.5937283039093018
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2582917522747286,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.0005340000000000001,
+ "loss": 0.0733,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 434297.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.23507654666900635,
+ "skip_count": 0.0,
+ "step": 268,
+ "text_loss": 0.3367372453212738
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2676841796301732,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 0.0005380000000000001,
+ "loss": 0.0708,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 437586.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.12860390543937683,
+ "skip_count": 2.0,
+ "step": 270,
+ "text_loss": 0.7149854302406311
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2770766069856179,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2451171875,
+ "learning_rate": 0.0005420000000000001,
+ "loss": 0.1072,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 440649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.044308312237262726,
+ "skip_count": 1.0,
+ "step": 272,
+ "text_loss": 0.26778292655944824
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.2864690343410625,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.44921875,
+ "learning_rate": 0.000546,
+ "loss": 0.0938,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 443907.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.11514109373092651,
+ "skip_count": 3.0,
+ "step": 274,
+ "text_loss": 0.23578761518001556
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 1.2958614616965072,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2578125,
+ "learning_rate": 0.00055,
+ "loss": 0.0932,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 447147.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.055705297738313675,
+ "skip_count": 2.0,
+ "step": 276,
+ "text_loss": 0.2513524889945984
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3052538890519518,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.29296875,
+ "learning_rate": 0.000554,
+ "loss": 0.0667,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 450032.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.13778971135616302,
+ "skip_count": 2.0,
+ "step": 278,
+ "text_loss": 0.4857243597507477
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3146463164073965,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.185546875,
+ "learning_rate": 0.000558,
+ "loss": 0.0672,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 453195.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0700262188911438,
+ "skip_count": 0.0,
+ "step": 280,
+ "text_loss": 0.7589789628982544
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3240387437628411,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25,
+ "learning_rate": 0.0005620000000000001,
+ "loss": 0.0603,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 455942.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11706235259771347,
+ "skip_count": 2.0,
+ "step": 282,
+ "text_loss": 0.4783432185649872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3334311711182858,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.000566,
+ "loss": 0.0793,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 458932.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07073967158794403,
+ "skip_count": 1.0,
+ "step": 284,
+ "text_loss": 0.7117193937301636
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3428235984737307,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1650390625,
+ "learning_rate": 0.00057,
+ "loss": 0.0915,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 462650.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05301115661859512,
+ "skip_count": 1.0,
+ "step": 286,
+ "text_loss": 0.4175460636615753
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.352216025829175,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2158203125,
+ "learning_rate": 0.000574,
+ "loss": 0.0675,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 466290.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06356479972600937,
+ "skip_count": 1.0,
+ "step": 288,
+ "text_loss": 0.5832946300506592
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 1.36160845318462,
+ "f1_execute": 0.9019607901573181,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.28515625,
+ "learning_rate": 0.000578,
+ "loss": 0.0805,
+ "macro_f1": 0.3006536066532135,
+ "num_tokens": 469296.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.21032999455928802,
+ "skip_count": 3.0,
+ "step": 290,
+ "text_loss": 0.36023473739624023
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3710008805400646,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.27734375,
+ "learning_rate": 0.0005819999999999999,
+ "loss": 0.0685,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 472272.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08062280714511871,
+ "skip_count": 1.0,
+ "step": 292,
+ "text_loss": 0.37197956442832947
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3803933078955093,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.28125,
+ "learning_rate": 0.0005859999999999999,
+ "loss": 0.0878,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 475864.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05023600533604622,
+ "skip_count": 2.0,
+ "step": 294,
+ "text_loss": 0.4765273630619049
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.389785735250954,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2177734375,
+ "learning_rate": 0.00059,
+ "loss": 0.0728,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 478916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011689410544931889,
+ "skip_count": 0.0,
+ "step": 296,
+ "text_loss": 0.5878773927688599
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.3991781626063986,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15625,
+ "learning_rate": 0.000594,
+ "loss": 0.0727,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 482369.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010772093199193478,
+ "skip_count": 0.0,
+ "step": 298,
+ "text_loss": 0.4424116313457489
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4085705899618433,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.181640625,
+ "learning_rate": 0.000598,
+ "loss": 0.0787,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 486049.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.23482851684093475,
+ "skip_count": 2.0,
+ "step": 300,
+ "text_loss": 0.21217775344848633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.417963017317288,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2080078125,
+ "learning_rate": 0.000602,
+ "loss": 0.073,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 488683.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.18843084573745728,
+ "skip_count": 3.0,
+ "step": 302,
+ "text_loss": 0.2109498232603073
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4273554446727326,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.279296875,
+ "learning_rate": 0.000606,
+ "loss": 0.0945,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 492010.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.17861786484718323,
+ "skip_count": 3.0,
+ "step": 304,
+ "text_loss": 0.8446305394172668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4367478720281772,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 0.00061,
+ "loss": 0.0827,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 494764.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014124520123004913,
+ "skip_count": 0.0,
+ "step": 306,
+ "text_loss": 0.742735743522644
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4461402993836219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.26953125,
+ "learning_rate": 0.000614,
+ "loss": 0.1071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 497820.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017968112602829933,
+ "skip_count": 0.0,
+ "step": 308,
+ "text_loss": 0.28305482864379883
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4555327267390665,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1689453125,
+ "learning_rate": 0.0006180000000000001,
+ "loss": 0.0775,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 500694.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08593655377626419,
+ "skip_count": 2.0,
+ "step": 310,
+ "text_loss": 0.3496848940849304
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4649251540945114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.19140625,
+ "learning_rate": 0.000622,
+ "loss": 0.061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 503871.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016449492424726486,
+ "skip_count": 0.0,
+ "step": 312,
+ "text_loss": 0.6691372990608215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4743175814499558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.205078125,
+ "learning_rate": 0.000626,
+ "loss": 0.0815,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 506730.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014532964676618576,
+ "skip_count": 0.0,
+ "step": 314,
+ "text_loss": 0.6118118166923523
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4837100088054007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2216796875,
+ "learning_rate": 0.00063,
+ "loss": 0.0742,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 510323.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013093139044940472,
+ "skip_count": 0.0,
+ "step": 316,
+ "text_loss": 0.38126271963119507
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.4931024361608454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.400390625,
+ "learning_rate": 0.000634,
+ "loss": 0.0915,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 514075.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008627045899629593,
+ "skip_count": 0.0,
+ "step": 318,
+ "text_loss": 0.5983037948608398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.50249486351629,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15234375,
+ "learning_rate": 0.000638,
+ "loss": 0.1008,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 517418.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04561378434300423,
+ "skip_count": 1.0,
+ "step": 320,
+ "text_loss": 0.767257034778595
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 1.5118872908717347,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.259765625,
+ "learning_rate": 0.000642,
+ "loss": 0.0926,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 520443.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.024372953921556473,
+ "skip_count": 0.0,
+ "step": 322,
+ "text_loss": 0.6572105884552002
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.5212797182271793,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.30078125,
+ "learning_rate": 0.000646,
+ "loss": 0.0822,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 523317.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08099937438964844,
+ "skip_count": 0.0,
+ "step": 324,
+ "text_loss": 0.205499529838562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 1.530672145582624,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.0006500000000000001,
+ "loss": 0.0809,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 526355.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0657225176692009,
+ "skip_count": 1.0,
+ "step": 326,
+ "text_loss": 0.2587239742279053
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.5400645729380686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.111328125,
+ "learning_rate": 0.0006540000000000001,
+ "loss": 0.0779,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 529689.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01849208027124405,
+ "skip_count": 0.0,
+ "step": 328,
+ "text_loss": 0.2172023057937622
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.5494570002935135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1845703125,
+ "learning_rate": 0.0006580000000000001,
+ "loss": 0.0758,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 532603.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016184113919734955,
+ "skip_count": 0.0,
+ "step": 330,
+ "text_loss": 0.5980568528175354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.558849427648958,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.220703125,
+ "learning_rate": 0.000662,
+ "loss": 0.0439,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 536056.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01303898449987173,
+ "skip_count": 0.0,
+ "step": 332,
+ "text_loss": 0.5421966314315796
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 1.5682418550044028,
+ "f1_execute": 0.8979591727256775,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.296875,
+ "learning_rate": 0.000666,
+ "loss": 0.0963,
+ "macro_f1": 0.465986430644989,
+ "num_tokens": 539231.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.3075675964355469,
+ "skip_count": 3.0,
+ "step": 334,
+ "text_loss": 0.19719554483890533
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.5776342823598473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.173828125,
+ "learning_rate": 0.00067,
+ "loss": 0.0706,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 542038.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009116224013268948,
+ "skip_count": 0.0,
+ "step": 336,
+ "text_loss": 0.3407036066055298
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.5870267097152921,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2421875,
+ "learning_rate": 0.000674,
+ "loss": 0.0768,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 545019.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021463042125105858,
+ "skip_count": 0.0,
+ "step": 338,
+ "text_loss": 0.24486012756824493
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.5964191370707366,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1708984375,
+ "learning_rate": 0.0006780000000000001,
+ "loss": 0.0889,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 548036.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01857556402683258,
+ "skip_count": 0.0,
+ "step": 340,
+ "text_loss": 0.28140124678611755
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.6058115644261814,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.130859375,
+ "learning_rate": 0.0006820000000000001,
+ "loss": 0.0617,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 551419.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.27090007066726685,
+ "skip_count": 3.0,
+ "step": 342,
+ "text_loss": 0.20690307021141052
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.615203991781626,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.3046875,
+ "learning_rate": 0.0006860000000000001,
+ "loss": 0.1047,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 554037.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09231195598840714,
+ "skip_count": 2.0,
+ "step": 344,
+ "text_loss": 0.4479128420352936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.6245964191370708,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.255859375,
+ "learning_rate": 0.00069,
+ "loss": 0.0883,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 556672.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00935924518853426,
+ "skip_count": 0.0,
+ "step": 346,
+ "text_loss": 0.6377320289611816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.6339888464925154,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.000694,
+ "loss": 0.0781,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 559756.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.17641772329807281,
+ "skip_count": 2.0,
+ "step": 348,
+ "text_loss": 0.6097636222839355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 1.64338127384796,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.30078125,
+ "learning_rate": 0.0006979999999999999,
+ "loss": 0.0616,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 563415.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06240406632423401,
+ "skip_count": 2.0,
+ "step": 350,
+ "text_loss": 0.5291631817817688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.6527737012034047,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.296875,
+ "learning_rate": 0.0007019999999999999,
+ "loss": 0.1026,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 566357.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012269247323274612,
+ "skip_count": 0.0,
+ "step": 352,
+ "text_loss": 0.5170195698738098
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.6621661285588494,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1435546875,
+ "learning_rate": 0.0007059999999999999,
+ "loss": 0.0815,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 569449.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07515309751033783,
+ "skip_count": 2.0,
+ "step": 354,
+ "text_loss": 0.34507250785827637
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.6715585559142943,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.00071,
+ "loss": 0.0791,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 572761.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.20768006145954132,
+ "skip_count": 2.0,
+ "step": 356,
+ "text_loss": 0.3158532381057739
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.6809509832697387,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1884765625,
+ "learning_rate": 0.000714,
+ "loss": 0.0682,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 575909.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025329967960715294,
+ "skip_count": 0.0,
+ "step": 358,
+ "text_loss": 0.21455390751361847
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 1.6903434106251836,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.21484375,
+ "learning_rate": 0.000718,
+ "loss": 0.0775,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 579186.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.07676175981760025,
+ "skip_count": 0.0,
+ "step": 360,
+ "text_loss": 0.61895352602005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 1.699735837980628,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.197265625,
+ "learning_rate": 0.000722,
+ "loss": 0.0781,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 582437.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08070661872625351,
+ "skip_count": 1.0,
+ "step": 362,
+ "text_loss": 0.20557661354541779
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.7091282653360729,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2216796875,
+ "learning_rate": 0.000726,
+ "loss": 0.11,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 586096.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015891313552856445,
+ "skip_count": 0.0,
+ "step": 364,
+ "text_loss": 0.597991943359375
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.7185206926915173,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15625,
+ "learning_rate": 0.00073,
+ "loss": 0.0573,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 589520.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.12844261527061462,
+ "skip_count": 3.0,
+ "step": 366,
+ "text_loss": 0.2944789230823517
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.7279131200469622,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.150390625,
+ "learning_rate": 0.000734,
+ "loss": 0.1005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 592691.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02382199838757515,
+ "skip_count": 0.0,
+ "step": 368,
+ "text_loss": 0.23989969491958618
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.7373055474024068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1796875,
+ "learning_rate": 0.000738,
+ "loss": 0.0661,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 596004.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018812084570527077,
+ "skip_count": 0.0,
+ "step": 370,
+ "text_loss": 0.22111408412456512
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.7466979747578515,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2412109375,
+ "learning_rate": 0.000742,
+ "loss": 0.0666,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 599087.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08290331065654755,
+ "skip_count": 1.0,
+ "step": 372,
+ "text_loss": 0.2567356526851654
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.7560904021132961,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2412109375,
+ "learning_rate": 0.000746,
+ "loss": 0.0941,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 602330.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11482042074203491,
+ "skip_count": 1.0,
+ "step": 374,
+ "text_loss": 0.7217292785644531
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.7654828294687408,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2265625,
+ "learning_rate": 0.00075,
+ "loss": 0.0728,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 605503.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11849870532751083,
+ "skip_count": 0.0,
+ "step": 376,
+ "text_loss": 0.5122153759002686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 1.7748752568241855,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2333984375,
+ "learning_rate": 0.000754,
+ "loss": 0.0835,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 608505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07090992480516434,
+ "skip_count": 1.0,
+ "step": 378,
+ "text_loss": 0.2204965502023697
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.78426768417963,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1826171875,
+ "learning_rate": 0.000758,
+ "loss": 0.0794,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 611193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03812089189887047,
+ "skip_count": 1.0,
+ "step": 380,
+ "text_loss": 0.44909021258354187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.793660111535075,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1689453125,
+ "learning_rate": 0.000762,
+ "loss": 0.0882,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 614231.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.10270529240369797,
+ "skip_count": 0.0,
+ "step": 382,
+ "text_loss": 0.13624964654445648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8030525388905194,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.330078125,
+ "learning_rate": 0.0007660000000000001,
+ "loss": 0.1107,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 617090.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11624004691839218,
+ "skip_count": 1.0,
+ "step": 384,
+ "text_loss": 0.7314052581787109
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8124449662459643,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1396484375,
+ "learning_rate": 0.0007700000000000001,
+ "loss": 0.0628,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 620596.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07114322483539581,
+ "skip_count": 2.0,
+ "step": 386,
+ "text_loss": 0.503322958946228
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8218373936014087,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.306640625,
+ "learning_rate": 0.0007740000000000001,
+ "loss": 0.0829,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 624108.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06061873584985733,
+ "skip_count": 2.0,
+ "step": 388,
+ "text_loss": 0.11481904983520508
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8312298209568536,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2099609375,
+ "learning_rate": 0.000778,
+ "loss": 0.0791,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 626895.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.2921771705150604,
+ "skip_count": 4.0,
+ "step": 390,
+ "text_loss": 0.3069624602794647
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8406222483122983,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.30859375,
+ "learning_rate": 0.000782,
+ "loss": 0.0605,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 630204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.202707901597023,
+ "skip_count": 4.0,
+ "step": 392,
+ "text_loss": 0.6022785305976868
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.850014675667743,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.29296875,
+ "learning_rate": 0.000786,
+ "loss": 0.0877,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 634373.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0221510399132967,
+ "skip_count": 0.0,
+ "step": 394,
+ "text_loss": 0.26787394285202026
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8594071030231876,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.37890625,
+ "learning_rate": 0.00079,
+ "loss": 0.0805,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 637442.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.12636390328407288,
+ "skip_count": 0.0,
+ "step": 396,
+ "text_loss": 0.2799781560897827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8687995303786322,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2080078125,
+ "learning_rate": 0.0007940000000000001,
+ "loss": 0.0724,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 641231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07933453470468521,
+ "skip_count": 2.0,
+ "step": 398,
+ "text_loss": 0.2507784366607666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8781919577340769,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.0007980000000000001,
+ "loss": 0.0909,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 644560.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.10324911028146744,
+ "skip_count": 0.0,
+ "step": 400,
+ "text_loss": 0.7756280303001404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8875843850895215,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2275390625,
+ "learning_rate": 0.0008020000000000001,
+ "loss": 0.0783,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 647393.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.18546262383460999,
+ "skip_count": 2.0,
+ "step": 402,
+ "text_loss": 0.5013328194618225
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.8969768124449664,
+ "f1_execute": 0.8571428656578064,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.0008060000000000001,
+ "loss": 0.0787,
+ "macro_f1": 0.2857142984867096,
+ "num_tokens": 650355.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.3280293643474579,
+ "skip_count": 4.0,
+ "step": 404,
+ "text_loss": 0.2842077314853668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.9063692398004108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.0008100000000000001,
+ "loss": 0.0901,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 654280.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02623247355222702,
+ "skip_count": 0.0,
+ "step": 406,
+ "text_loss": 0.46742817759513855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.9157616671558557,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.216796875,
+ "learning_rate": 0.0008139999999999999,
+ "loss": 0.0945,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 657568.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009744114242494106,
+ "skip_count": 0.0,
+ "step": 408,
+ "text_loss": 0.7168047428131104
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.9251540945113002,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2158203125,
+ "learning_rate": 0.0008179999999999999,
+ "loss": 0.1065,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 660593.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07591600716114044,
+ "skip_count": 2.0,
+ "step": 410,
+ "text_loss": 0.449823260307312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.934546521866745,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1396484375,
+ "learning_rate": 0.0008219999999999999,
+ "loss": 0.0795,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 663916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02076602540910244,
+ "skip_count": 0.0,
+ "step": 412,
+ "text_loss": 0.4764713943004608
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.9439389492221895,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1650390625,
+ "learning_rate": 0.000826,
+ "loss": 0.0836,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 667502.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.049170155078172684,
+ "skip_count": 1.0,
+ "step": 414,
+ "text_loss": 0.30333325266838074
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.9533313765776343,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1513671875,
+ "learning_rate": 0.00083,
+ "loss": 0.1021,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 670510.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.15554003417491913,
+ "skip_count": 0.0,
+ "step": 416,
+ "text_loss": 0.3691870868206024
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.962723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.000834,
+ "loss": 0.1013,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 674761.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.024516675621271133,
+ "skip_count": 0.0,
+ "step": 418,
+ "text_loss": 0.32850381731987
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.9721162312885236,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10888671875,
+ "learning_rate": 0.000838,
+ "loss": 0.0649,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 678055.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011026890948414803,
+ "skip_count": 0.0,
+ "step": 420,
+ "text_loss": 0.6637290716171265
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.9815086586439683,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.000842,
+ "loss": 0.0771,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 680979.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07451887428760529,
+ "skip_count": 1.0,
+ "step": 422,
+ "text_loss": 0.27131685614585876
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 1.990901085999413,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1318359375,
+ "learning_rate": 0.000846,
+ "loss": 0.0714,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 684144.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11341800540685654,
+ "skip_count": 1.0,
+ "step": 424,
+ "text_loss": 0.652126669883728
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.0,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2158203125,
+ "learning_rate": 0.00085,
+ "loss": 0.0754,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 687004.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08985847979784012,
+ "skip_count": 0.0,
+ "step": 426,
+ "text_loss": 0.2589428424835205
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.009392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.23828125,
+ "learning_rate": 0.000854,
+ "loss": 0.0866,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 689702.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011355436407029629,
+ "skip_count": 0.0,
+ "step": 428,
+ "text_loss": 0.8909716010093689
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.0187848547108893,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1435546875,
+ "learning_rate": 0.000858,
+ "loss": 0.0623,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 692698.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013788948766887188,
+ "skip_count": 0.0,
+ "step": 430,
+ "text_loss": 0.19141142070293427
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.028177282066334,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.000862,
+ "loss": 0.0499,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 696007.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07998392730951309,
+ "skip_count": 2.0,
+ "step": 432,
+ "text_loss": 0.1611809879541397
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.0375697094217786,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.173828125,
+ "learning_rate": 0.000866,
+ "loss": 0.0541,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 700271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06988382339477539,
+ "skip_count": 2.0,
+ "step": 434,
+ "text_loss": 0.37254223227500916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.0469621367772235,
+ "f1_execute": 0.8333333730697632,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 0.00087,
+ "loss": 0.0834,
+ "macro_f1": 0.2777777910232544,
+ "num_tokens": 703519.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.28240787982940674,
+ "skip_count": 5.0,
+ "step": 436,
+ "text_loss": 0.29636648297309875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.056354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.423828125,
+ "learning_rate": 0.000874,
+ "loss": 0.0657,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 706826.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013924967497587204,
+ "skip_count": 0.0,
+ "step": 438,
+ "text_loss": 0.20867908000946045
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.065746991488113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2353515625,
+ "learning_rate": 0.000878,
+ "loss": 0.0657,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 710530.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01170142088085413,
+ "skip_count": 0.0,
+ "step": 440,
+ "text_loss": 0.7273373007774353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.0751394188435572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.171875,
+ "learning_rate": 0.000882,
+ "loss": 0.076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 713503.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011930872686207294,
+ "skip_count": 0.0,
+ "step": 442,
+ "text_loss": 0.39314430952072144
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.084531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.0008860000000000001,
+ "loss": 0.0592,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 716582.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008630385622382164,
+ "skip_count": 0.0,
+ "step": 444,
+ "text_loss": 0.5925271511077881
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.0939242735544465,
+ "f1_execute": 0.9019607901573181,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.23046875,
+ "learning_rate": 0.0008900000000000001,
+ "loss": 0.0811,
+ "macro_f1": 0.3006536066532135,
+ "num_tokens": 719941.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.3015584945678711,
+ "skip_count": 1.0,
+ "step": 446,
+ "text_loss": 0.5059905052185059
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.1033167009098914,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.203125,
+ "learning_rate": 0.000894,
+ "loss": 0.0822,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 723113.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.10897493362426758,
+ "skip_count": 1.0,
+ "step": 448,
+ "text_loss": 0.19616436958312988
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.112709128265336,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.33984375,
+ "learning_rate": 0.000898,
+ "loss": 0.0782,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 726193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07236456125974655,
+ "skip_count": 2.0,
+ "step": 450,
+ "text_loss": 0.1773054152727127
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.1221015556207807,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.3203125,
+ "learning_rate": 0.000902,
+ "loss": 0.058,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 729275.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08184371143579483,
+ "skip_count": 0.0,
+ "step": 452,
+ "text_loss": 0.4927310049533844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.1314939829762256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1953125,
+ "learning_rate": 0.000906,
+ "loss": 0.0607,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 731948.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014033539220690727,
+ "skip_count": 0.0,
+ "step": 454,
+ "text_loss": 0.4745742678642273
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.00091,
+ "loss": 0.0651,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 735351.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0071774693205952644,
+ "skip_count": 0.0,
+ "step": 456,
+ "text_loss": 0.18523462116718292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 2.150278837687115,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.400390625,
+ "learning_rate": 0.0009140000000000001,
+ "loss": 0.0738,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 738587.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07781517505645752,
+ "skip_count": 2.0,
+ "step": 458,
+ "text_loss": 0.3459635376930237
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 26.0,
+ "epoch": 2.1596712650425594,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.28125,
+ "learning_rate": 0.0009180000000000001,
+ "loss": 0.0723,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 741779.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09529037028551102,
+ "skip_count": 2.0,
+ "step": 460,
+ "text_loss": 0.20197433233261108
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.1690636923980042,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1865234375,
+ "learning_rate": 0.0009220000000000001,
+ "loss": 0.0519,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 745355.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009765669703483582,
+ "skip_count": 0.0,
+ "step": 462,
+ "text_loss": 0.7031404376029968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.1784561197534487,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1298828125,
+ "learning_rate": 0.0009260000000000001,
+ "loss": 0.0527,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 748628.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03344850242137909,
+ "skip_count": 1.0,
+ "step": 464,
+ "text_loss": 0.21274663507938385
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.1878485471088935,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.173828125,
+ "learning_rate": 0.00093,
+ "loss": 0.0534,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 751472.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.1354292333126068,
+ "skip_count": 2.0,
+ "step": 466,
+ "text_loss": 0.5350717306137085
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.197240974464338,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.142578125,
+ "learning_rate": 0.000934,
+ "loss": 0.0598,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 754479.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.056420840322971344,
+ "skip_count": 1.0,
+ "step": 468,
+ "text_loss": 0.28153330087661743
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.206633401819783,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.234375,
+ "learning_rate": 0.0009379999999999999,
+ "loss": 0.0597,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 757872.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.1622387170791626,
+ "skip_count": 1.0,
+ "step": 470,
+ "text_loss": 0.22956843674182892
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.2160258291752273,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.5,
+ "learning_rate": 0.000942,
+ "loss": 0.0953,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 760468.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05146972835063934,
+ "skip_count": 2.0,
+ "step": 472,
+ "text_loss": 0.4513966739177704
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.225418256530672,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.212890625,
+ "learning_rate": 0.000946,
+ "loss": 0.0592,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 763519.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09022669494152069,
+ "skip_count": 0.0,
+ "step": 474,
+ "text_loss": 0.25758957862854004
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.234810683886117,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1259765625,
+ "learning_rate": 0.00095,
+ "loss": 0.0498,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 767391.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03044828027486801,
+ "skip_count": 1.0,
+ "step": 476,
+ "text_loss": 0.21366681158542633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.2442031112415615,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.291015625,
+ "learning_rate": 0.000954,
+ "loss": 0.0802,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 770338.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.10397060960531235,
+ "skip_count": 1.0,
+ "step": 478,
+ "text_loss": 1.0396177768707275
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.2535955385970063,
+ "f1_execute": 0.8571429252624512,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.267578125,
+ "learning_rate": 0.000958,
+ "loss": 0.1099,
+ "macro_f1": 0.285714328289032,
+ "num_tokens": 773699.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.22604143619537354,
+ "skip_count": 4.0,
+ "step": 480,
+ "text_loss": 0.2570283114910126
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.2629879659524508,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.146484375,
+ "learning_rate": 0.000962,
+ "loss": 0.0667,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 777473.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.048258859664201736,
+ "skip_count": 1.0,
+ "step": 482,
+ "text_loss": 0.2540103495121002
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.2723803933078957,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.197265625,
+ "learning_rate": 0.000966,
+ "loss": 0.0592,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 780833.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023018671199679375,
+ "skip_count": 0.0,
+ "step": 484,
+ "text_loss": 0.38524550199508667
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.28177282066334,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.314453125,
+ "learning_rate": 0.0009699999999999999,
+ "loss": 0.0709,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 783656.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.044845327734947205,
+ "skip_count": 1.0,
+ "step": 486,
+ "text_loss": 0.5859048366546631
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.000974,
+ "loss": 0.0615,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 787173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010898692533373833,
+ "skip_count": 0.0,
+ "step": 488,
+ "text_loss": 0.3456067442893982
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.3005576753742294,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.000978,
+ "loss": 0.0796,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 790395.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06497956812381744,
+ "skip_count": 2.0,
+ "step": 490,
+ "text_loss": 0.3751123249530792
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.3099501027296743,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2158203125,
+ "learning_rate": 0.000982,
+ "loss": 0.0772,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 793137.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07763728499412537,
+ "skip_count": 1.0,
+ "step": 492,
+ "text_loss": 0.43296709656715393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.3193425300851187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1416015625,
+ "learning_rate": 0.0009860000000000001,
+ "loss": 0.0819,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 796497.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02127906307578087,
+ "skip_count": 0.0,
+ "step": 494,
+ "text_loss": 0.4841311275959015
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.3287349574405636,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.00099,
+ "loss": 0.073,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 799361.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09518691152334213,
+ "skip_count": 0.0,
+ "step": 496,
+ "text_loss": 0.5094487071037292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 2.3381273847960085,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.130859375,
+ "learning_rate": 0.000994,
+ "loss": 0.0789,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 802629.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0563947930932045,
+ "skip_count": 2.0,
+ "step": 498,
+ "text_loss": 0.42783617973327637
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.347519812151453,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1865234375,
+ "learning_rate": 0.000998,
+ "loss": 0.0476,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 805881.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.10570426285266876,
+ "skip_count": 0.0,
+ "step": 500,
+ "text_loss": 0.28395503759384155
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 2.3569122395068973,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2275390625,
+ "learning_rate": 0.0009999999760498814,
+ "loss": 0.0849,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 809283.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.031202208250761032,
+ "skip_count": 2.0,
+ "step": 502,
+ "text_loss": 0.32970911264419556
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.366304666862342,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1455078125,
+ "learning_rate": 0.0009999997844489475,
+ "loss": 0.0574,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 812440.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07647835463285446,
+ "skip_count": 1.0,
+ "step": 504,
+ "text_loss": 0.4901447296142578
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.375697094217787,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25,
+ "learning_rate": 0.000999999401247153,
+ "loss": 0.0668,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 815716.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08515176922082901,
+ "skip_count": 2.0,
+ "step": 506,
+ "text_loss": 0.6157599687576294
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.3850895215732315,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25390625,
+ "learning_rate": 0.0009999988264446445,
+ "loss": 0.0686,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 819086.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00946938619017601,
+ "skip_count": 0.0,
+ "step": 508,
+ "text_loss": 0.5053519010543823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.3944819489286764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1640625,
+ "learning_rate": 0.0009999980600416424,
+ "loss": 0.0574,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 822268.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01058756373822689,
+ "skip_count": 0.0,
+ "step": 510,
+ "text_loss": 0.5570021867752075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.403874376284121,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1240234375,
+ "learning_rate": 0.000999997102038441,
+ "loss": 0.0678,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 825728.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008705209009349346,
+ "skip_count": 0.0,
+ "step": 512,
+ "text_loss": 0.6519040465354919
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.4132668036395657,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.220703125,
+ "learning_rate": 0.0009999959524354064,
+ "loss": 0.083,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 829459.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04024193435907364,
+ "skip_count": 1.0,
+ "step": 514,
+ "text_loss": 0.5290043950080872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25390625,
+ "learning_rate": 0.00099999461123298,
+ "loss": 0.0727,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 832291.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015742862597107887,
+ "skip_count": 0.0,
+ "step": 516,
+ "text_loss": 0.7910057902336121
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.432051658350455,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2275390625,
+ "learning_rate": 0.000999993078431675,
+ "loss": 0.0759,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 835399.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.16753782331943512,
+ "skip_count": 3.0,
+ "step": 518,
+ "text_loss": 0.45196083188056946
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.4414440857058994,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.236328125,
+ "learning_rate": 0.0009999913540320792,
+ "loss": 0.0968,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 838993.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09357143193483353,
+ "skip_count": 2.0,
+ "step": 520,
+ "text_loss": 0.5499435663223267
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 2.4508365130613443,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2451171875,
+ "learning_rate": 0.0009999894380348536,
+ "loss": 0.0821,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 842652.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.056803856045007706,
+ "skip_count": 2.0,
+ "step": 522,
+ "text_loss": 0.197520449757576
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 2.4602289404167887,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.2333984375,
+ "learning_rate": 0.000999987330440732,
+ "loss": 0.0725,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 847061.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08962195366621017,
+ "skip_count": 3.0,
+ "step": 524,
+ "text_loss": 0.27509039640426636
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.4696213677722336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.189453125,
+ "learning_rate": 0.000999985031250522,
+ "loss": 0.0561,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 850780.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022930558770895004,
+ "skip_count": 0.0,
+ "step": 526,
+ "text_loss": 0.13291706144809723
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.4790137951276785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.197265625,
+ "learning_rate": 0.0009999825404651053,
+ "loss": 0.0614,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 853886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017097990959882736,
+ "skip_count": 0.0,
+ "step": 528,
+ "text_loss": 0.21706295013427734
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.488406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.212890625,
+ "learning_rate": 0.0009999798580854356,
+ "loss": 0.0724,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 857364.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02831801027059555,
+ "skip_count": 0.0,
+ "step": 530,
+ "text_loss": 0.9035662412643433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.497798649838568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1591796875,
+ "learning_rate": 0.000999976984112541,
+ "loss": 0.0674,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 860661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019671892747282982,
+ "skip_count": 0.0,
+ "step": 532,
+ "text_loss": 0.8354863524436951
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 2.5071910771940122,
+ "f1_execute": 0.9200000166893005,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.2890625,
+ "learning_rate": 0.0009999739185475231,
+ "loss": 0.0963,
+ "macro_f1": 0.47333335876464844,
+ "num_tokens": 864124.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.21383361518383026,
+ "skip_count": 3.0,
+ "step": 534,
+ "text_loss": 0.23422949016094208
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.516583504549457,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.0009999706613915565,
+ "loss": 0.0598,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 866976.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07158871740102768,
+ "skip_count": 1.0,
+ "step": 536,
+ "text_loss": 0.11800774186849594
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.5259759319049016,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.26953125,
+ "learning_rate": 0.0009999672126458894,
+ "loss": 0.0822,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 870549.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08185924589633942,
+ "skip_count": 1.0,
+ "step": 538,
+ "text_loss": 0.19232480227947235
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.5353683592603464,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1396484375,
+ "learning_rate": 0.000999963572311843,
+ "loss": 0.0604,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 873733.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01633382774889469,
+ "skip_count": 0.0,
+ "step": 540,
+ "text_loss": 0.3725031912326813
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.544760786615791,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15234375,
+ "learning_rate": 0.0009999597403908128,
+ "loss": 0.0761,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 877099.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0782657191157341,
+ "skip_count": 1.0,
+ "step": 542,
+ "text_loss": 0.17589199542999268
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 2.5541532139712357,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2177734375,
+ "learning_rate": 0.0009999557168842669,
+ "loss": 0.0716,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 879883.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05275818333029747,
+ "skip_count": 2.0,
+ "step": 544,
+ "text_loss": 0.26448264718055725
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.56354564132668,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.0009999515017937468,
+ "loss": 0.071,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 882223.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09335892647504807,
+ "skip_count": 2.0,
+ "step": 546,
+ "text_loss": 0.208544060587883
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.572938068682125,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.376953125,
+ "learning_rate": 0.0009999470951208684,
+ "loss": 0.0855,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 885241.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.22983254492282867,
+ "skip_count": 0.0,
+ "step": 548,
+ "text_loss": 0.6612338423728943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.58233049603757,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.216796875,
+ "learning_rate": 0.00099994249686732,
+ "loss": 0.0786,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 887897.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.12858282029628754,
+ "skip_count": 0.0,
+ "step": 550,
+ "text_loss": 0.4673548936843872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.5917229233930144,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1591796875,
+ "learning_rate": 0.0009999377070348638,
+ "loss": 0.0944,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 891224.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017421770840883255,
+ "skip_count": 0.0,
+ "step": 552,
+ "text_loss": 0.6419258117675781
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.601115350748459,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15625,
+ "learning_rate": 0.000999932725625335,
+ "loss": 0.0791,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 894578.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07890026271343231,
+ "skip_count": 2.0,
+ "step": 554,
+ "text_loss": 0.5970752239227295
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.6105077781039037,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.216796875,
+ "learning_rate": 0.0009999275526406427,
+ "loss": 0.0796,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 897145.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09836960583925247,
+ "skip_count": 1.0,
+ "step": 556,
+ "text_loss": 0.752425491809845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.6199002054593485,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1875,
+ "learning_rate": 0.0009999221880827693,
+ "loss": 0.0882,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 900565.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017694659531116486,
+ "skip_count": 0.0,
+ "step": 558,
+ "text_loss": 0.195619136095047
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.629292632814793,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2021484375,
+ "learning_rate": 0.0009999166319537703,
+ "loss": 0.0561,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 903506.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019375264644622803,
+ "skip_count": 0.0,
+ "step": 560,
+ "text_loss": 0.4603337347507477
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 2.638685060170238,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.146484375,
+ "learning_rate": 0.0009999108842557748,
+ "loss": 0.0953,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 906380.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.12013207376003265,
+ "skip_count": 3.0,
+ "step": 562,
+ "text_loss": 0.6279402375221252
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.6480774875256823,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.255859375,
+ "learning_rate": 0.0009999049449909854,
+ "loss": 0.0799,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 909116.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06441342830657959,
+ "skip_count": 1.0,
+ "step": 564,
+ "text_loss": 0.23741699755191803
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.657469914881127,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15234375,
+ "learning_rate": 0.0009998988141616781,
+ "loss": 0.064,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 912189.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08309414982795715,
+ "skip_count": 1.0,
+ "step": 566,
+ "text_loss": 0.27780941128730774
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.6668623422365716,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1962890625,
+ "learning_rate": 0.0009998924917702023,
+ "loss": 0.0876,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 916279.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.07197169959545135,
+ "skip_count": 0.0,
+ "step": 568,
+ "text_loss": 0.6371755599975586
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.6762547695920165,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2255859375,
+ "learning_rate": 0.0009998859778189806,
+ "loss": 0.0706,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 919490.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008022273890674114,
+ "skip_count": 0.0,
+ "step": 570,
+ "text_loss": 0.6028938889503479
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.6856471969474613,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1650390625,
+ "learning_rate": 0.000999879272310509,
+ "loss": 0.084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 923694.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01634674146771431,
+ "skip_count": 0.0,
+ "step": 572,
+ "text_loss": 0.7177054286003113
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.695039624302906,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.17578125,
+ "learning_rate": 0.0009998723752473574,
+ "loss": 0.0716,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 926933.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.060559045523405075,
+ "skip_count": 1.0,
+ "step": 574,
+ "text_loss": 0.5203254818916321
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.185546875,
+ "learning_rate": 0.0009998652866321687,
+ "loss": 0.0801,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 929832.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011485611088573933,
+ "skip_count": 0.0,
+ "step": 576,
+ "text_loss": 0.6147452592849731
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.713824479013795,
+ "f1_execute": 0.8799999952316284,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.000999858006467659,
+ "loss": 0.0649,
+ "macro_f1": 0.29333335161209106,
+ "num_tokens": 933266.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.2929030954837799,
+ "skip_count": 4.0,
+ "step": 578,
+ "text_loss": 0.1720666140317917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.72321690636924,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.24609375,
+ "learning_rate": 0.0009998505347566186,
+ "loss": 0.0782,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 937545.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.053780000656843185,
+ "skip_count": 2.0,
+ "step": 580,
+ "text_loss": 0.3258405327796936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.7326093337246844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1416015625,
+ "learning_rate": 0.00099984287150191,
+ "loss": 0.0582,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 941001.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02637636847794056,
+ "skip_count": 0.0,
+ "step": 582,
+ "text_loss": 0.23762771487236023
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.7420017610801293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009998350167064705,
+ "loss": 0.0672,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 943989.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01637580618262291,
+ "skip_count": 0.0,
+ "step": 584,
+ "text_loss": 0.7460582852363586
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.7513941884355737,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1884765625,
+ "learning_rate": 0.0009998269703733096,
+ "loss": 0.0686,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 947245.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.13934117555618286,
+ "skip_count": 0.0,
+ "step": 586,
+ "text_loss": 0.5284690260887146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.7607866157910186,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.13671875,
+ "learning_rate": 0.0009998187325055106,
+ "loss": 0.0667,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 950116.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02138397842645645,
+ "skip_count": 0.0,
+ "step": 588,
+ "text_loss": 0.3920256197452545
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1533203125,
+ "learning_rate": 0.0009998103031062305,
+ "loss": 0.0778,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 953277.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007098200265318155,
+ "skip_count": 0.0,
+ "step": 590,
+ "text_loss": 0.7472905516624451
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.779571470501908,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.318359375,
+ "learning_rate": 0.0009998016821786994,
+ "loss": 0.0872,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 958229.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.07946522533893585,
+ "skip_count": 1.0,
+ "step": 592,
+ "text_loss": 0.5506448745727539
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.7889638978573528,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1357421875,
+ "learning_rate": 0.000999792869726221,
+ "loss": 0.0523,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 961016.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0850791186094284,
+ "skip_count": 1.0,
+ "step": 594,
+ "text_loss": 0.3824431002140045
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1650390625,
+ "learning_rate": 0.0009997838657521717,
+ "loss": 0.0632,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 963847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016370445489883423,
+ "skip_count": 0.0,
+ "step": 596,
+ "text_loss": 0.2139475792646408
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.8077487525682416,
+ "f1_execute": 0.923076868057251,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12890625,
+ "learning_rate": 0.0009997746702600026,
+ "loss": 0.0702,
+ "macro_f1": 0.307692289352417,
+ "num_tokens": 966619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.1310746818780899,
+ "skip_count": 3.0,
+ "step": 598,
+ "text_loss": 0.3651018440723419
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.8171411799236865,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.23828125,
+ "learning_rate": 0.0009997652832532372,
+ "loss": 0.0792,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 970418.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.14303378760814667,
+ "skip_count": 0.0,
+ "step": 600,
+ "text_loss": 0.7094736099243164
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.8265336072791314,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009997557047354722,
+ "loss": 0.0531,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 973491.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03334212675690651,
+ "skip_count": 1.0,
+ "step": 602,
+ "text_loss": 0.4812237024307251
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 2.835926034634576,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2890625,
+ "learning_rate": 0.0009997459347103783,
+ "loss": 0.0956,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 976672.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02831871062517166,
+ "skip_count": 0.0,
+ "step": 604,
+ "text_loss": 0.21737146377563477
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.8453184619900207,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1298828125,
+ "learning_rate": 0.0009997359731816998,
+ "loss": 0.0646,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 979898.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017968013882637024,
+ "skip_count": 0.0,
+ "step": 606,
+ "text_loss": 0.5458008050918579
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.854710889345465,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.224609375,
+ "learning_rate": 0.0009997258201532536,
+ "loss": 0.0751,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 982811.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016256732866168022,
+ "skip_count": 0.0,
+ "step": 608,
+ "text_loss": 0.8643257021903992
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2275390625,
+ "learning_rate": 0.0009997154756289303,
+ "loss": 0.0561,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 985245.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021214161068201065,
+ "skip_count": 0.0,
+ "step": 610,
+ "text_loss": 0.2204967886209488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.8734957440563544,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.150390625,
+ "learning_rate": 0.000999704939612694,
+ "loss": 0.0636,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 988539.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.23249399662017822,
+ "skip_count": 2.0,
+ "step": 612,
+ "text_loss": 0.32489025592803955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.8828881714117993,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009996942121085824,
+ "loss": 0.0445,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 991660.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010706410743296146,
+ "skip_count": 0.0,
+ "step": 614,
+ "text_loss": 0.4551754891872406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.8922805987672437,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.3671875,
+ "learning_rate": 0.000999683293120706,
+ "loss": 0.1016,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 994828.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006676184479147196,
+ "skip_count": 0.0,
+ "step": 616,
+ "text_loss": 0.6212068200111389
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.9016730261226886,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.408203125,
+ "learning_rate": 0.0009996721826532491,
+ "loss": 0.0976,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 997951.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.2148125320672989,
+ "skip_count": 2.0,
+ "step": 618,
+ "text_loss": 0.26514527201652527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.911065453478133,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1904296875,
+ "learning_rate": 0.000999660880710469,
+ "loss": 0.0909,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1001139.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022332455962896347,
+ "skip_count": 0.0,
+ "step": 620,
+ "text_loss": 0.26131340861320496
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.920457880833578,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.169921875,
+ "learning_rate": 0.0009996493872966971,
+ "loss": 0.0732,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1003678.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08348730951547623,
+ "skip_count": 0.0,
+ "step": 622,
+ "text_loss": 0.19151706993579865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.929850308189023,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.173828125,
+ "learning_rate": 0.0009996377024163374,
+ "loss": 0.0822,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1007082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.028577150776982307,
+ "skip_count": 0.0,
+ "step": 624,
+ "text_loss": 0.305387407541275
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.9392427355444672,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11279296875,
+ "learning_rate": 0.0009996258260738676,
+ "loss": 0.0892,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1010064.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08312026411294937,
+ "skip_count": 0.0,
+ "step": 626,
+ "text_loss": 0.49436143040657043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.9486351628999117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009996137582738388,
+ "loss": 0.0591,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1013462.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013337327167391777,
+ "skip_count": 0.0,
+ "step": 628,
+ "text_loss": 0.6515294313430786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.9580275902553566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.000999601499020875,
+ "loss": 0.0537,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1016246.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.029126765206456184,
+ "skip_count": 0.0,
+ "step": 630,
+ "text_loss": 0.18834827840328217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.9674200176108014,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009995890483196746,
+ "loss": 0.0602,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1019286.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.054844800382852554,
+ "skip_count": 1.0,
+ "step": 632,
+ "text_loss": 0.6988179087638855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.322265625,
+ "learning_rate": 0.0009995764061750086,
+ "loss": 0.0767,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1022207.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010095693171024323,
+ "skip_count": 0.0,
+ "step": 634,
+ "text_loss": 0.558451771736145
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.9862048723216907,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2890625,
+ "learning_rate": 0.000999563572591721,
+ "loss": 0.0521,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1025319.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0698433518409729,
+ "skip_count": 1.0,
+ "step": 636,
+ "text_loss": 0.5961872935295105
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 2.995597299677135,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11083984375,
+ "learning_rate": 0.0009995505475747302,
+ "loss": 0.0849,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1028362.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.040211405605077744,
+ "skip_count": 1.0,
+ "step": 638,
+ "text_loss": 0.546863317489624
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.004696213677722,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.119140625,
+ "learning_rate": 0.0009995373311290272,
+ "loss": 0.0709,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1032199.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.1457643061876297,
+ "skip_count": 1.0,
+ "step": 640,
+ "text_loss": 0.2137298285961151
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.014088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1279296875,
+ "learning_rate": 0.0009995239232596764,
+ "loss": 0.0545,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1035801.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011394930072128773,
+ "skip_count": 0.0,
+ "step": 642,
+ "text_loss": 0.43054503202438354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.0234810683886115,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1015625,
+ "learning_rate": 0.0009995103239718163,
+ "loss": 0.0665,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1039223.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00997432041913271,
+ "skip_count": 0.0,
+ "step": 644,
+ "text_loss": 0.7749615907669067
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.0328734957440564,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2275390625,
+ "learning_rate": 0.0009994965332706573,
+ "loss": 0.0755,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1042154.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.10589150339365005,
+ "skip_count": 0.0,
+ "step": 646,
+ "text_loss": 0.7812211513519287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.042265923099501,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 0.0009994825511614846,
+ "loss": 0.0383,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1045250.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0748734176158905,
+ "skip_count": 1.0,
+ "step": 648,
+ "text_loss": 0.844803512096405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.0516583504549457,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1220703125,
+ "learning_rate": 0.0009994683776496562,
+ "loss": 0.0433,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1048446.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03742415830492973,
+ "skip_count": 1.0,
+ "step": 650,
+ "text_loss": 0.2098839282989502
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.0610507778103906,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12890625,
+ "learning_rate": 0.0009994540127406034,
+ "loss": 0.0591,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1051840.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06025516986846924,
+ "skip_count": 2.0,
+ "step": 652,
+ "text_loss": 0.27727583050727844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.070443205165835,
+ "f1_execute": 0.8979591727256775,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.181640625,
+ "learning_rate": 0.0009994394564398306,
+ "loss": 0.0519,
+ "macro_f1": 0.521541953086853,
+ "num_tokens": 1055142.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.22807340323925018,
+ "skip_count": 2.0,
+ "step": 654,
+ "text_loss": 0.9672397971153259
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.130859375,
+ "learning_rate": 0.0009994247087529158,
+ "loss": 0.0618,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1057698.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01348950993269682,
+ "skip_count": 0.0,
+ "step": 656,
+ "text_loss": 0.6375506520271301
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.0892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1953125,
+ "learning_rate": 0.0009994097696855106,
+ "loss": 0.0412,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1060624.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009649243205785751,
+ "skip_count": 0.0,
+ "step": 658,
+ "text_loss": 0.5315385460853577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.098620487232169,
+ "f1_execute": 0.923076868057251,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2041015625,
+ "learning_rate": 0.0009993946392433395,
+ "loss": 0.0609,
+ "macro_f1": 0.307692289352417,
+ "num_tokens": 1065076.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.1250980943441391,
+ "skip_count": 3.0,
+ "step": 660,
+ "text_loss": 0.25780341029167175
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.1080129145876136,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1640625,
+ "learning_rate": 0.0009993793174322006,
+ "loss": 0.0471,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1068365.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011544390581548214,
+ "skip_count": 0.0,
+ "step": 662,
+ "text_loss": 0.34876301884651184
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.1174053419430585,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009993638042579654,
+ "loss": 0.0473,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1071693.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03777370601892471,
+ "skip_count": 1.0,
+ "step": 664,
+ "text_loss": 0.21811571717262268
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.126797769298503,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.203125,
+ "learning_rate": 0.0009993480997265783,
+ "loss": 0.0475,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1074733.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.049949806183576584,
+ "skip_count": 2.0,
+ "step": 666,
+ "text_loss": 0.38410288095474243
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.136190196653948,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10302734375,
+ "learning_rate": 0.0009993322038440572,
+ "loss": 0.0605,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1077993.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0247171800583601,
+ "skip_count": 0.0,
+ "step": 668,
+ "text_loss": 0.25576895475387573
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.1455826240093923,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.216796875,
+ "learning_rate": 0.000999316116616494,
+ "loss": 0.0619,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1080491.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008118715137243271,
+ "skip_count": 0.0,
+ "step": 670,
+ "text_loss": 0.6269792914390564
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.154975051364837,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.173828125,
+ "learning_rate": 0.0009992998380500527,
+ "loss": 0.0462,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1083817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03366057574748993,
+ "skip_count": 1.0,
+ "step": 672,
+ "text_loss": 0.26891493797302246
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.1643674787202816,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.0009992833681509716,
+ "loss": 0.0529,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1087368.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.020552074536681175,
+ "skip_count": 0.0,
+ "step": 674,
+ "text_loss": 0.14421936869621277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.1737599060757264,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.18359375,
+ "learning_rate": 0.0009992667069255619,
+ "loss": 0.0696,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 1090452.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06937336176633835,
+ "skip_count": 2.0,
+ "step": 676,
+ "text_loss": 0.24999259412288666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.1831523334311713,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08740234375,
+ "learning_rate": 0.0009992498543802085,
+ "loss": 0.0588,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1093996.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0380021296441555,
+ "skip_count": 0.0,
+ "step": 678,
+ "text_loss": 0.42473849654197693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.25,
+ "avg_layers": 27.0,
+ "epoch": 3.1925447607866158,
+ "f1_execute": 0.9200000166893005,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.2119140625,
+ "learning_rate": 0.0009992328105213688,
+ "loss": 0.0411,
+ "macro_f1": 0.4400000274181366,
+ "num_tokens": 1096837.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.20885063707828522,
+ "skip_count": 4.0,
+ "step": 680,
+ "text_loss": 0.3829527199268341
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.2019371881420606,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.0009992155753555747,
+ "loss": 0.0722,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1100320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018230699002742767,
+ "skip_count": 2.0,
+ "step": 682,
+ "text_loss": 0.6190969944000244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.211329615497505,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.30859375,
+ "learning_rate": 0.0009991981488894303,
+ "loss": 0.0681,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 1103682.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05550144240260124,
+ "skip_count": 1.0,
+ "step": 684,
+ "text_loss": 0.44418027997016907
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.22072204285295,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2158203125,
+ "learning_rate": 0.0009991805311296133,
+ "loss": 0.0507,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1106427.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07990608364343643,
+ "skip_count": 2.0,
+ "step": 686,
+ "text_loss": 0.5577231645584106
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.2301144702083944,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1962890625,
+ "learning_rate": 0.0009991627220828753,
+ "loss": 0.0568,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1109314.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05167485028505325,
+ "skip_count": 2.0,
+ "step": 688,
+ "text_loss": 0.27325430512428284
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.2395068975638392,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10693359375,
+ "learning_rate": 0.0009991447217560408,
+ "loss": 0.0521,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1112748.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04621964320540428,
+ "skip_count": 2.0,
+ "step": 690,
+ "text_loss": 0.5288321375846863
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.2488993249192837,
+ "f1_execute": 0.923076868057251,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1962890625,
+ "learning_rate": 0.000999126530156007,
+ "loss": 0.0499,
+ "macro_f1": 0.307692289352417,
+ "num_tokens": 1116965.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11950276792049408,
+ "skip_count": 2.0,
+ "step": 692,
+ "text_loss": 0.14215624332427979
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.2582917522747286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2353515625,
+ "learning_rate": 0.0009991081472897454,
+ "loss": 0.0722,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1120570.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01905500330030918,
+ "skip_count": 0.0,
+ "step": 694,
+ "text_loss": 0.41862696409225464
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.267684179630173,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1357421875,
+ "learning_rate": 0.0009990895731643002,
+ "loss": 0.0464,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1124009.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06974572688341141,
+ "skip_count": 0.0,
+ "step": 696,
+ "text_loss": 0.41160130500793457
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.277076606985618,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1923828125,
+ "learning_rate": 0.000999070807786789,
+ "loss": 0.0531,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1127370.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.07055293023586273,
+ "skip_count": 0.0,
+ "step": 698,
+ "text_loss": 0.48068273067474365
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.2864690343410627,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.197265625,
+ "learning_rate": 0.000999051851164403,
+ "loss": 0.0619,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1130234.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.12506946921348572,
+ "skip_count": 1.0,
+ "step": 700,
+ "text_loss": 0.47925490140914917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 0.000999032703304406,
+ "loss": 0.0674,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1132874.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00809287466108799,
+ "skip_count": 0.0,
+ "step": 702,
+ "text_loss": 0.47433632612228394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.305253889051952,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1064453125,
+ "learning_rate": 0.0009990133642141358,
+ "loss": 0.0497,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1136011.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0319170281291008,
+ "skip_count": 2.0,
+ "step": 704,
+ "text_loss": 0.6574832201004028
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.3146463164073965,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.33984375,
+ "learning_rate": 0.000998993833901003,
+ "loss": 0.0619,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1139674.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09850362688302994,
+ "skip_count": 2.0,
+ "step": 706,
+ "text_loss": 0.7660127282142639
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.3240387437628414,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12158203125,
+ "learning_rate": 0.0009989741123724919,
+ "loss": 0.0574,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1143558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006673311349004507,
+ "skip_count": 0.0,
+ "step": 708,
+ "text_loss": 0.5976111888885498
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.333431171118286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.0009989541996361594,
+ "loss": 0.045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1146122.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004988791421055794,
+ "skip_count": 0.0,
+ "step": 710,
+ "text_loss": 0.5256119966506958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.3428235984737307,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1044921875,
+ "learning_rate": 0.0009989340956996367,
+ "loss": 0.0528,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1149546.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0067769973538815975,
+ "skip_count": 0.0,
+ "step": 712,
+ "text_loss": 0.5040497779846191
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.352216025829175,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.26953125,
+ "learning_rate": 0.0009989138005706273,
+ "loss": 0.0735,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1153195.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09899546951055527,
+ "skip_count": 2.0,
+ "step": 714,
+ "text_loss": 0.20803412795066833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1396484375,
+ "learning_rate": 0.000998893314256908,
+ "loss": 0.064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1157081.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010492355562746525,
+ "skip_count": 0.0,
+ "step": 716,
+ "text_loss": 0.23077639937400818
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.3710008805400644,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1298828125,
+ "learning_rate": 0.0009988726367663298,
+ "loss": 0.0539,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1160079.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01063773687928915,
+ "skip_count": 0.0,
+ "step": 718,
+ "text_loss": 0.6085864901542664
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.3803933078955093,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1640625,
+ "learning_rate": 0.0009988517681068163,
+ "loss": 0.0421,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1163249.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05981874838471413,
+ "skip_count": 0.0,
+ "step": 720,
+ "text_loss": 0.4047050476074219
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.3897857352509537,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.171875,
+ "learning_rate": 0.0009988307082863638,
+ "loss": 0.0361,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1166259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009750043973326683,
+ "skip_count": 0.0,
+ "step": 722,
+ "text_loss": 0.5306474566459656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.3991781626063986,
+ "f1_execute": 0.9411765336990356,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.240234375,
+ "learning_rate": 0.0009988094573130434,
+ "loss": 0.063,
+ "macro_f1": 0.5359477400779724,
+ "num_tokens": 1168887.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.18601104617118835,
+ "skip_count": 2.0,
+ "step": 724,
+ "text_loss": 0.53528892993927
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.408570589961843,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.142578125,
+ "learning_rate": 0.0009987880151949974,
+ "loss": 0.0496,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1172625.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02845010720193386,
+ "skip_count": 1.0,
+ "step": 726,
+ "text_loss": 0.4760453701019287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.417963017317288,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2177734375,
+ "learning_rate": 0.0009987663819404434,
+ "loss": 0.06,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1176580.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017596980556845665,
+ "skip_count": 2.0,
+ "step": 728,
+ "text_loss": 0.5146099328994751
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.427355444672733,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1318359375,
+ "learning_rate": 0.000998744557557671,
+ "loss": 0.0484,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1179804.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0625474750995636,
+ "skip_count": 1.0,
+ "step": 730,
+ "text_loss": 0.27738022804260254
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.436747872028177,
+ "f1_execute": 0.923076868057251,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.203125,
+ "learning_rate": 0.0009987225420550433,
+ "loss": 0.0796,
+ "macro_f1": 0.307692289352417,
+ "num_tokens": 1182658.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.16188351809978485,
+ "skip_count": 2.0,
+ "step": 732,
+ "text_loss": 0.23231445252895355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.446140299383622,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2001953125,
+ "learning_rate": 0.0009987003354409965,
+ "loss": 0.0626,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1185451.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02391529455780983,
+ "skip_count": 0.0,
+ "step": 734,
+ "text_loss": 0.4496627151966095
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.4555327267390665,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.234375,
+ "learning_rate": 0.0009986779377240405,
+ "loss": 0.0513,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 1188666.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08435963839292526,
+ "skip_count": 1.0,
+ "step": 736,
+ "text_loss": 0.4950787127017975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.4649251540945114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1220703125,
+ "learning_rate": 0.000998655348912758,
+ "loss": 0.0515,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1193035.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01648722216486931,
+ "skip_count": 0.0,
+ "step": 738,
+ "text_loss": 0.24761848151683807
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1513671875,
+ "learning_rate": 0.0009986325690158051,
+ "loss": 0.0435,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1196840.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013143910095095634,
+ "skip_count": 0.0,
+ "step": 740,
+ "text_loss": 0.15662719309329987
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.4837100088054007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009986095980419113,
+ "loss": 0.0757,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1200573.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026706280186772346,
+ "skip_count": 0.0,
+ "step": 742,
+ "text_loss": 0.16725164651870728
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.493102436160845,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1982421875,
+ "learning_rate": 0.0009985864359998787,
+ "loss": 0.0795,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 1203589.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.28607678413391113,
+ "skip_count": 3.0,
+ "step": 744,
+ "text_loss": 0.6350882053375244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.50249486351629,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.0009985630828985835,
+ "loss": 0.0572,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1206422.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05685260891914368,
+ "skip_count": 1.0,
+ "step": 746,
+ "text_loss": 0.33779552578926086
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.5118872908717345,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.09814453125,
+ "learning_rate": 0.0009985395387469742,
+ "loss": 0.0458,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1211588.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0437830351293087,
+ "skip_count": 2.0,
+ "step": 748,
+ "text_loss": 0.28664472699165344
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.5212797182271793,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15625,
+ "learning_rate": 0.0009985158035540735,
+ "loss": 0.0714,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1214580.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.07074898481369019,
+ "skip_count": 0.0,
+ "step": 750,
+ "text_loss": 0.3939313292503357
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.21484375,
+ "learning_rate": 0.0009984918773289762,
+ "loss": 0.0699,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1217388.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009757856838405132,
+ "skip_count": 0.0,
+ "step": 752,
+ "text_loss": 0.37641215324401855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.5400645729380686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.0009984677600808512,
+ "loss": 0.054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1219960.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02515069581568241,
+ "skip_count": 0.0,
+ "step": 754,
+ "text_loss": 0.155938982963562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.5494570002935135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.30078125,
+ "learning_rate": 0.0009984434518189405,
+ "loss": 0.0764,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1223234.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025766927748918533,
+ "skip_count": 0.0,
+ "step": 756,
+ "text_loss": 0.691118061542511
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 3.558849427648958,
+ "f1_execute": 0.9411765336990356,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1416015625,
+ "learning_rate": 0.0009984189525525584,
+ "loss": 0.0451,
+ "macro_f1": 0.5359477400779724,
+ "num_tokens": 1225764.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.1782722771167755,
+ "skip_count": 2.0,
+ "step": 758,
+ "text_loss": 0.3592209219932556
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.568241855004403,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.189453125,
+ "learning_rate": 0.0009983942622910935,
+ "loss": 0.0659,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1230097.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00825568474829197,
+ "skip_count": 0.0,
+ "step": 760,
+ "text_loss": 0.4646475315093994
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.5776342823598473,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1962890625,
+ "learning_rate": 0.0009983693810440074,
+ "loss": 0.0477,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1233140.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04156976938247681,
+ "skip_count": 2.0,
+ "step": 762,
+ "text_loss": 0.298682302236557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.587026709715292,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.3515625,
+ "learning_rate": 0.000998344308820834,
+ "loss": 0.0666,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1236305.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05697929114103317,
+ "skip_count": 1.0,
+ "step": 764,
+ "text_loss": 0.5249121189117432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.5964191370707366,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.18359375,
+ "learning_rate": 0.0009983190456311817,
+ "loss": 0.0592,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1239673.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09547408670186996,
+ "skip_count": 3.0,
+ "step": 766,
+ "text_loss": 0.41277334094047546
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 3.6058115644261814,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.185546875,
+ "learning_rate": 0.000998293591484731,
+ "loss": 0.0484,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1242292.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030693158507347107,
+ "skip_count": 2.0,
+ "step": 768,
+ "text_loss": 0.1583656519651413
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.615203991781626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15234375,
+ "learning_rate": 0.000998267946391236,
+ "loss": 0.051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1244661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01211300864815712,
+ "skip_count": 0.0,
+ "step": 770,
+ "text_loss": 0.4629349112510681
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.6245964191370708,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.0009982421103605238,
+ "loss": 0.0441,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1248688.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0665968507528305,
+ "skip_count": 2.0,
+ "step": 772,
+ "text_loss": 0.4019293785095215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.6339888464925156,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2890625,
+ "learning_rate": 0.000998216083402495,
+ "loss": 0.0613,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1251395.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07186859846115112,
+ "skip_count": 2.0,
+ "step": 774,
+ "text_loss": 0.4659276604652405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.302734375,
+ "learning_rate": 0.0009981898655271235,
+ "loss": 0.0488,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1254888.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007823926396667957,
+ "skip_count": 0.0,
+ "step": 776,
+ "text_loss": 0.5160359740257263
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 3.6527737012034045,
+ "f1_execute": 0.9130434989929199,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.11962890625,
+ "learning_rate": 0.0009981634567444557,
+ "loss": 0.0775,
+ "macro_f1": 0.590062141418457,
+ "num_tokens": 1258250.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.24624499678611755,
+ "skip_count": 4.0,
+ "step": 778,
+ "text_loss": 0.29319918155670166
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.6621661285588494,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.0009981368570646115,
+ "loss": 0.0885,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1260916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030730176717042923,
+ "skip_count": 1.0,
+ "step": 780,
+ "text_loss": 0.624981164932251
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.6715585559142943,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.142578125,
+ "learning_rate": 0.0009981100664977838,
+ "loss": 0.0699,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1264004.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006829176563769579,
+ "skip_count": 0.0,
+ "step": 782,
+ "text_loss": 0.6137266159057617
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.6809509832697387,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1748046875,
+ "learning_rate": 0.0009980830850542391,
+ "loss": 0.058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1267130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018471000716090202,
+ "skip_count": 0.0,
+ "step": 784,
+ "text_loss": 0.15213175117969513
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.6903434106251836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2353515625,
+ "learning_rate": 0.0009980559127443166,
+ "loss": 0.052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1271129.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007903140969574451,
+ "skip_count": 0.0,
+ "step": 786,
+ "text_loss": 0.5768613219261169
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.699735837980628,
+ "f1_execute": 0.923076868057251,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.130859375,
+ "learning_rate": 0.000998028549578429,
+ "loss": 0.0719,
+ "macro_f1": 0.307692289352417,
+ "num_tokens": 1274232.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06737866252660751,
+ "skip_count": 3.0,
+ "step": 788,
+ "text_loss": 0.2877073585987091
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.709128265336073,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1748046875,
+ "learning_rate": 0.0009980009955670615,
+ "loss": 0.0698,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1277193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.10194934904575348,
+ "skip_count": 3.0,
+ "step": 790,
+ "text_loss": 0.11860492825508118
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.7185206926915173,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.126953125,
+ "learning_rate": 0.000997973250720773,
+ "loss": 0.0552,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1280960.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.10297708213329315,
+ "skip_count": 2.0,
+ "step": 792,
+ "text_loss": 0.13477706909179688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.727913120046962,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009979453150501954,
+ "loss": 0.0663,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1284611.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06122037023305893,
+ "skip_count": 1.0,
+ "step": 794,
+ "text_loss": 0.40569379925727844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.737305547402407,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1279296875,
+ "learning_rate": 0.000997917188566034,
+ "loss": 0.062,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1287834.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.061135001480579376,
+ "skip_count": 2.0,
+ "step": 796,
+ "text_loss": 0.2829287648200989
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.7466979747578515,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.109375,
+ "learning_rate": 0.0009978888712790664,
+ "loss": 0.0654,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1291666.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04841872677206993,
+ "skip_count": 1.0,
+ "step": 798,
+ "text_loss": 1.011757254600525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.20000000298023224,
+ "avg_layers": 27.0,
+ "epoch": 3.756090402113296,
+ "f1_execute": 0.8979591727256775,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.3333333134651184,
+ "grad_norm": 0.14453125,
+ "learning_rate": 0.0009978603632001444,
+ "loss": 0.0636,
+ "macro_f1": 0.4104308485984802,
+ "num_tokens": 1294627.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.15698759257793427,
+ "skip_count": 5.0,
+ "step": 800,
+ "text_loss": 0.4457623362541199
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.765482829468741,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.0009978316643401916,
+ "loss": 0.0688,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1297711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018952010199427605,
+ "skip_count": 0.0,
+ "step": 802,
+ "text_loss": 0.2069481462240219
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.7748752568241857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.14453125,
+ "learning_rate": 0.0009978027747102062,
+ "loss": 0.0479,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1300569.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014538386836647987,
+ "skip_count": 0.0,
+ "step": 804,
+ "text_loss": 0.4983852505683899
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.78426768417963,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2109375,
+ "learning_rate": 0.0009977736943212584,
+ "loss": 0.0721,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1303969.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.11164087057113647,
+ "skip_count": 2.0,
+ "step": 806,
+ "text_loss": 0.2910642921924591
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.793660111535075,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1826171875,
+ "learning_rate": 0.000997744423184492,
+ "loss": 0.0424,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1307263.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06073406711220741,
+ "skip_count": 1.0,
+ "step": 808,
+ "text_loss": 0.18831779062747955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.25,
+ "avg_layers": 27.0,
+ "epoch": 3.8030525388905194,
+ "f1_execute": 0.9200000166893005,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.26171875,
+ "learning_rate": 0.0009977149613111236,
+ "loss": 0.0486,
+ "macro_f1": 0.4400000274181366,
+ "num_tokens": 1309953.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11035524308681488,
+ "skip_count": 4.0,
+ "step": 810,
+ "text_loss": 0.7872759699821472
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.8124449662459643,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1650390625,
+ "learning_rate": 0.0009976853087124433,
+ "loss": 0.0536,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1313243.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021804286167025566,
+ "skip_count": 0.0,
+ "step": 812,
+ "text_loss": 0.22349292039871216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.8218373936014087,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.28125,
+ "learning_rate": 0.0009976554653998138,
+ "loss": 0.0612,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 1316165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.10715524107217789,
+ "skip_count": 2.0,
+ "step": 814,
+ "text_loss": 0.18035532534122467
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.8312298209568536,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1279296875,
+ "learning_rate": 0.000997625431384671,
+ "loss": 0.0564,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1319206.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007173649035394192,
+ "skip_count": 0.0,
+ "step": 816,
+ "text_loss": 0.48928648233413696
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.8406222483122985,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1357421875,
+ "learning_rate": 0.0009975952066785243,
+ "loss": 0.0655,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 1322549.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.22308112680912018,
+ "skip_count": 4.0,
+ "step": 818,
+ "text_loss": 0.5211259722709656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.850014675667743,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1337890625,
+ "learning_rate": 0.0009975647912929557,
+ "loss": 0.0564,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1325213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00998698640614748,
+ "skip_count": 0.0,
+ "step": 820,
+ "text_loss": 0.7117052674293518
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.8594071030231873,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15234375,
+ "learning_rate": 0.0009975341852396205,
+ "loss": 0.0723,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1328383.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07454588264226913,
+ "skip_count": 2.0,
+ "step": 822,
+ "text_loss": 0.34539610147476196
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.8687995303786322,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1630859375,
+ "learning_rate": 0.0009975033885302469,
+ "loss": 0.0604,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1331406.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009157589636743069,
+ "skip_count": 0.0,
+ "step": 824,
+ "text_loss": 0.7484824657440186
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.878191957734077,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1923828125,
+ "learning_rate": 0.0009974724011766363,
+ "loss": 0.0474,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1334410.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.17149391770362854,
+ "skip_count": 0.0,
+ "step": 826,
+ "text_loss": 0.5913820266723633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.8875843850895215,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1884765625,
+ "learning_rate": 0.0009974412231906632,
+ "loss": 0.058,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1337653.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09743282198905945,
+ "skip_count": 1.0,
+ "step": 828,
+ "text_loss": 0.2505693733692169
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.8969768124449664,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1533203125,
+ "learning_rate": 0.0009974098545842748,
+ "loss": 0.0638,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1340860.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.041490405797958374,
+ "skip_count": 1.0,
+ "step": 830,
+ "text_loss": 0.5585370063781738
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 3.906369239800411,
+ "f1_execute": 0.9019607901573181,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.193359375,
+ "learning_rate": 0.0009973782953694918,
+ "loss": 0.0746,
+ "macro_f1": 0.3006536066532135,
+ "num_tokens": 1344232.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.16080693900585175,
+ "skip_count": 3.0,
+ "step": 832,
+ "text_loss": 0.4782734513282776
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.9157616671558557,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1298828125,
+ "learning_rate": 0.000997346545558408,
+ "loss": 0.0522,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1347667.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01173500344157219,
+ "skip_count": 0.0,
+ "step": 834,
+ "text_loss": 0.25036177039146423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.173828125,
+ "learning_rate": 0.0009973146051631895,
+ "loss": 0.0522,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1350707.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011477196589112282,
+ "skip_count": 0.0,
+ "step": 836,
+ "text_loss": 0.5482863187789917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.934546521866745,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1650390625,
+ "learning_rate": 0.0009972824741960764,
+ "loss": 0.0536,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1353704.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010528896935284138,
+ "skip_count": 0.0,
+ "step": 838,
+ "text_loss": 0.6732596158981323
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.9439389492221895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1181640625,
+ "learning_rate": 0.000997250152669381,
+ "loss": 0.0573,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1356608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010678744874894619,
+ "skip_count": 0.0,
+ "step": 840,
+ "text_loss": 0.5479338765144348
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.9533313765776343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.181640625,
+ "learning_rate": 0.000997217640595489,
+ "loss": 0.0631,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1359809.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00835978239774704,
+ "skip_count": 0.0,
+ "step": 842,
+ "text_loss": 0.42543259263038635
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.9627238039330788,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1923828125,
+ "learning_rate": 0.0009971849379868593,
+ "loss": 0.0653,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1362201.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009930923581123352,
+ "skip_count": 0.0,
+ "step": 844,
+ "text_loss": 0.720462441444397
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.9721162312885236,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1123046875,
+ "learning_rate": 0.0009971520448560235,
+ "loss": 0.0615,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1365790.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06344373524188995,
+ "skip_count": 1.0,
+ "step": 846,
+ "text_loss": 0.8423607349395752
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 26.0,
+ "epoch": 3.9815086586439685,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.16796875,
+ "learning_rate": 0.000997118961215586,
+ "loss": 0.0674,
+ "macro_f1": 0.4533333480358124,
+ "num_tokens": 1368387.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.14688406884670258,
+ "skip_count": 3.0,
+ "step": 848,
+ "text_loss": 0.3933577537536621
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 3.990901085999413,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.000997085687078225,
+ "loss": 0.0518,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1371189.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009953443892300129,
+ "skip_count": 0.0,
+ "step": 850,
+ "text_loss": 0.41469162702560425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 4.0,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15625,
+ "learning_rate": 0.0009970522224566909,
+ "loss": 0.0555,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 1374008.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.048870690166950226,
+ "skip_count": 1.0,
+ "step": 852,
+ "text_loss": 0.613615870475769
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.009392427355444,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.0009970185673638075,
+ "loss": 0.0629,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1376662.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06865929812192917,
+ "skip_count": 1.0,
+ "step": 854,
+ "text_loss": 0.4392736256122589
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 4.01878485471089,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.162109375,
+ "learning_rate": 0.0009969847218124716,
+ "loss": 0.0506,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1380049.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02382219396531582,
+ "skip_count": 1.0,
+ "step": 856,
+ "text_loss": 0.19115346670150757
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.028177282066334,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1884765625,
+ "learning_rate": 0.0009969506858156527,
+ "loss": 0.0344,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1383008.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03907281160354614,
+ "skip_count": 1.0,
+ "step": 858,
+ "text_loss": 0.34842637181282043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.037569709421779,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12060546875,
+ "learning_rate": 0.0009969164593863935,
+ "loss": 0.0365,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1387051.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007645803038030863,
+ "skip_count": 0.0,
+ "step": 860,
+ "text_loss": 0.3810436725616455
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.046962136777223,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1484375,
+ "learning_rate": 0.0009968820425378098,
+ "loss": 0.0463,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1390244.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04435238987207413,
+ "skip_count": 0.0,
+ "step": 862,
+ "text_loss": 0.34853485226631165
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.056354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.28515625,
+ "learning_rate": 0.00099684743528309,
+ "loss": 0.0424,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1392976.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006071661598980427,
+ "skip_count": 0.0,
+ "step": 864,
+ "text_loss": 0.6395178437232971
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.065746991488113,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0810546875,
+ "learning_rate": 0.0009968126376354958,
+ "loss": 0.0477,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1396061.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05011235550045967,
+ "skip_count": 2.0,
+ "step": 866,
+ "text_loss": 0.09103966504335403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.075139418843557,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.0009967776496083616,
+ "loss": 0.0509,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1398993.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03979124873876572,
+ "skip_count": 0.0,
+ "step": 868,
+ "text_loss": 0.27257058024406433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.084531846199002,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.14453125,
+ "learning_rate": 0.000996742471215095,
+ "loss": 0.0516,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1402080.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030823837965726852,
+ "skip_count": 2.0,
+ "step": 870,
+ "text_loss": 0.7047103047370911
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.093924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009967071024691763,
+ "loss": 0.0461,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1404890.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009721715934574604,
+ "skip_count": 0.0,
+ "step": 872,
+ "text_loss": 0.959106981754303
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.103316700909891,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1142578125,
+ "learning_rate": 0.000996671543384159,
+ "loss": 0.05,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1407853.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006025883834809065,
+ "skip_count": 0.0,
+ "step": 874,
+ "text_loss": 0.47571972012519836
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 4.112709128265336,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09765625,
+ "learning_rate": 0.0009966357939736692,
+ "loss": 0.0416,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1410723.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025964925065636635,
+ "skip_count": 0.0,
+ "step": 876,
+ "text_loss": 0.4964611530303955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.122101555620781,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09423828125,
+ "learning_rate": 0.0009965998542514065,
+ "loss": 0.0415,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1414008.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09509637206792831,
+ "skip_count": 2.0,
+ "step": 878,
+ "text_loss": 0.621494710445404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 4.131493982976226,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.11083984375,
+ "learning_rate": 0.0009965637242311427,
+ "loss": 0.0472,
+ "macro_f1": 0.542222261428833,
+ "num_tokens": 1417447.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02520318515598774,
+ "skip_count": 4.0,
+ "step": 880,
+ "text_loss": 0.40209758281707764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 25.0,
+ "epoch": 4.14088641033167,
+ "f1_execute": 0.936170220375061,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.263671875,
+ "learning_rate": 0.000996527403926723,
+ "loss": 0.0495,
+ "macro_f1": 0.5342789888381958,
+ "num_tokens": 1419905.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.13183781504631042,
+ "skip_count": 6.0,
+ "step": 882,
+ "text_loss": 0.642185389995575
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.1502788376871145,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1201171875,
+ "learning_rate": 0.0009964908933520655,
+ "loss": 0.0375,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1423436.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009429510682821274,
+ "skip_count": 0.0,
+ "step": 884,
+ "text_loss": 0.48232755064964294
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.15967126504256,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1669921875,
+ "learning_rate": 0.0009964541925211613,
+ "loss": 0.0349,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1426842.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07629609107971191,
+ "skip_count": 2.0,
+ "step": 886,
+ "text_loss": 0.16620934009552002
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.169063692398004,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.0009964173014480738,
+ "loss": 0.0348,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1430430.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.036814019083976746,
+ "skip_count": 2.0,
+ "step": 888,
+ "text_loss": 0.4866008758544922
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.178456119753449,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1123046875,
+ "learning_rate": 0.0009963802201469398,
+ "loss": 0.0476,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1433821.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041250260546803474,
+ "skip_count": 0.0,
+ "step": 890,
+ "text_loss": 0.578216552734375
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.187848547108893,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2373046875,
+ "learning_rate": 0.0009963429486319693,
+ "loss": 0.0463,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1436976.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06213559955358505,
+ "skip_count": 2.0,
+ "step": 892,
+ "text_loss": 0.221701517701149
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 4.197240974464338,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.361328125,
+ "learning_rate": 0.0009963054869174446,
+ "loss": 0.0313,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 1440397.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07532428950071335,
+ "skip_count": 2.0,
+ "step": 894,
+ "text_loss": 0.6922838091850281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.206633401819783,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1572265625,
+ "learning_rate": 0.0009962678350177209,
+ "loss": 0.0472,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1443604.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0419243648648262,
+ "skip_count": 1.0,
+ "step": 896,
+ "text_loss": 0.22092342376708984
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.216025829175227,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1015625,
+ "learning_rate": 0.0009962299929472268,
+ "loss": 0.034,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1446257.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.10849297791719437,
+ "skip_count": 0.0,
+ "step": 898,
+ "text_loss": 0.26394811272621155
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.225418256530672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10205078125,
+ "learning_rate": 0.000996191960720463,
+ "loss": 0.0394,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1449669.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0092767970636487,
+ "skip_count": 0.0,
+ "step": 900,
+ "text_loss": 0.5338577628135681
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.234810683886117,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.0009961537383520042,
+ "loss": 0.0354,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1452450.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02985367365181446,
+ "skip_count": 0.0,
+ "step": 902,
+ "text_loss": 0.5875228047370911
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.2442031112415615,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10205078125,
+ "learning_rate": 0.0009961153258564966,
+ "loss": 0.0378,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1456909.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06794842332601547,
+ "skip_count": 3.0,
+ "step": 904,
+ "text_loss": 0.40959444642066956
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.253595538597006,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009960767232486604,
+ "loss": 0.0476,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1461712.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023562447167932987,
+ "skip_count": 0.0,
+ "step": 906,
+ "text_loss": 0.3932875096797943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.262987965952451,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.000996037930543288,
+ "loss": 0.0505,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1464817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03880339860916138,
+ "skip_count": 1.0,
+ "step": 908,
+ "text_loss": 0.17482402920722961
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.272380393307896,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2119140625,
+ "learning_rate": 0.000995998947755245,
+ "loss": 0.0479,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1467810.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01736828312277794,
+ "skip_count": 1.0,
+ "step": 910,
+ "text_loss": 0.4140470325946808
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.169921875,
+ "learning_rate": 0.0009959597748994695,
+ "loss": 0.0752,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1470802.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011824851855635643,
+ "skip_count": 0.0,
+ "step": 912,
+ "text_loss": 0.7153383493423462
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 4.2911652480187845,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1455078125,
+ "learning_rate": 0.0009959204119909726,
+ "loss": 0.0421,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1474539.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025456594303250313,
+ "skip_count": 0.0,
+ "step": 914,
+ "text_loss": 0.42812058329582214
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.142578125,
+ "learning_rate": 0.0009958808590448385,
+ "loss": 0.0489,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1477552.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006795851048082113,
+ "skip_count": 0.0,
+ "step": 916,
+ "text_loss": 0.5402814149856567
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.309950102729674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1083984375,
+ "learning_rate": 0.0009958411160762234,
+ "loss": 0.039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1482547.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015615932643413544,
+ "skip_count": 0.0,
+ "step": 918,
+ "text_loss": 0.3836168050765991
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.319342530085119,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08984375,
+ "learning_rate": 0.0009958011831003577,
+ "loss": 0.0448,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1485807.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.043541423976421356,
+ "skip_count": 1.0,
+ "step": 920,
+ "text_loss": 0.4333936274051666
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 4.328734957440563,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1337890625,
+ "learning_rate": 0.000995761060132543,
+ "loss": 0.0418,
+ "macro_f1": 0.6538461446762085,
+ "num_tokens": 1488941.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05866432189941406,
+ "skip_count": 2.0,
+ "step": 922,
+ "text_loss": 0.4106994867324829
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.3381273847960085,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1630859375,
+ "learning_rate": 0.0009957207471881552,
+ "loss": 0.0531,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1492026.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02714901603758335,
+ "skip_count": 2.0,
+ "step": 924,
+ "text_loss": 0.542091429233551
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.347519812151453,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1796875,
+ "learning_rate": 0.0009956802442826415,
+ "loss": 0.0386,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1494543.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0563737191259861,
+ "skip_count": 0.0,
+ "step": 926,
+ "text_loss": 0.47209203243255615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.356912239506897,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1259765625,
+ "learning_rate": 0.0009956395514315235,
+ "loss": 0.0496,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1497831.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03285066783428192,
+ "skip_count": 0.0,
+ "step": 928,
+ "text_loss": 0.6628931164741516
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.366304666862343,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.0009955986686503943,
+ "loss": 0.0466,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1501375.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.024297121912240982,
+ "skip_count": 1.0,
+ "step": 930,
+ "text_loss": 0.495676189661026
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.25,
+ "avg_layers": 28.0,
+ "epoch": 4.375697094217787,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.11181640625,
+ "learning_rate": 0.0009955575959549202,
+ "loss": 0.0424,
+ "macro_f1": 0.7795917987823486,
+ "num_tokens": 1504363.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.12196464836597443,
+ "skip_count": 4.0,
+ "step": 932,
+ "text_loss": 0.26123273372650146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.3850895215732315,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1708984375,
+ "learning_rate": 0.0009955163333608408,
+ "loss": 0.0538,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1507178.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012947078794240952,
+ "skip_count": 0.0,
+ "step": 934,
+ "text_loss": 0.32552677392959595
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.394481948928676,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.0009954748808839674,
+ "loss": 0.0379,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1509910.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008946365676820278,
+ "skip_count": 0.0,
+ "step": 936,
+ "text_loss": 0.533141016960144
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.403874376284121,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.000995433238540185,
+ "loss": 0.0466,
+ "macro_f1": 0.6538461446762085,
+ "num_tokens": 1512826.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.029975678771734238,
+ "skip_count": 1.0,
+ "step": 938,
+ "text_loss": 0.2953577935695648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.413266803639566,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10888671875,
+ "learning_rate": 0.0009953914063454512,
+ "loss": 0.0497,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1517230.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0889134630560875,
+ "skip_count": 2.0,
+ "step": 940,
+ "text_loss": 0.5368834733963013
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.193359375,
+ "learning_rate": 0.000995349384315796,
+ "loss": 0.0413,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1519876.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013458753935992718,
+ "skip_count": 0.0,
+ "step": 942,
+ "text_loss": 0.2005518227815628
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 4.432051658350455,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1357421875,
+ "learning_rate": 0.000995307172467322,
+ "loss": 0.0444,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 1522998.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08850377053022385,
+ "skip_count": 1.0,
+ "step": 944,
+ "text_loss": 0.227926567196846
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.4414440857059,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1435546875,
+ "learning_rate": 0.0009952647708162054,
+ "loss": 0.0503,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1527100.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03199794515967369,
+ "skip_count": 1.0,
+ "step": 946,
+ "text_loss": 0.4859686493873596
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.450836513061344,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1748046875,
+ "learning_rate": 0.0009952221793786942,
+ "loss": 0.0354,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1530028.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006507779937237501,
+ "skip_count": 0.0,
+ "step": 948,
+ "text_loss": 0.6855354905128479
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.460228940416789,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10986328125,
+ "learning_rate": 0.0009951793981711097,
+ "loss": 0.0584,
+ "macro_f1": 0.6538461446762085,
+ "num_tokens": 1533254.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06175103038549423,
+ "skip_count": 1.0,
+ "step": 950,
+ "text_loss": 0.7590400576591492
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.469621367772234,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1025390625,
+ "learning_rate": 0.0009951364272098458,
+ "loss": 0.0295,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1536239.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03773383051156998,
+ "skip_count": 2.0,
+ "step": 952,
+ "text_loss": 0.669784665107727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.4790137951276785,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1748046875,
+ "learning_rate": 0.0009950932665113688,
+ "loss": 0.0507,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1539682.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07280613481998444,
+ "skip_count": 2.0,
+ "step": 954,
+ "text_loss": 0.3365570902824402
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.488406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12255859375,
+ "learning_rate": 0.0009950499160922184,
+ "loss": 0.0541,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1542875.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01770266517996788,
+ "skip_count": 0.0,
+ "step": 956,
+ "text_loss": 0.0921545997262001
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.497798649838567,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09375,
+ "learning_rate": 0.000995006375969006,
+ "loss": 0.0473,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1547135.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.07672002166509628,
+ "skip_count": 0.0,
+ "step": 958,
+ "text_loss": 0.5887606739997864
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.507191077194013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1376953125,
+ "learning_rate": 0.0009949626461584165,
+ "loss": 0.043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1550100.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006247182376682758,
+ "skip_count": 0.0,
+ "step": 960,
+ "text_loss": 0.5777931213378906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.516583504549457,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.119140625,
+ "learning_rate": 0.0009949187266772076,
+ "loss": 0.0366,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1553192.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030319908633828163,
+ "skip_count": 2.0,
+ "step": 962,
+ "text_loss": 0.2370252162218094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.5259759319049016,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.0009948746175422088,
+ "loss": 0.0511,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1556318.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006004320923238993,
+ "skip_count": 0.0,
+ "step": 964,
+ "text_loss": 0.6271032094955444
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15234375,
+ "learning_rate": 0.000994830318770323,
+ "loss": 0.0514,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1559195.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011544366367161274,
+ "skip_count": 0.0,
+ "step": 966,
+ "text_loss": 0.47256720066070557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 4.544760786615791,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.171875,
+ "learning_rate": 0.0009947858303785255,
+ "loss": 0.0374,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 1561813.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05258861929178238,
+ "skip_count": 1.0,
+ "step": 968,
+ "text_loss": 0.7703132629394531
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.554153213971236,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1142578125,
+ "learning_rate": 0.0009947411523838648,
+ "loss": 0.0453,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1564634.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011216280050575733,
+ "skip_count": 0.0,
+ "step": 970,
+ "text_loss": 0.4666804075241089
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1533203125,
+ "learning_rate": 0.0009946962848034608,
+ "loss": 0.0696,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1567959.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009387624450027943,
+ "skip_count": 0.0,
+ "step": 972,
+ "text_loss": 0.4067264199256897
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.5729380686821255,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.203125,
+ "learning_rate": 0.0009946512276545075,
+ "loss": 0.0397,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1571221.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.041713520884513855,
+ "skip_count": 0.0,
+ "step": 974,
+ "text_loss": 0.5242366194725037
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 4.58233049603757,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.0009946059809542705,
+ "loss": 0.0487,
+ "macro_f1": 0.7644445300102234,
+ "num_tokens": 1575033.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.05748331546783447,
+ "skip_count": 2.0,
+ "step": 976,
+ "text_loss": 0.5704690217971802
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 4.591722923393014,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1396484375,
+ "learning_rate": 0.0009945605447200887,
+ "loss": 0.0445,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1579050.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016765203326940536,
+ "skip_count": 0.0,
+ "step": 978,
+ "text_loss": 0.4804173707962036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.601115350748459,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1337890625,
+ "learning_rate": 0.0009945149189693732,
+ "loss": 0.0406,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1582967.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021518222987651825,
+ "skip_count": 2.0,
+ "step": 980,
+ "text_loss": 0.4138598144054413
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.610507778103904,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11474609375,
+ "learning_rate": 0.0009944691037196078,
+ "loss": 0.0456,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1586282.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012246460653841496,
+ "skip_count": 0.0,
+ "step": 982,
+ "text_loss": 0.22561736404895782
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 24.0,
+ "epoch": 4.6199002054593485,
+ "f1_execute": 0.930232584476471,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.8000000715255737,
+ "grad_norm": 0.1455078125,
+ "learning_rate": 0.0009944230989883491,
+ "loss": 0.0456,
+ "macro_f1": 0.7989664077758789,
+ "num_tokens": 1589279.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.09344895929098129,
+ "skip_count": 5.0,
+ "step": 984,
+ "text_loss": 0.4416656494140625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.629292632814793,
+ "f1_execute": 0.9411765336990356,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.111328125,
+ "learning_rate": 0.0009943769047932264,
+ "loss": 0.0404,
+ "macro_f1": 0.5359477400779724,
+ "num_tokens": 1592398.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.08916857838630676,
+ "skip_count": 2.0,
+ "step": 986,
+ "text_loss": 0.5536438822746277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.638685060170237,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15234375,
+ "learning_rate": 0.000994330521151941,
+ "loss": 0.039,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1596213.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06114347651600838,
+ "skip_count": 1.0,
+ "step": 988,
+ "text_loss": 0.5835405588150024
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1953125,
+ "learning_rate": 0.000994283948082267,
+ "loss": 0.0573,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1598827.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017335431184619665,
+ "skip_count": 0.0,
+ "step": 990,
+ "text_loss": 0.5857380032539368
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.657469914881127,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10693359375,
+ "learning_rate": 0.0009942371856020522,
+ "loss": 0.0341,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1602915.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014606470242142677,
+ "skip_count": 0.0,
+ "step": 992,
+ "text_loss": 0.6939892768859863
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 4.666862342236572,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.0009941902337292155,
+ "loss": 0.06,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 1605776.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.06297315657138824,
+ "skip_count": 1.0,
+ "step": 994,
+ "text_loss": 0.37616831064224243
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.676254769592017,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1083984375,
+ "learning_rate": 0.0009941430924817487,
+ "loss": 0.0572,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1609856.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03297794610261917,
+ "skip_count": 2.0,
+ "step": 996,
+ "text_loss": 0.2098303586244583
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.685647196947461,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10107421875,
+ "learning_rate": 0.000994095761877717,
+ "loss": 0.0499,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1612904.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012901155278086662,
+ "skip_count": 0.0,
+ "step": 998,
+ "text_loss": 0.20103533565998077
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 4.695039624302906,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.259765625,
+ "learning_rate": 0.000994048241935257,
+ "loss": 0.0535,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1615540.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.020434845238924026,
+ "skip_count": 0.0,
+ "step": 1000,
+ "text_loss": 0.32709044218063354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.70443205165835,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1669921875,
+ "learning_rate": 0.0009940005326725789,
+ "loss": 0.0453,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1618786.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07831378281116486,
+ "skip_count": 2.0,
+ "step": 1002,
+ "text_loss": 0.5789632797241211
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.713824479013795,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.21875,
+ "learning_rate": 0.0009939526341079647,
+ "loss": 0.0511,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1621736.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04863874986767769,
+ "skip_count": 0.0,
+ "step": 1004,
+ "text_loss": 0.6128849387168884
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1435546875,
+ "learning_rate": 0.0009939045462597693,
+ "loss": 0.0538,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1624649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00677989237010479,
+ "skip_count": 0.0,
+ "step": 1006,
+ "text_loss": 0.6168264150619507
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.732609333724684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009938562691464202,
+ "loss": 0.0524,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1627700.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019490402191877365,
+ "skip_count": 0.0,
+ "step": 1008,
+ "text_loss": 0.17463822662830353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.742001761080129,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1298828125,
+ "learning_rate": 0.000993807802786417,
+ "loss": 0.0475,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1630714.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019022391643375158,
+ "skip_count": 0.0,
+ "step": 1010,
+ "text_loss": 0.5675593018531799
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 29.0,
+ "epoch": 4.751394188435574,
+ "f1_execute": 0.9599999785423279,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1640625,
+ "learning_rate": 0.0009937591471983322,
+ "loss": 0.0501,
+ "macro_f1": 0.7644444704055786,
+ "num_tokens": 1633770.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.042485643178224564,
+ "skip_count": 2.0,
+ "step": 1012,
+ "text_loss": 0.42387229204177856
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.760786615791019,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1396484375,
+ "learning_rate": 0.0009937103024008109,
+ "loss": 0.0545,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1637120.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09427817165851593,
+ "skip_count": 1.0,
+ "step": 1014,
+ "text_loss": 0.49511051177978516
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12890625,
+ "learning_rate": 0.0009936612684125702,
+ "loss": 0.0503,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1640165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005106127820909023,
+ "skip_count": 0.0,
+ "step": 1016,
+ "text_loss": 0.5398799180984497
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.7795714705019074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2734375,
+ "learning_rate": 0.0009936120452524004,
+ "loss": 0.0506,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1643251.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016914300620555878,
+ "skip_count": 0.0,
+ "step": 1018,
+ "text_loss": 0.20882178843021393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.788963897857353,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1962890625,
+ "learning_rate": 0.0009935626329391637,
+ "loss": 0.0537,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1646560.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.13481520116329193,
+ "skip_count": 2.0,
+ "step": 1020,
+ "text_loss": 0.5719883441925049
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 4.798356325212797,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009935130314917948,
+ "loss": 0.0602,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1649538.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07700438797473907,
+ "skip_count": 2.0,
+ "step": 1022,
+ "text_loss": 0.1303367167711258
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.807748752568242,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1494140625,
+ "learning_rate": 0.0009934632409293015,
+ "loss": 0.0611,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1652397.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11416907608509064,
+ "skip_count": 1.0,
+ "step": 1024,
+ "text_loss": 0.24076920747756958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 4.817141179923686,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.306640625,
+ "learning_rate": 0.0009934132612707631,
+ "loss": 0.0507,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 1654938.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.09484589844942093,
+ "skip_count": 2.0,
+ "step": 1026,
+ "text_loss": 0.1652517318725586
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.826533607279131,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1435546875,
+ "learning_rate": 0.0009933630925353324,
+ "loss": 0.0395,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1658536.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00741987070068717,
+ "skip_count": 0.0,
+ "step": 1028,
+ "text_loss": 0.49296700954437256
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.835926034634576,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1845703125,
+ "learning_rate": 0.0009933127347422337,
+ "loss": 0.0602,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1661446.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08399344235658646,
+ "skip_count": 2.0,
+ "step": 1030,
+ "text_loss": 0.22363591194152832
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.158203125,
+ "learning_rate": 0.0009932621879107648,
+ "loss": 0.0475,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1664612.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031781597062945366,
+ "skip_count": 0.0,
+ "step": 1032,
+ "text_loss": 0.36083245277404785
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.854710889345466,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2275390625,
+ "learning_rate": 0.000993211452060295,
+ "loss": 0.042,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1667467.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03595469892024994,
+ "skip_count": 1.0,
+ "step": 1034,
+ "text_loss": 0.16372856497764587
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.86410331670091,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.189453125,
+ "learning_rate": 0.000993160527210266,
+ "loss": 0.061,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1670675.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.1597205102443695,
+ "skip_count": 0.0,
+ "step": 1036,
+ "text_loss": 0.6049913763999939
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.873495744056354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2197265625,
+ "learning_rate": 0.000993109413380193,
+ "loss": 0.0562,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1673477.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009756010957062244,
+ "skip_count": 0.0,
+ "step": 1038,
+ "text_loss": 0.7034620642662048
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 4.882888171411799,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
+ "learning_rate": 0.0009930581105896624,
+ "loss": 0.0559,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1676809.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.020718922838568687,
+ "skip_count": 0.0,
+ "step": 1040,
+ "text_loss": 0.2814720571041107
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.892280598767244,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1923828125,
+ "learning_rate": 0.0009930066188583338,
+ "loss": 0.0445,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1679398.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04755603149533272,
+ "skip_count": 1.0,
+ "step": 1042,
+ "text_loss": 0.5445759296417236
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.901673026122689,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.126953125,
+ "learning_rate": 0.0009929549382059388,
+ "loss": 0.0509,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1682269.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01040949858725071,
+ "skip_count": 0.0,
+ "step": 1044,
+ "text_loss": 0.2876914143562317
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.911065453478133,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1259765625,
+ "learning_rate": 0.0009929030686522816,
+ "loss": 0.0363,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1685428.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008158888667821884,
+ "skip_count": 0.0,
+ "step": 1046,
+ "text_loss": 0.49053525924682617
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.9204578808335775,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1630859375,
+ "learning_rate": 0.0009928510102172386,
+ "loss": 0.0498,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1688252.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005102572031319141,
+ "skip_count": 0.0,
+ "step": 1048,
+ "text_loss": 0.5274341106414795
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.929850308189023,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1591796875,
+ "learning_rate": 0.0009927987629207587,
+ "loss": 0.0564,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1691289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016768503934144974,
+ "skip_count": 0.0,
+ "step": 1050,
+ "text_loss": 0.9935035109519958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.939242735544467,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1455078125,
+ "learning_rate": 0.0009927463267828634,
+ "loss": 0.0488,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1694148.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010905829258263111,
+ "skip_count": 0.0,
+ "step": 1052,
+ "text_loss": 0.20895758271217346
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.948635162899912,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1455078125,
+ "learning_rate": 0.000992693701823646,
+ "loss": 0.0624,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1698543.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.10533971339464188,
+ "skip_count": 0.0,
+ "step": 1054,
+ "text_loss": 0.5776236653327942
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.958027590255357,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.255859375,
+ "learning_rate": 0.0009926408880632726,
+ "loss": 0.0556,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1702460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026313411071896553,
+ "skip_count": 1.0,
+ "step": 1056,
+ "text_loss": 0.34990596771240234
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.967420017610801,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.099609375,
+ "learning_rate": 0.0009925878855219818,
+ "loss": 0.0391,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1705686.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007763393223285675,
+ "skip_count": 0.0,
+ "step": 1058,
+ "text_loss": 0.4980163276195526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.976812444966246,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.177734375,
+ "learning_rate": 0.000992534694220084,
+ "loss": 0.0613,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1708739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03998444974422455,
+ "skip_count": 1.0,
+ "step": 1060,
+ "text_loss": 0.29092350602149963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.98620487232169,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1572265625,
+ "learning_rate": 0.000992481314177962,
+ "loss": 0.0312,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1711903.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06966045498847961,
+ "skip_count": 1.0,
+ "step": 1062,
+ "text_loss": 0.6267179250717163
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 4.995597299677136,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.244140625,
+ "learning_rate": 0.0009924277454160717,
+ "loss": 0.0548,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1715974.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05536063387989998,
+ "skip_count": 1.0,
+ "step": 1064,
+ "text_loss": 0.5813798904418945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.004696213677723,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.134765625,
+ "learning_rate": 0.0009923739879549402,
+ "loss": 0.0423,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1718828.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.020993782207369804,
+ "skip_count": 0.0,
+ "step": 1066,
+ "text_loss": 0.22665327787399292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.014088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0888671875,
+ "learning_rate": 0.0009923200418151677,
+ "loss": 0.0301,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1722419.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007351701147854328,
+ "skip_count": 0.0,
+ "step": 1068,
+ "text_loss": 0.5796169638633728
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.0234810683886115,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.142578125,
+ "learning_rate": 0.0009922659070174264,
+ "loss": 0.0452,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1725663.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.026033315807580948,
+ "skip_count": 0.0,
+ "step": 1070,
+ "text_loss": 0.25742828845977783
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0009922115835824612,
+ "loss": 0.041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1729239.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0118600158020854,
+ "skip_count": 0.0,
+ "step": 1072,
+ "text_loss": 0.21630282700061798
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 5.042265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12158203125,
+ "learning_rate": 0.0009921570715310884,
+ "loss": 0.0364,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 1732507.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016118815168738365,
+ "skip_count": 0.0,
+ "step": 1074,
+ "text_loss": 0.5639925003051758
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.051658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0791015625,
+ "learning_rate": 0.0009921023708841974,
+ "loss": 0.0407,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1736182.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004275390412658453,
+ "skip_count": 0.0,
+ "step": 1076,
+ "text_loss": 0.5758615136146545
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1103515625,
+ "learning_rate": 0.0009920474816627496,
+ "loss": 0.037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1739559.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01299292128533125,
+ "skip_count": 0.0,
+ "step": 1078,
+ "text_loss": 0.18221625685691833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.0704432051658355,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1630859375,
+ "learning_rate": 0.0009919924038877788,
+ "loss": 0.0343,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1742890.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.038295745849609375,
+ "skip_count": 2.0,
+ "step": 1080,
+ "text_loss": 0.17354349792003632
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 29.0,
+ "epoch": 5.07983563252128,
+ "f1_execute": 0.9583333134651184,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.1884765625,
+ "learning_rate": 0.0009919371375803905,
+ "loss": 0.0455,
+ "macro_f1": 0.8194444179534912,
+ "num_tokens": 1746433.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04052971675992012,
+ "skip_count": 3.0,
+ "step": 1082,
+ "text_loss": 0.2250112146139145
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0009918816827617632,
+ "loss": 0.0353,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1750802.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009114136919379234,
+ "skip_count": 0.0,
+ "step": 1084,
+ "text_loss": 0.2526719272136688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.098620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1279296875,
+ "learning_rate": 0.000991826039453147,
+ "loss": 0.0392,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1754272.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004904678091406822,
+ "skip_count": 0.0,
+ "step": 1086,
+ "text_loss": 0.7308789491653442
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 5.108012914587614,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.138671875,
+ "learning_rate": 0.000991770207675865,
+ "loss": 0.0327,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 1757231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02129189297556877,
+ "skip_count": 2.0,
+ "step": 1088,
+ "text_loss": 0.21764220297336578
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.1174053419430585,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009917141874513113,
+ "loss": 0.0315,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1760003.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01310618408024311,
+ "skip_count": 0.0,
+ "step": 1090,
+ "text_loss": 0.33892181515693665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.126797769298503,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.171875,
+ "learning_rate": 0.0009916579788009537,
+ "loss": 0.0457,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1763052.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02059309557080269,
+ "skip_count": 2.0,
+ "step": 1092,
+ "text_loss": 0.6551769375801086
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.136190196653947,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10546875,
+ "learning_rate": 0.0009916015817463312,
+ "loss": 0.0385,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1766655.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0274797435849905,
+ "skip_count": 2.0,
+ "step": 1094,
+ "text_loss": 0.3984372019767761
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11181640625,
+ "learning_rate": 0.000991544996309055,
+ "loss": 0.0271,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1769997.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01437368243932724,
+ "skip_count": 0.0,
+ "step": 1096,
+ "text_loss": 0.4203338921070099
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.154975051364837,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1103515625,
+ "learning_rate": 0.000991488222510809,
+ "loss": 0.0292,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1773130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001382062560878694,
+ "skip_count": 0.0,
+ "step": 1098,
+ "text_loss": 0.43132516741752625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.164367478720282,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.123046875,
+ "learning_rate": 0.000991431260373349,
+ "loss": 0.0329,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1775682.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.1115434318780899,
+ "skip_count": 2.0,
+ "step": 1100,
+ "text_loss": 0.3218227028846741
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.111328125,
+ "learning_rate": 0.000991374109918503,
+ "loss": 0.0185,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1778407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009529678151011467,
+ "skip_count": 0.0,
+ "step": 1102,
+ "text_loss": 0.17183731496334076
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.183152333431171,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1142578125,
+ "learning_rate": 0.000991316771168171,
+ "loss": 0.044,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1781518.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018668074160814285,
+ "skip_count": 2.0,
+ "step": 1104,
+ "text_loss": 1.1324785947799683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.192544760786616,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.125,
+ "learning_rate": 0.0009912592441443258,
+ "loss": 0.0411,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1784878.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04145100712776184,
+ "skip_count": 1.0,
+ "step": 1106,
+ "text_loss": 0.6082063317298889
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.20193718814206,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08984375,
+ "learning_rate": 0.0009912015288690112,
+ "loss": 0.0421,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1788978.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021450644358992577,
+ "skip_count": 1.0,
+ "step": 1108,
+ "text_loss": 0.5597621202468872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.2113296154975055,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.083984375,
+ "learning_rate": 0.0009911436253643444,
+ "loss": 0.0238,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1792321.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017405325546860695,
+ "skip_count": 0.0,
+ "step": 1110,
+ "text_loss": 0.2560598850250244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.0009910855336525137,
+ "loss": 0.0383,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1795182.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007162237539887428,
+ "skip_count": 0.0,
+ "step": 1112,
+ "text_loss": 0.3438240587711334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 5.230114470208394,
+ "f1_execute": 0.9411765336990356,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.115234375,
+ "learning_rate": 0.00099102725375578,
+ "loss": 0.0326,
+ "macro_f1": 0.480392187833786,
+ "num_tokens": 1798987.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.11149197816848755,
+ "skip_count": 3.0,
+ "step": 1114,
+ "text_loss": 0.20455503463745117
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.239506897563839,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10791015625,
+ "learning_rate": 0.0009909687856964767,
+ "loss": 0.035,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 1802064.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.12679415941238403,
+ "skip_count": 3.0,
+ "step": 1116,
+ "text_loss": 0.11996729671955109
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.248899324919284,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.12451171875,
+ "learning_rate": 0.0009909101294970082,
+ "loss": 0.0365,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1805412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05108053982257843,
+ "skip_count": 2.0,
+ "step": 1118,
+ "text_loss": 0.13224145770072937
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 5.258291752274729,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.123046875,
+ "learning_rate": 0.0009908512851798522,
+ "loss": 0.0455,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 1808196.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02131766639649868,
+ "skip_count": 1.0,
+ "step": 1120,
+ "text_loss": 0.7824069261550903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.138671875,
+ "learning_rate": 0.0009907922527675576,
+ "loss": 0.0405,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1811622.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006226244382560253,
+ "skip_count": 0.0,
+ "step": 1122,
+ "text_loss": 0.5419743061065674
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.277076606985618,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.12890625,
+ "learning_rate": 0.000990733032282746,
+ "loss": 0.0535,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1814628.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03088250942528248,
+ "skip_count": 2.0,
+ "step": 1124,
+ "text_loss": 0.37100958824157715
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 5.286469034341063,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0810546875,
+ "learning_rate": 0.000990673623748111,
+ "loss": 0.0348,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 1817205.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05495348572731018,
+ "skip_count": 1.0,
+ "step": 1126,
+ "text_loss": 0.20241330564022064
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 5.295861461696507,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.0009906140271864173,
+ "loss": 0.0433,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 1820141.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.037809282541275024,
+ "skip_count": 2.0,
+ "step": 1128,
+ "text_loss": 0.32965806126594543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 5.305253889051952,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0908203125,
+ "learning_rate": 0.0009905542426205032,
+ "loss": 0.0348,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 1824011.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03320181369781494,
+ "skip_count": 1.0,
+ "step": 1130,
+ "text_loss": 0.36329755187034607
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.314646316407397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0009904942700732777,
+ "loss": 0.0335,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1826873.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004102326463907957,
+ "skip_count": 0.0,
+ "step": 1132,
+ "text_loss": 0.6692602038383484
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.324038743762841,
+ "f1_execute": 0.8799999952316284,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08544921875,
+ "learning_rate": 0.0009904341095677226,
+ "loss": 0.03,
+ "macro_f1": 0.29333335161209106,
+ "num_tokens": 1830103.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.2376193106174469,
+ "skip_count": 4.0,
+ "step": 1134,
+ "text_loss": 0.19212862849235535
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.333431171118286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.119140625,
+ "learning_rate": 0.0009903737611268919,
+ "loss": 0.0445,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1833201.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005253395065665245,
+ "skip_count": 0.0,
+ "step": 1136,
+ "text_loss": 0.6773360371589661
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.34282359847373,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09814453125,
+ "learning_rate": 0.0009903132247739107,
+ "loss": 0.0305,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 1836045.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.14382585883140564,
+ "skip_count": 3.0,
+ "step": 1138,
+ "text_loss": 0.2882297933101654
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.3522160258291755,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.150390625,
+ "learning_rate": 0.0009902525005319766,
+ "loss": 0.04,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 1839721.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04033960774540901,
+ "skip_count": 2.0,
+ "step": 1140,
+ "text_loss": 0.7172559499740601
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 5.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12109375,
+ "learning_rate": 0.0009901915884243597,
+ "loss": 0.0351,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 1842614.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005162308923900127,
+ "skip_count": 0.0,
+ "step": 1142,
+ "text_loss": 0.42892804741859436
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.371000880540064,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1240234375,
+ "learning_rate": 0.0009901304884744014,
+ "loss": 0.0386,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1845444.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.10117656737565994,
+ "skip_count": 2.0,
+ "step": 1144,
+ "text_loss": 0.20806430280208588
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.380393307895509,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.130859375,
+ "learning_rate": 0.0009900692007055152,
+ "loss": 0.0357,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1848558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014107038266956806,
+ "skip_count": 0.0,
+ "step": 1146,
+ "text_loss": 0.5355974435806274
+ },
+ {
+ "acc_repeat": 0.25,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 5.389785735250954,
+ "f1_execute": 0.9166666865348816,
+ "f1_repeat": 0.4000000059604645,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.16015625,
+ "learning_rate": 0.000990007725141187,
+ "loss": 0.0449,
+ "macro_f1": 0.6611111164093018,
+ "num_tokens": 1852723.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.15537866950035095,
+ "skip_count": 2.0,
+ "step": 1148,
+ "text_loss": 0.6388513445854187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.399178162606399,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1181640625,
+ "learning_rate": 0.0009899460618049741,
+ "loss": 0.0397,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1856181.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011800912208855152,
+ "skip_count": 0.0,
+ "step": 1150,
+ "text_loss": 0.6113069653511047
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 5.408570589961843,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1005859375,
+ "learning_rate": 0.000989884210720506,
+ "loss": 0.0331,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 1859685.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.022900646552443504,
+ "skip_count": 0.0,
+ "step": 1152,
+ "text_loss": 0.25718021392822266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.4179630173172875,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0009898221719114844,
+ "loss": 0.0354,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1862505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026814989745616913,
+ "skip_count": 1.0,
+ "step": 1154,
+ "text_loss": 0.5426549911499023
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.427355444672733,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1015625,
+ "learning_rate": 0.0009897599454016823,
+ "loss": 0.0401,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1866266.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032623792067170143,
+ "skip_count": 0.0,
+ "step": 1156,
+ "text_loss": 0.37752896547317505
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.436747872028177,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07080078125,
+ "learning_rate": 0.0009896975312149454,
+ "loss": 0.0369,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1870216.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015617577359080315,
+ "skip_count": 0.0,
+ "step": 1158,
+ "text_loss": 0.18207129836082458
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.446140299383622,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11669921875,
+ "learning_rate": 0.0009896349293751906,
+ "loss": 0.0423,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1873338.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02250153198838234,
+ "skip_count": 1.0,
+ "step": 1160,
+ "text_loss": 0.548884391784668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.455532726739067,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1484375,
+ "learning_rate": 0.0009895721399064072,
+ "loss": 0.0388,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 1876470.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.055204521864652634,
+ "skip_count": 1.0,
+ "step": 1162,
+ "text_loss": 0.48052409291267395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.464925154094511,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.0009895091628326564,
+ "loss": 0.0293,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1879354.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009093789383769035,
+ "skip_count": 0.0,
+ "step": 1164,
+ "text_loss": 0.3908069431781769
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.474317581449956,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.000989445998178071,
+ "loss": 0.0323,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1881941.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015086972154676914,
+ "skip_count": 1.0,
+ "step": 1166,
+ "text_loss": 0.4884725511074066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.4837100088054,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.134765625,
+ "learning_rate": 0.0009893826459668558,
+ "loss": 0.0386,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 1885374.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06587666273117065,
+ "skip_count": 3.0,
+ "step": 1168,
+ "text_loss": 0.12760137021541595
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1591796875,
+ "learning_rate": 0.0009893191062232873,
+ "loss": 0.0322,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1888612.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006088624242693186,
+ "skip_count": 0.0,
+ "step": 1170,
+ "text_loss": 0.4821319580078125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1279296875,
+ "learning_rate": 0.0009892553789717143,
+ "loss": 0.0389,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1891463.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010113578289747238,
+ "skip_count": 0.0,
+ "step": 1172,
+ "text_loss": 0.3613642454147339
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.5118872908717345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1025390625,
+ "learning_rate": 0.0009891914642365573,
+ "loss": 0.0404,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1894230.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004947459790855646,
+ "skip_count": 0.0,
+ "step": 1174,
+ "text_loss": 0.5037549138069153
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.521279718227179,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1572265625,
+ "learning_rate": 0.0009891273620423083,
+ "loss": 0.0428,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1897294.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.026075217872858047,
+ "skip_count": 0.0,
+ "step": 1176,
+ "text_loss": 0.32558977603912354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.530672145582624,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12158203125,
+ "learning_rate": 0.0009890630724135314,
+ "loss": 0.0351,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1901553.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06650999188423157,
+ "skip_count": 1.0,
+ "step": 1178,
+ "text_loss": 0.23473620414733887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 5.540064572938069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.0009889985953748625,
+ "loss": 0.0268,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 1904556.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010361116379499435,
+ "skip_count": 1.0,
+ "step": 1180,
+ "text_loss": 0.6927042007446289
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.549457000293513,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.103515625,
+ "learning_rate": 0.0009889339309510094,
+ "loss": 0.0351,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1908053.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013286533765494823,
+ "skip_count": 0.0,
+ "step": 1182,
+ "text_loss": 0.19977325201034546
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 28.0,
+ "epoch": 5.558849427648958,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.5,
+ "grad_norm": 0.058837890625,
+ "learning_rate": 0.0009888690791667518,
+ "loss": 0.0204,
+ "macro_f1": 0.7018141150474548,
+ "num_tokens": 1911754.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.11920545995235443,
+ "skip_count": 3.0,
+ "step": 1184,
+ "text_loss": 0.4072858691215515
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.568241855004403,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11083984375,
+ "learning_rate": 0.0009888040400469408,
+ "loss": 0.0391,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1914862.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03652849420905113,
+ "skip_count": 1.0,
+ "step": 1186,
+ "text_loss": 0.2654043138027191
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.577634282359847,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1689453125,
+ "learning_rate": 0.0009887388136164996,
+ "loss": 0.0336,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1918542.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03991910070180893,
+ "skip_count": 2.0,
+ "step": 1188,
+ "text_loss": 0.21130657196044922
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 5.587026709715292,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09521484375,
+ "learning_rate": 0.000988673399900423,
+ "loss": 0.0429,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1921589.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014900135807693005,
+ "skip_count": 0.0,
+ "step": 1190,
+ "text_loss": 0.5519335865974426
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.596419137070737,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1884765625,
+ "learning_rate": 0.0009886077989237777,
+ "loss": 0.0405,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1924320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06271552294492722,
+ "skip_count": 1.0,
+ "step": 1192,
+ "text_loss": 0.213813915848732
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 24.0,
+ "epoch": 5.6058115644261814,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.1875,
+ "learning_rate": 0.000988542010711702,
+ "loss": 0.0342,
+ "macro_f1": 0.6225374937057495,
+ "num_tokens": 1927178.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03081391751766205,
+ "skip_count": 5.0,
+ "step": 1194,
+ "text_loss": 0.7524349093437195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.615203991781626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.255859375,
+ "learning_rate": 0.0009884760352894064,
+ "loss": 0.0518,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1930216.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008556773886084557,
+ "skip_count": 0.0,
+ "step": 1196,
+ "text_loss": 0.28230375051498413
+ },
+ {
+ "acc_repeat": 0.3333333432674408,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 5.62459641913707,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.5,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1064453125,
+ "learning_rate": 0.0009884098726821726,
+ "loss": 0.0472,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 1933312.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.05344727262854576,
+ "skip_count": 0.0,
+ "step": 1198,
+ "text_loss": 0.5509607195854187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 5.633988846492516,
+ "f1_execute": 0.9411765336990356,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.1298828125,
+ "learning_rate": 0.000988343522915354,
+ "loss": 0.0441,
+ "macro_f1": 0.480392187833786,
+ "num_tokens": 1936160.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.07324771583080292,
+ "skip_count": 3.0,
+ "step": 1200,
+ "text_loss": 0.30565372109413147
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 25.0,
+ "epoch": 5.64338127384796,
+ "f1_execute": 0.8936169743537903,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.444444477558136,
+ "grad_norm": 0.2470703125,
+ "learning_rate": 0.0009882769860143764,
+ "loss": 0.0317,
+ "macro_f1": 0.4460204839706421,
+ "num_tokens": 1939266.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.18620699644088745,
+ "skip_count": 6.0,
+ "step": 1202,
+ "text_loss": 0.976121723651886
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 5.6527737012034045,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1318359375,
+ "learning_rate": 0.000988210262004737,
+ "loss": 0.0474,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 1942173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007703613489866257,
+ "skip_count": 1.0,
+ "step": 1204,
+ "text_loss": 0.5647401809692383
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.66216612855885,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1484375,
+ "learning_rate": 0.0009881433509120036,
+ "loss": 0.0376,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1945071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02162683941423893,
+ "skip_count": 2.0,
+ "step": 1206,
+ "text_loss": 0.24229218065738678
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.671558555914294,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0966796875,
+ "learning_rate": 0.0009880762527618176,
+ "loss": 0.0383,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1949060.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017667081207036972,
+ "skip_count": 0.0,
+ "step": 1208,
+ "text_loss": 0.4035970866680145
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.680950983269739,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.0009880089675798908,
+ "loss": 0.0367,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1951698.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006405784282833338,
+ "skip_count": 0.0,
+ "step": 1210,
+ "text_loss": 0.5319879055023193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.690343410625183,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09814453125,
+ "learning_rate": 0.0009879414953920071,
+ "loss": 0.0294,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1955266.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009859707206487656,
+ "skip_count": 0.0,
+ "step": 1212,
+ "text_loss": 0.6687407493591309
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.699735837980628,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.130859375,
+ "learning_rate": 0.0009878738362240219,
+ "loss": 0.045,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 1958538.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030890554189682007,
+ "skip_count": 2.0,
+ "step": 1214,
+ "text_loss": 0.20820017158985138
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 5.709128265336073,
+ "f1_execute": 0.9200000166893005,
+ "f1_repeat": 0.5,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
+ "learning_rate": 0.000987805990101862,
+ "loss": 0.0317,
+ "macro_f1": 0.47333335876464844,
+ "num_tokens": 1961419.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.10383198410272598,
+ "skip_count": 2.0,
+ "step": 1216,
+ "text_loss": 0.8664976358413696
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.718520692691517,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1435546875,
+ "learning_rate": 0.0009877379570515268,
+ "loss": 0.0366,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1964836.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013376163318753242,
+ "skip_count": 0.0,
+ "step": 1218,
+ "text_loss": 0.4223395884037018
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.727913120046962,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0859375,
+ "learning_rate": 0.0009876697370990865,
+ "loss": 0.0343,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1967620.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008577900938689709,
+ "skip_count": 0.0,
+ "step": 1220,
+ "text_loss": 0.4789901375770569
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 0.0009876013302706828,
+ "loss": 0.049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1971100.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004730266984552145,
+ "skip_count": 0.0,
+ "step": 1222,
+ "text_loss": 0.6799837946891785
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.7466979747578515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08349609375,
+ "learning_rate": 0.0009875327365925295,
+ "loss": 0.0341,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1974408.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010849526152014732,
+ "skip_count": 0.0,
+ "step": 1224,
+ "text_loss": 0.18967926502227783
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 5.756090402113296,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.169921875,
+ "learning_rate": 0.0009874639560909118,
+ "loss": 0.0498,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 1977046.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04841252416372299,
+ "skip_count": 1.0,
+ "step": 1226,
+ "text_loss": 0.6133310198783875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.765482829468741,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1318359375,
+ "learning_rate": 0.0009873949887921867,
+ "loss": 0.0402,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1980330.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.029638588428497314,
+ "skip_count": 1.0,
+ "step": 1228,
+ "text_loss": 0.15649555623531342
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 5.774875256824186,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1103515625,
+ "learning_rate": 0.0009873258347227823,
+ "loss": 0.0331,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 1983173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009955910965800285,
+ "skip_count": 0.0,
+ "step": 1230,
+ "text_loss": 0.4741005599498749
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0849609375,
+ "learning_rate": 0.0009872564939091989,
+ "loss": 0.0342,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 1986825.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010205300524830818,
+ "skip_count": 0.0,
+ "step": 1232,
+ "text_loss": 0.5315462350845337
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5714285969734192,
+ "avg_layers": 25.0,
+ "epoch": 5.7936601115350745,
+ "f1_execute": 0.9302325248718262,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.7272727489471436,
+ "grad_norm": 0.11865234375,
+ "learning_rate": 0.0009871869663780077,
+ "loss": 0.0336,
+ "macro_f1": 0.8858351111412048,
+ "num_tokens": 1990448.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09120134264230728,
+ "skip_count": 7.0,
+ "step": 1234,
+ "text_loss": 0.6187508702278137
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 5.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.125,
+ "learning_rate": 0.0009871172521558522,
+ "loss": 0.0475,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 1993474.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016188839450478554,
+ "skip_count": 1.0,
+ "step": 1236,
+ "text_loss": 0.20783066749572754
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 5.812444966245964,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.216796875,
+ "learning_rate": 0.0009870473512694465,
+ "loss": 0.0373,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 1996536.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05046704784035683,
+ "skip_count": 3.0,
+ "step": 1238,
+ "text_loss": 0.247748002409935
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 5.821837393601409,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.09033203125,
+ "learning_rate": 0.0009869772637455772,
+ "loss": 0.0251,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 1999530.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.044926248490810394,
+ "skip_count": 2.0,
+ "step": 1240,
+ "text_loss": 0.26001980900764465
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 5.831229820956853,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1513671875,
+ "learning_rate": 0.000986906989611102,
+ "loss": 0.0446,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2002782.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025911526754498482,
+ "skip_count": 0.0,
+ "step": 1242,
+ "text_loss": 0.9009982943534851
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.8406222483122985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.115234375,
+ "learning_rate": 0.0009868365288929492,
+ "loss": 0.0371,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2005331.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0043760035187006,
+ "skip_count": 0.0,
+ "step": 1244,
+ "text_loss": 0.5547386407852173
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.850014675667743,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1005859375,
+ "learning_rate": 0.0009867658816181206,
+ "loss": 0.0374,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2008115.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009227181784808636,
+ "skip_count": 0.0,
+ "step": 1246,
+ "text_loss": 1.0067731142044067
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.859407103023187,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.126953125,
+ "learning_rate": 0.000986695047813688,
+ "loss": 0.0261,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2011137.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023822437971830368,
+ "skip_count": 0.0,
+ "step": 1248,
+ "text_loss": 0.30058956146240234
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 5.868799530378633,
+ "f1_execute": 0.9200000166893005,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.1044921875,
+ "learning_rate": 0.0009866240275067948,
+ "loss": 0.044,
+ "macro_f1": 0.47333335876464844,
+ "num_tokens": 2014159.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.21523773670196533,
+ "skip_count": 3.0,
+ "step": 1250,
+ "text_loss": 0.39072203636169434
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 5.878191957734077,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1201171875,
+ "learning_rate": 0.0009865528207246563,
+ "loss": 0.0351,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2017731.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06184682995080948,
+ "skip_count": 2.0,
+ "step": 1252,
+ "text_loss": 0.35751575231552124
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.8875843850895215,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.166015625,
+ "learning_rate": 0.000986481427494559,
+ "loss": 0.0336,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2020485.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007573372684419155,
+ "skip_count": 0.0,
+ "step": 1254,
+ "text_loss": 0.4061077833175659
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.896976812444966,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1708984375,
+ "learning_rate": 0.000986409847843861,
+ "loss": 0.0382,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2024149.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.07447971403598785,
+ "skip_count": 0.0,
+ "step": 1256,
+ "text_loss": 0.41876497864723206
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.906369239800411,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.17578125,
+ "learning_rate": 0.000986338081799992,
+ "loss": 0.0351,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2026545.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006609147880226374,
+ "skip_count": 0.0,
+ "step": 1258,
+ "text_loss": 0.4673794209957123
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.915761667155856,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1123046875,
+ "learning_rate": 0.0009862661293904523,
+ "loss": 0.0498,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2029581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.10624702274799347,
+ "skip_count": 2.0,
+ "step": 1260,
+ "text_loss": 0.3483233153820038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1201171875,
+ "learning_rate": 0.0009861939906428145,
+ "loss": 0.0525,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2033936.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007944886572659016,
+ "skip_count": 0.0,
+ "step": 1262,
+ "text_loss": 0.16362667083740234
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 5.934546521866745,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11669921875,
+ "learning_rate": 0.0009861216655847225,
+ "loss": 0.0376,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2037876.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007004092447459698,
+ "skip_count": 0.0,
+ "step": 1264,
+ "text_loss": 0.43228110671043396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.94393894922219,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1005859375,
+ "learning_rate": 0.0009860491542438912,
+ "loss": 0.047,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2040842.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026916226372122765,
+ "skip_count": 1.0,
+ "step": 1266,
+ "text_loss": 0.5901188850402832
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.953331376577634,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0986328125,
+ "learning_rate": 0.000985976456648107,
+ "loss": 0.0353,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2043890.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007325216196477413,
+ "skip_count": 0.0,
+ "step": 1268,
+ "text_loss": 0.8780109882354736
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 5.962723803933079,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.10205078125,
+ "learning_rate": 0.000985903572825228,
+ "loss": 0.0306,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 2048848.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05007527023553848,
+ "skip_count": 2.0,
+ "step": 1270,
+ "text_loss": 0.5863722562789917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 5.972116231288524,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.173828125,
+ "learning_rate": 0.000985830502803183,
+ "loss": 0.0396,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2051561.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023995524272322655,
+ "skip_count": 0.0,
+ "step": 1272,
+ "text_loss": 0.7460709810256958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.9815086586439685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10205078125,
+ "learning_rate": 0.0009857572466099732,
+ "loss": 0.0431,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2054752.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006928362417966127,
+ "skip_count": 0.0,
+ "step": 1274,
+ "text_loss": 0.5130293369293213
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 5.990901085999413,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.162109375,
+ "learning_rate": 0.0009856838042736698,
+ "loss": 0.0501,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2058151.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006969396956264973,
+ "skip_count": 0.0,
+ "step": 1276,
+ "text_loss": 0.5911393761634827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1357421875,
+ "learning_rate": 0.0009856101758224166,
+ "loss": 0.0441,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2061012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003499418031424284,
+ "skip_count": 0.0,
+ "step": 1278,
+ "text_loss": 0.25347545742988586
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.000985536361284428,
+ "loss": 0.0229,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2064597.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007856054231524467,
+ "skip_count": 0.0,
+ "step": 1280,
+ "text_loss": 0.7476963400840759
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.01878485471089,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0888671875,
+ "learning_rate": 0.0009854623606879898,
+ "loss": 0.0245,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2067972.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02617792971432209,
+ "skip_count": 1.0,
+ "step": 1282,
+ "text_loss": 0.5775872468948364
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 6.028177282066334,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09033203125,
+ "learning_rate": 0.000985388174061459,
+ "loss": 0.0356,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 2071812.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.035979997366666794,
+ "skip_count": 1.0,
+ "step": 1284,
+ "text_loss": 0.2933400869369507
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.037569709421779,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08447265625,
+ "learning_rate": 0.0009853138014332646,
+ "loss": 0.0273,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2074868.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005142854526638985,
+ "skip_count": 0.0,
+ "step": 1286,
+ "text_loss": 0.29085102677345276
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.046962136777223,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09033203125,
+ "learning_rate": 0.0009852392428319058,
+ "loss": 0.0306,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2078225.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032799106556922197,
+ "skip_count": 0.0,
+ "step": 1288,
+ "text_loss": 0.7293626070022583
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 6.056354564132668,
+ "f1_execute": 0.9411765336990356,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.08935546875,
+ "learning_rate": 0.0009851644982859537,
+ "loss": 0.0273,
+ "macro_f1": 0.480392187833786,
+ "num_tokens": 2081495.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.12224318832159042,
+ "skip_count": 3.0,
+ "step": 1290,
+ "text_loss": 0.26125892996788025
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.065746991488113,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1435546875,
+ "learning_rate": 0.0009850895678240508,
+ "loss": 0.0283,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2084390.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010662888176739216,
+ "skip_count": 0.0,
+ "step": 1292,
+ "text_loss": 0.3510764539241791
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 6.075139418843557,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1689453125,
+ "learning_rate": 0.0009850144514749104,
+ "loss": 0.0332,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2087210.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01979079470038414,
+ "skip_count": 2.0,
+ "step": 1294,
+ "text_loss": 0.40202176570892334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 6.084531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11669921875,
+ "learning_rate": 0.000984939149267317,
+ "loss": 0.0253,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2090777.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005172552540898323,
+ "skip_count": 1.0,
+ "step": 1296,
+ "text_loss": 0.5275651216506958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.093924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009848636612301272,
+ "loss": 0.0299,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2094248.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029599082190543413,
+ "skip_count": 0.0,
+ "step": 1298,
+ "text_loss": 0.4517653286457062
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.103316700909891,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.23046875,
+ "learning_rate": 0.0009847879873922675,
+ "loss": 0.0357,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2097139.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011455860920250416,
+ "skip_count": 0.0,
+ "step": 1300,
+ "text_loss": 0.16888445615768433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.112709128265336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09619140625,
+ "learning_rate": 0.0009847121277827366,
+ "loss": 0.0301,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2100415.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008091195486485958,
+ "skip_count": 0.0,
+ "step": 1302,
+ "text_loss": 0.40061676502227783
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.122101555620781,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1123046875,
+ "learning_rate": 0.000984636082430604,
+ "loss": 0.0285,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2103285.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009593960829079151,
+ "skip_count": 0.0,
+ "step": 1304,
+ "text_loss": 0.7211073637008667
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.131493982976226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.107421875,
+ "learning_rate": 0.0009845598513650103,
+ "loss": 0.0231,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2106255.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023068038281053305,
+ "skip_count": 0.0,
+ "step": 1306,
+ "text_loss": 0.7077119946479797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.171875,
+ "learning_rate": 0.0009844834346151674,
+ "loss": 0.043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2109305.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007703019306063652,
+ "skip_count": 0.0,
+ "step": 1308,
+ "text_loss": 0.3534316122531891
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.1502788376871145,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1025390625,
+ "learning_rate": 0.0009844068322103585,
+ "loss": 0.0287,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2112216.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023549847304821014,
+ "skip_count": 1.0,
+ "step": 1310,
+ "text_loss": 0.6792599558830261
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.150390625,
+ "learning_rate": 0.0009843300441799378,
+ "loss": 0.0211,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2114925.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007605871185660362,
+ "skip_count": 0.0,
+ "step": 1312,
+ "text_loss": 0.1571389138698578
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.169063692398004,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.134765625,
+ "learning_rate": 0.0009842530705533304,
+ "loss": 0.0253,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2117744.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014964760281145573,
+ "skip_count": 0.0,
+ "step": 1314,
+ "text_loss": 0.7840361595153809
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.178456119753449,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.000984175911360033,
+ "loss": 0.0238,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2120848.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004663798492401838,
+ "skip_count": 0.0,
+ "step": 1316,
+ "text_loss": 0.536246120929718
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 6.187848547108893,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1201171875,
+ "learning_rate": 0.000984098566629613,
+ "loss": 0.0288,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2123651.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022852955386042595,
+ "skip_count": 2.0,
+ "step": 1318,
+ "text_loss": 0.43372172117233276
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.197240974464338,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07958984375,
+ "learning_rate": 0.0009840210363917087,
+ "loss": 0.0216,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2128011.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012578422203660011,
+ "skip_count": 0.0,
+ "step": 1320,
+ "text_loss": 0.28190380334854126
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10986328125,
+ "learning_rate": 0.0009839433206760306,
+ "loss": 0.0204,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2131035.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006863643880933523,
+ "skip_count": 0.0,
+ "step": 1322,
+ "text_loss": 0.6340444087982178
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.216025829175227,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1796875,
+ "learning_rate": 0.0009838654195123589,
+ "loss": 0.0243,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2133856.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00468854233622551,
+ "skip_count": 0.0,
+ "step": 1324,
+ "text_loss": 0.5138425827026367
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.225418256530672,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.115234375,
+ "learning_rate": 0.0009837873329305458,
+ "loss": 0.0396,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2136451.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005731126759201288,
+ "skip_count": 0.0,
+ "step": 1326,
+ "text_loss": 0.742124617099762
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.17578125,
+ "learning_rate": 0.000983709060960514,
+ "loss": 0.0416,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2139496.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0056343949399888515,
+ "skip_count": 0.0,
+ "step": 1328,
+ "text_loss": 0.7317464351654053
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.2442031112415615,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10791015625,
+ "learning_rate": 0.0009836306036322576,
+ "loss": 0.0312,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2143120.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005127966403961182,
+ "skip_count": 0.0,
+ "step": 1330,
+ "text_loss": 0.538652241230011
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 6.253595538597006,
+ "f1_execute": 0.9130434989929199,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.11083984375,
+ "learning_rate": 0.0009835519609758415,
+ "loss": 0.0301,
+ "macro_f1": 0.590062141418457,
+ "num_tokens": 2145807.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.1673707216978073,
+ "skip_count": 4.0,
+ "step": 1332,
+ "text_loss": 0.3498198091983795
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.262987965952451,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009834731330214017,
+ "loss": 0.0293,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2148397.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04026653990149498,
+ "skip_count": 0.0,
+ "step": 1334,
+ "text_loss": 0.8153424859046936
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 27.0,
+ "epoch": 6.272380393307896,
+ "f1_execute": 0.8999999761581421,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.8000000715255737,
+ "grad_norm": 0.16015625,
+ "learning_rate": 0.0009833941197991455,
+ "loss": 0.0329,
+ "macro_f1": 0.7888889312744141,
+ "num_tokens": 2152226.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.05481519177556038,
+ "skip_count": 5.0,
+ "step": 1336,
+ "text_loss": 0.7802760004997253
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 6.28177282066334,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.0009833149213393506,
+ "loss": 0.0304,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2156023.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01760484278202057,
+ "skip_count": 0.0,
+ "step": 1338,
+ "text_loss": 0.19721226394176483
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.2911652480187845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11474609375,
+ "learning_rate": 0.000983235537672366,
+ "loss": 0.0256,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2160037.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013206037692725658,
+ "skip_count": 0.0,
+ "step": 1340,
+ "text_loss": 0.5003817081451416
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.000983155968828612,
+ "loss": 0.0315,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2163910.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01256406120955944,
+ "skip_count": 0.0,
+ "step": 1342,
+ "text_loss": 0.5996923446655273
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.309950102729674,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11962890625,
+ "learning_rate": 0.0009830762148385793,
+ "loss": 0.0313,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2166921.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015086234547197819,
+ "skip_count": 1.0,
+ "step": 1344,
+ "text_loss": 0.45356282591819763
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.319342530085119,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08447265625,
+ "learning_rate": 0.0009829962757328297,
+ "loss": 0.0223,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2170135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07909081131219864,
+ "skip_count": 2.0,
+ "step": 1346,
+ "text_loss": 0.2874644994735718
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 6.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0009829161515419959,
+ "loss": 0.0246,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2173029.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013569854199886322,
+ "skip_count": 2.0,
+ "step": 1348,
+ "text_loss": 0.25533875823020935
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.3381273847960085,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0009828358422967823,
+ "loss": 0.0226,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2176605.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08111091703176498,
+ "skip_count": 1.0,
+ "step": 1350,
+ "text_loss": 0.32827726006507874
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 6.347519812151453,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.091796875,
+ "learning_rate": 0.0009827553480279627,
+ "loss": 0.03,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 2179406.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026550088077783585,
+ "skip_count": 2.0,
+ "step": 1352,
+ "text_loss": 0.2966301143169403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0791015625,
+ "learning_rate": 0.0009826746687663832,
+ "loss": 0.0301,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2182353.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003914554137736559,
+ "skip_count": 0.0,
+ "step": 1354,
+ "text_loss": 0.7596251964569092
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 6.366304666862343,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0859375,
+ "learning_rate": 0.0009825938045429602,
+ "loss": 0.0324,
+ "macro_f1": 0.5866667032241821,
+ "num_tokens": 2185786.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.059612665325403214,
+ "skip_count": 3.0,
+ "step": 1356,
+ "text_loss": 0.12325898557901382
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.375697094217787,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10009765625,
+ "learning_rate": 0.0009825127553886807,
+ "loss": 0.0375,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2190157.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0071132429875433445,
+ "skip_count": 0.0,
+ "step": 1358,
+ "text_loss": 0.9287898540496826
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.3850895215732315,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0986328125,
+ "learning_rate": 0.0009824315213346033,
+ "loss": 0.0348,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2193077.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009611099027097225,
+ "skip_count": 0.0,
+ "step": 1360,
+ "text_loss": 0.20427259802818298
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.394481948928676,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10888671875,
+ "learning_rate": 0.0009823501024118569,
+ "loss": 0.0285,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2196494.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006913455203175545,
+ "skip_count": 0.0,
+ "step": 1362,
+ "text_loss": 0.574759840965271
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.403874376284121,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0009822684986516411,
+ "loss": 0.0245,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2199839.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009208920411765575,
+ "skip_count": 0.0,
+ "step": 1364,
+ "text_loss": 0.42422571778297424
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.413266803639566,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.000982186710085227,
+ "loss": 0.0208,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2203212.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.059975091367959976,
+ "skip_count": 1.0,
+ "step": 1366,
+ "text_loss": 0.29213017225265503
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.25,
+ "avg_layers": 27.0,
+ "epoch": 6.42265923099501,
+ "f1_execute": 0.9411765336990356,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.181640625,
+ "learning_rate": 0.0009821047367439561,
+ "loss": 0.0358,
+ "macro_f1": 0.44705885648727417,
+ "num_tokens": 2206240.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.048244867473840714,
+ "skip_count": 4.0,
+ "step": 1368,
+ "text_loss": 0.3072395324707031
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.432051658350455,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11181640625,
+ "learning_rate": 0.0009820225786592405,
+ "loss": 0.0375,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2209903.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.026068156585097313,
+ "skip_count": 0.0,
+ "step": 1370,
+ "text_loss": 0.5961400270462036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.4414440857059,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.109375,
+ "learning_rate": 0.0009819402358625634,
+ "loss": 0.0366,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2213439.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022615568712353706,
+ "skip_count": 1.0,
+ "step": 1372,
+ "text_loss": 0.19375644624233246
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.450836513061344,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1240234375,
+ "learning_rate": 0.000981857708385479,
+ "loss": 0.0346,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2216457.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005855285096913576,
+ "skip_count": 0.0,
+ "step": 1374,
+ "text_loss": 0.5123368501663208
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.460228940416789,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09423828125,
+ "learning_rate": 0.0009817749962596114,
+ "loss": 0.0249,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2219975.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0651634931564331,
+ "skip_count": 0.0,
+ "step": 1376,
+ "text_loss": 0.5999220609664917
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09912109375,
+ "learning_rate": 0.0009816920995166568,
+ "loss": 0.0371,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2222833.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011408994905650616,
+ "skip_count": 0.0,
+ "step": 1378,
+ "text_loss": 0.5323230624198914
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.4790137951276785,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.205078125,
+ "learning_rate": 0.0009816090181883807,
+ "loss": 0.0313,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2225842.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.039720915257930756,
+ "skip_count": 2.0,
+ "step": 1380,
+ "text_loss": 0.23363439738750458
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.488406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12255859375,
+ "learning_rate": 0.0009815257523066204,
+ "loss": 0.0249,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2229430.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002765297656878829,
+ "skip_count": 0.0,
+ "step": 1382,
+ "text_loss": 0.718977689743042
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.497798649838567,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.130859375,
+ "learning_rate": 0.0009814423019032835,
+ "loss": 0.0396,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2232594.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.05362323671579361,
+ "skip_count": 0.0,
+ "step": 1384,
+ "text_loss": 0.6392166614532471
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.507191077194013,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.150390625,
+ "learning_rate": 0.0009813586670103483,
+ "loss": 0.0426,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 2236327.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.031728316098451614,
+ "skip_count": 1.0,
+ "step": 1386,
+ "text_loss": 0.5951619148254395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 6.516583504549457,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.126953125,
+ "learning_rate": 0.0009812748476598638,
+ "loss": 0.031,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2239746.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03981253132224083,
+ "skip_count": 2.0,
+ "step": 1388,
+ "text_loss": 0.22756551206111908
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 6.5259759319049016,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.12451171875,
+ "learning_rate": 0.0009811908438839498,
+ "loss": 0.0331,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2242786.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04617162421345711,
+ "skip_count": 2.0,
+ "step": 1390,
+ "text_loss": 0.3233799934387207
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.535368359260346,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.154296875,
+ "learning_rate": 0.000981106655714797,
+ "loss": 0.0358,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2245696.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.046828847378492355,
+ "skip_count": 1.0,
+ "step": 1392,
+ "text_loss": 0.24273279309272766
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 6.544760786615791,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.0009810222831846656,
+ "loss": 0.0307,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2249326.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010921589098870754,
+ "skip_count": 2.0,
+ "step": 1394,
+ "text_loss": 0.3921460807323456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 6.554153213971236,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09423828125,
+ "learning_rate": 0.0009809377263258882,
+ "loss": 0.0315,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 2253393.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04564022272825241,
+ "skip_count": 1.0,
+ "step": 1396,
+ "text_loss": 0.582602858543396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 6.56354564132668,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.103515625,
+ "learning_rate": 0.000980852985170867,
+ "loss": 0.0328,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2256626.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013289985246956348,
+ "skip_count": 0.0,
+ "step": 1398,
+ "text_loss": 0.41031694412231445
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.5729380686821255,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 0.0009807680597520745,
+ "loss": 0.0264,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2259326.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0065213534981012344,
+ "skip_count": 0.0,
+ "step": 1400,
+ "text_loss": 0.2888098657131195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.58233049603757,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.23046875,
+ "learning_rate": 0.0009806829501020546,
+ "loss": 0.0358,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2262344.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04199840500950813,
+ "skip_count": 1.0,
+ "step": 1402,
+ "text_loss": 0.31973034143447876
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.591722923393014,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08935546875,
+ "learning_rate": 0.0009805976562534215,
+ "loss": 0.0317,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 2266354.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015434930101037025,
+ "skip_count": 1.0,
+ "step": 1404,
+ "text_loss": 0.508630633354187
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 6.601115350748459,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.0009805121782388599,
+ "loss": 0.0339,
+ "macro_f1": 0.6533333659172058,
+ "num_tokens": 2269660.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0720924660563469,
+ "skip_count": 2.0,
+ "step": 1406,
+ "text_loss": 0.40927737951278687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 6.610507778103904,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0009804265160911253,
+ "loss": 0.0266,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2273335.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02400495670735836,
+ "skip_count": 2.0,
+ "step": 1408,
+ "text_loss": 0.1777762621641159
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.6199002054593485,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2314453125,
+ "learning_rate": 0.0009803406698430433,
+ "loss": 0.0371,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2277107.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02560107782483101,
+ "skip_count": 1.0,
+ "step": 1410,
+ "text_loss": 0.17955881357192993
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.629292632814793,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.0009802546395275104,
+ "loss": 0.0349,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2281638.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006655813194811344,
+ "skip_count": 0.0,
+ "step": 1412,
+ "text_loss": 0.20882295072078705
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 28.0,
+ "epoch": 6.638685060170237,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.08740234375,
+ "learning_rate": 0.000980168425177494,
+ "loss": 0.0342,
+ "macro_f1": 0.8200000524520874,
+ "num_tokens": 2284876.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06325097382068634,
+ "skip_count": 3.0,
+ "step": 1414,
+ "text_loss": 0.26035264134407043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.648077487525683,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.138671875,
+ "learning_rate": 0.000980082026826031,
+ "loss": 0.0315,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2288938.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013436575420200825,
+ "skip_count": 0.0,
+ "step": 1416,
+ "text_loss": 0.5502325892448425
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.657469914881127,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07177734375,
+ "learning_rate": 0.0009799954445062296,
+ "loss": 0.0193,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 2292317.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011264479719102383,
+ "skip_count": 1.0,
+ "step": 1418,
+ "text_loss": 0.48075684905052185
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 6.666862342236572,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009799086782512686,
+ "loss": 0.0292,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2295935.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02833271212875843,
+ "skip_count": 2.0,
+ "step": 1420,
+ "text_loss": 0.18221206963062286
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 6.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.09375,
+ "learning_rate": 0.0009798217280943967,
+ "loss": 0.0356,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2298927.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009208574891090393,
+ "skip_count": 1.0,
+ "step": 1422,
+ "text_loss": 0.48686322569847107
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 6.685647196947461,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09423828125,
+ "learning_rate": 0.0009797345940689335,
+ "loss": 0.0267,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2301541.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015011847950518131,
+ "skip_count": 0.0,
+ "step": 1424,
+ "text_loss": 0.49446266889572144
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.4000000059604645,
+ "avg_layers": 26.0,
+ "epoch": 6.695039624302906,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5714285969734192,
+ "grad_norm": 0.1337890625,
+ "learning_rate": 0.0009796472762082687,
+ "loss": 0.0338,
+ "macro_f1": 0.5034013986587524,
+ "num_tokens": 2304589.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05912091210484505,
+ "skip_count": 5.0,
+ "step": 1426,
+ "text_loss": 0.23945684731006622
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.70443205165835,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09765625,
+ "learning_rate": 0.000979559774545863,
+ "loss": 0.0405,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2307860.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021242303773760796,
+ "skip_count": 1.0,
+ "step": 1428,
+ "text_loss": 0.531273365020752
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.713824479013795,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.000979472089115247,
+ "loss": 0.0276,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2311581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02768544852733612,
+ "skip_count": 2.0,
+ "step": 1430,
+ "text_loss": 0.2497459501028061
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12255859375,
+ "learning_rate": 0.000979384219950022,
+ "loss": 0.0346,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2314639.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008678150363266468,
+ "skip_count": 0.0,
+ "step": 1432,
+ "text_loss": 0.6579355001449585
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.732609333724684,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08056640625,
+ "learning_rate": 0.0009792961670838595,
+ "loss": 0.0362,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2317927.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03325597569346428,
+ "skip_count": 0.0,
+ "step": 1434,
+ "text_loss": 0.5209436416625977
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.742001761080129,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1494140625,
+ "learning_rate": 0.0009792079305505016,
+ "loss": 0.0306,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2321065.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.019228918477892876,
+ "skip_count": 0.0,
+ "step": 1436,
+ "text_loss": 0.41087067127227783
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.751394188435574,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10986328125,
+ "learning_rate": 0.000979119510383761,
+ "loss": 0.0371,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2323714.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017071325331926346,
+ "skip_count": 0.0,
+ "step": 1438,
+ "text_loss": 0.21490029990673065
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.760786615791019,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2060546875,
+ "learning_rate": 0.00097903090661752,
+ "loss": 0.0309,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2326454.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00991755723953247,
+ "skip_count": 0.0,
+ "step": 1440,
+ "text_loss": 0.23847346007823944
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.770179043146463,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.232421875,
+ "learning_rate": 0.000978942119285732,
+ "loss": 0.0404,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2329462.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04908733069896698,
+ "skip_count": 1.0,
+ "step": 1442,
+ "text_loss": 0.23343028128147125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.7795714705019074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1044921875,
+ "learning_rate": 0.0009788531484224204,
+ "loss": 0.0264,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2332146.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032628148328512907,
+ "skip_count": 0.0,
+ "step": 1444,
+ "text_loss": 0.47423800826072693
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 29.0,
+ "epoch": 6.788963897857353,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.5,
+ "grad_norm": 0.10693359375,
+ "learning_rate": 0.0009787639940616788,
+ "loss": 0.0405,
+ "macro_f1": 0.7018141150474548,
+ "num_tokens": 2335738.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.14336998760700226,
+ "skip_count": 3.0,
+ "step": 1446,
+ "text_loss": 0.21837592124938965
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 6.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.189453125,
+ "learning_rate": 0.0009786746562376717,
+ "loss": 0.0241,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2338488.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010542908683419228,
+ "skip_count": 1.0,
+ "step": 1448,
+ "text_loss": 1.0614757537841797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.807748752568242,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 0.0009785851349846334,
+ "loss": 0.0268,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2342074.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005998016335070133,
+ "skip_count": 0.0,
+ "step": 1450,
+ "text_loss": 0.4269719421863556
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 26.0,
+ "epoch": 6.817141179923686,
+ "f1_execute": 0.9411764740943909,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.1083984375,
+ "learning_rate": 0.0009784954303368686,
+ "loss": 0.0384,
+ "macro_f1": 0.44705885648727417,
+ "num_tokens": 2345838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0959126204252243,
+ "skip_count": 3.0,
+ "step": 1452,
+ "text_loss": 0.3315916955471039
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.826533607279131,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1005859375,
+ "learning_rate": 0.0009784055423287521,
+ "loss": 0.0218,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2348939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025467623490840197,
+ "skip_count": 0.0,
+ "step": 1454,
+ "text_loss": 0.6162732839584351
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.835926034634576,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.115234375,
+ "learning_rate": 0.0009783154709947293,
+ "loss": 0.0256,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2352232.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01860538125038147,
+ "skip_count": 1.0,
+ "step": 1456,
+ "text_loss": 0.23928768932819366
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.84531846199002,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09912109375,
+ "learning_rate": 0.0009782252163693158,
+ "loss": 0.0201,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2355159.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04412713274359703,
+ "skip_count": 1.0,
+ "step": 1458,
+ "text_loss": 0.3371323347091675
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.21484375,
+ "learning_rate": 0.0009781347784870973,
+ "loss": 0.0379,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2358175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006809141952544451,
+ "skip_count": 0.0,
+ "step": 1460,
+ "text_loss": 0.547267735004425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.86410331670091,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009780441573827296,
+ "loss": 0.03,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 2360991.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08924390375614166,
+ "skip_count": 4.0,
+ "step": 1462,
+ "text_loss": 0.7026563882827759
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.873495744056354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1865234375,
+ "learning_rate": 0.000977953353090939,
+ "loss": 0.0272,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2363894.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021858472377061844,
+ "skip_count": 0.0,
+ "step": 1464,
+ "text_loss": 0.2718065083026886
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.882888171411799,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11474609375,
+ "learning_rate": 0.0009778623656465219,
+ "loss": 0.0338,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2367265.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.044781096279621124,
+ "skip_count": 0.0,
+ "step": 1466,
+ "text_loss": 0.5008095502853394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.892280598767244,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009777711950843448,
+ "loss": 0.0212,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2370186.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0040459707379341125,
+ "skip_count": 0.0,
+ "step": 1468,
+ "text_loss": 0.5242461562156677
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 6.901673026122689,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.134765625,
+ "learning_rate": 0.0009776798414393446,
+ "loss": 0.0279,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 2373314.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0708528608083725,
+ "skip_count": 3.0,
+ "step": 1470,
+ "text_loss": 0.2821732461452484
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.911065453478133,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1328125,
+ "learning_rate": 0.0009775883047465279,
+ "loss": 0.0414,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 2376435.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0290578193962574,
+ "skip_count": 1.0,
+ "step": 1472,
+ "text_loss": 0.8438440561294556
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.9204578808335775,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10546875,
+ "learning_rate": 0.000977496585040972,
+ "loss": 0.0373,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2380244.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010360375046730042,
+ "skip_count": 0.0,
+ "step": 1474,
+ "text_loss": 0.4356135427951813
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 6.929850308189023,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09912109375,
+ "learning_rate": 0.000977404682357824,
+ "loss": 0.0294,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2383498.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023518972098827362,
+ "skip_count": 0.0,
+ "step": 1476,
+ "text_loss": 0.25195425748825073
+ },
+ {
+ "acc_repeat": 0.800000011920929,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 6.939242735544467,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.888888955116272,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11181640625,
+ "learning_rate": 0.000977312596732301,
+ "loss": 0.0375,
+ "macro_f1": 0.9544159770011902,
+ "num_tokens": 2386414.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.08190606534481049,
+ "skip_count": 4.0,
+ "step": 1478,
+ "text_loss": 0.6586798429489136
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 6.948635162899912,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10546875,
+ "learning_rate": 0.0009772203281996905,
+ "loss": 0.0336,
+ "macro_f1": 1.0,
+ "num_tokens": 2389399.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016441475600004196,
+ "skip_count": 2.0,
+ "step": 1480,
+ "text_loss": 0.3671986758708954
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.958027590255357,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09814453125,
+ "learning_rate": 0.0009771278767953502,
+ "loss": 0.0357,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2392400.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019211363047361374,
+ "skip_count": 0.0,
+ "step": 1482,
+ "text_loss": 0.27418580651283264
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.967420017610801,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0947265625,
+ "learning_rate": 0.0009770352425547072,
+ "loss": 0.0292,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2395123.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015800386667251587,
+ "skip_count": 0.0,
+ "step": 1484,
+ "text_loss": 0.19896622002124786
+ },
+ {
+ "acc_repeat": 0.3333333432674408,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 6.976812444966246,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.5,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12890625,
+ "learning_rate": 0.0009769424255132596,
+ "loss": 0.0256,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 2397359.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.06670158356428146,
+ "skip_count": 0.0,
+ "step": 1486,
+ "text_loss": 0.4229799509048462
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.98620487232169,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1162109375,
+ "learning_rate": 0.0009768494257065747,
+ "loss": 0.0218,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2400387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011144762858748436,
+ "skip_count": 1.0,
+ "step": 1488,
+ "text_loss": 0.4264226257801056
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 6.995597299677136,
+ "f1_execute": 0.9019608497619629,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12353515625,
+ "learning_rate": 0.0009767562431702904,
+ "loss": 0.0387,
+ "macro_f1": 0.3006536364555359,
+ "num_tokens": 2403241.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.12339717149734497,
+ "skip_count": 3.0,
+ "step": 1490,
+ "text_loss": 0.2850193977355957
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.004696213677723,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07177734375,
+ "learning_rate": 0.0009766628779401142,
+ "loss": 0.0215,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2406087.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008174685761332512,
+ "skip_count": 1.0,
+ "step": 1492,
+ "text_loss": 0.6756544709205627
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.014088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 0.000976569330051824,
+ "loss": 0.0186,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2409312.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021256296895444393,
+ "skip_count": 0.0,
+ "step": 1494,
+ "text_loss": 0.4789894223213196
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.0234810683886115,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0009764755995412677,
+ "loss": 0.0193,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2412758.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003944927826523781,
+ "skip_count": 0.0,
+ "step": 1496,
+ "text_loss": 0.5157490968704224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.032873495744056,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09228515625,
+ "learning_rate": 0.0009763816864443627,
+ "loss": 0.0239,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2416079.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03893325850367546,
+ "skip_count": 0.0,
+ "step": 1498,
+ "text_loss": 0.28045418858528137
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.042265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1279296875,
+ "learning_rate": 0.0009762875907970968,
+ "loss": 0.0199,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2420340.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017725443467497826,
+ "skip_count": 0.0,
+ "step": 1500,
+ "text_loss": 0.35550856590270996
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.051658350454946,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0009761933126355277,
+ "loss": 0.0245,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2424735.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01393749937415123,
+ "skip_count": 1.0,
+ "step": 1502,
+ "text_loss": 0.38840189576148987
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 7.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1630859375,
+ "learning_rate": 0.0009760988519957828,
+ "loss": 0.0249,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2428132.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01687910407781601,
+ "skip_count": 2.0,
+ "step": 1504,
+ "text_loss": 0.3031681478023529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.0704432051658355,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0009760042089140598,
+ "loss": 0.0193,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 2431592.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04704280197620392,
+ "skip_count": 2.0,
+ "step": 1506,
+ "text_loss": 0.16355200111865997
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0986328125,
+ "learning_rate": 0.0009759093834266259,
+ "loss": 0.0206,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2434236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016075772000476718,
+ "skip_count": 0.0,
+ "step": 1508,
+ "text_loss": 0.6080073118209839
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1025390625,
+ "learning_rate": 0.0009758143755698186,
+ "loss": 0.015,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2437170.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008451299741864204,
+ "skip_count": 0.0,
+ "step": 1510,
+ "text_loss": 0.22100484371185303
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 7.098620487232169,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009757191853800449,
+ "loss": 0.0227,
+ "macro_f1": 0.5866667032241821,
+ "num_tokens": 2441187.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.046565692871809006,
+ "skip_count": 3.0,
+ "step": 1512,
+ "text_loss": 0.25098952651023865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.108012914587614,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11279296875,
+ "learning_rate": 0.000975623812893782,
+ "loss": 0.0276,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2444664.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02872578240931034,
+ "skip_count": 1.0,
+ "step": 1514,
+ "text_loss": 0.4952253997325897
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.1174053419430585,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1142578125,
+ "learning_rate": 0.0009755282581475768,
+ "loss": 0.0233,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2447748.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002055214950814843,
+ "skip_count": 0.0,
+ "step": 1516,
+ "text_loss": 0.7465500831604004
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.126797769298503,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10302734375,
+ "learning_rate": 0.000975432521178046,
+ "loss": 0.0216,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2450834.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04498551785945892,
+ "skip_count": 0.0,
+ "step": 1518,
+ "text_loss": 0.28144413232803345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.136190196653947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09912109375,
+ "learning_rate": 0.0009753366020218763,
+ "loss": 0.0234,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2454233.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003669742727652192,
+ "skip_count": 0.0,
+ "step": 1520,
+ "text_loss": 0.5667551755905151
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0830078125,
+ "learning_rate": 0.0009752405007158238,
+ "loss": 0.0238,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2457331.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010455607436597347,
+ "skip_count": 0.0,
+ "step": 1522,
+ "text_loss": 0.19575810432434082
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 7.154975051364837,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0009751442172967151,
+ "loss": 0.0193,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 2459935.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.025189083069562912,
+ "skip_count": 1.0,
+ "step": 1524,
+ "text_loss": 0.45453405380249023
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 7.164367478720282,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.000975047751801446,
+ "loss": 0.0187,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2463008.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012297490611672401,
+ "skip_count": 0.0,
+ "step": 1526,
+ "text_loss": 0.31437572836875916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1044921875,
+ "learning_rate": 0.0009749511042669823,
+ "loss": 0.0233,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2466475.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011026266030967236,
+ "skip_count": 0.0,
+ "step": 1528,
+ "text_loss": 0.46604859828948975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.183152333431171,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1376953125,
+ "learning_rate": 0.0009748542747303595,
+ "loss": 0.0182,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2469320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011934996582567692,
+ "skip_count": 1.0,
+ "step": 1530,
+ "text_loss": 0.7764923572540283
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.192544760786616,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0966796875,
+ "learning_rate": 0.0009747572632286827,
+ "loss": 0.0203,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2472468.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005786920432001352,
+ "skip_count": 0.0,
+ "step": 1532,
+ "text_loss": 0.3555782437324524
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 7.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0849609375,
+ "learning_rate": 0.0009746600697991271,
+ "loss": 0.02,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2475736.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0026990731712430716,
+ "skip_count": 0.0,
+ "step": 1534,
+ "text_loss": 0.49561792612075806
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 7.2113296154975055,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0556640625,
+ "learning_rate": 0.0009745626944789375,
+ "loss": 0.0204,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 2478887.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.020221207290887833,
+ "skip_count": 2.0,
+ "step": 1536,
+ "text_loss": 0.5375416278839111
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.22072204285295,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12158203125,
+ "learning_rate": 0.0009744651373054279,
+ "loss": 0.0286,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2481293.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03131086751818657,
+ "skip_count": 1.0,
+ "step": 1538,
+ "text_loss": 0.5241039395332336
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 7.230114470208394,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.08984375,
+ "learning_rate": 0.0009743673983159828,
+ "loss": 0.0241,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 2484403.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04448170214891434,
+ "skip_count": 4.0,
+ "step": 1540,
+ "text_loss": 0.7465724349021912
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.239506897563839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08935546875,
+ "learning_rate": 0.0009742694775480557,
+ "loss": 0.0265,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2487952.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007171491626650095,
+ "skip_count": 1.0,
+ "step": 1542,
+ "text_loss": 0.2877117097377777
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 7.248899324919284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.0009741713750391703,
+ "loss": 0.0171,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2490815.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004559285007417202,
+ "skip_count": 0.0,
+ "step": 1544,
+ "text_loss": 0.6097800135612488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.258291752274729,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06787109375,
+ "learning_rate": 0.0009740730908269193,
+ "loss": 0.0174,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2494727.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005271553061902523,
+ "skip_count": 0.0,
+ "step": 1546,
+ "text_loss": 0.5431114435195923
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0009739746249489658,
+ "loss": 0.0239,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2499266.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015409323386847973,
+ "skip_count": 0.0,
+ "step": 1548,
+ "text_loss": 0.4702678322792053
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.277076606985618,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1171875,
+ "learning_rate": 0.0009738759774430417,
+ "loss": 0.0216,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2502273.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.030183158814907074,
+ "skip_count": 1.0,
+ "step": 1550,
+ "text_loss": 0.3239189088344574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.286469034341063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.0009737771483469493,
+ "loss": 0.0196,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2507624.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005410848651081324,
+ "skip_count": 0.0,
+ "step": 1552,
+ "text_loss": 0.4014642834663391
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07763671875,
+ "learning_rate": 0.0009736781376985598,
+ "loss": 0.0168,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2510366.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0066976165398955345,
+ "skip_count": 1.0,
+ "step": 1554,
+ "text_loss": 0.5924848914146423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.305253889051952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.13671875,
+ "learning_rate": 0.0009735789455358144,
+ "loss": 0.022,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2513317.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002763477386906743,
+ "skip_count": 0.0,
+ "step": 1556,
+ "text_loss": 0.3222943842411041
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.314646316407397,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11767578125,
+ "learning_rate": 0.0009734795718967237,
+ "loss": 0.0283,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2516628.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.061566028743982315,
+ "skip_count": 2.0,
+ "step": 1558,
+ "text_loss": 0.3249334692955017
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 7.324038743762841,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009733800168193679,
+ "loss": 0.0228,
+ "macro_f1": 1.0,
+ "num_tokens": 2519424.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.017976421862840652,
+ "skip_count": 4.0,
+ "step": 1560,
+ "text_loss": 0.3341919481754303
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.333431171118286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1826171875,
+ "learning_rate": 0.0009732802803418966,
+ "loss": 0.023,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2522922.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002525332849472761,
+ "skip_count": 0.0,
+ "step": 1562,
+ "text_loss": 0.3176332712173462
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.34282359847373,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07861328125,
+ "learning_rate": 0.0009731803625025292,
+ "loss": 0.0196,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2525811.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015524424612522125,
+ "skip_count": 1.0,
+ "step": 1564,
+ "text_loss": 0.532774031162262
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.3522160258291755,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10205078125,
+ "learning_rate": 0.0009730802633395541,
+ "loss": 0.0257,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 2529157.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.08138631284236908,
+ "skip_count": 1.0,
+ "step": 1566,
+ "text_loss": 0.529487133026123
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0009729799828913298,
+ "loss": 0.0223,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2532249.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035867292899638414,
+ "skip_count": 0.0,
+ "step": 1568,
+ "text_loss": 0.503160297870636
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 7.371000880540064,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.0009728795211962838,
+ "loss": 0.0259,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2535904.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02987455204129219,
+ "skip_count": 2.0,
+ "step": 1570,
+ "text_loss": 0.9170270562171936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.380393307895509,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11865234375,
+ "learning_rate": 0.0009727788782929131,
+ "loss": 0.0273,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2538943.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04676021635532379,
+ "skip_count": 0.0,
+ "step": 1572,
+ "text_loss": 0.29146310687065125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.389785735250954,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0009726780542197844,
+ "loss": 0.0169,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2541805.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002127803163602948,
+ "skip_count": 0.0,
+ "step": 1574,
+ "text_loss": 1.0126502513885498
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.399178162606399,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.142578125,
+ "learning_rate": 0.0009725770490155338,
+ "loss": 0.0262,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2546213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007609677035361528,
+ "skip_count": 0.0,
+ "step": 1576,
+ "text_loss": 0.190168559551239
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.408570589961843,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.083984375,
+ "learning_rate": 0.0009724758627188665,
+ "loss": 0.0356,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2549554.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.033554721623659134,
+ "skip_count": 1.0,
+ "step": 1578,
+ "text_loss": 0.2977406084537506
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.4179630173172875,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.0009723744953685572,
+ "loss": 0.028,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2552785.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.027864238247275352,
+ "skip_count": 0.0,
+ "step": 1580,
+ "text_loss": 0.2700682580471039
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.427355444672733,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.19921875,
+ "learning_rate": 0.0009722729470034503,
+ "loss": 0.0224,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2556550.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004798175301402807,
+ "skip_count": 0.0,
+ "step": 1582,
+ "text_loss": 0.6559903025627136
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.436747872028177,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.078125,
+ "learning_rate": 0.0009721712176624591,
+ "loss": 0.0242,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2559862.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013764148578047752,
+ "skip_count": 0.0,
+ "step": 1584,
+ "text_loss": 0.2257535308599472
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 7.446140299383622,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10986328125,
+ "learning_rate": 0.0009720693073845667,
+ "loss": 0.032,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2562766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01937069371342659,
+ "skip_count": 2.0,
+ "step": 1586,
+ "text_loss": 0.178413525223732
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 7.455532726739067,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.150390625,
+ "learning_rate": 0.0009719672162088252,
+ "loss": 0.0306,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 2566583.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06224144622683525,
+ "skip_count": 0.0,
+ "step": 1588,
+ "text_loss": 0.3992367684841156
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 27.0,
+ "epoch": 7.464925154094511,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.185546875,
+ "learning_rate": 0.0009718649441743559,
+ "loss": 0.0239,
+ "macro_f1": 0.9449735879898071,
+ "num_tokens": 2569516.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.06937911361455917,
+ "skip_count": 4.0,
+ "step": 1590,
+ "text_loss": 0.1945122629404068
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.00097176249132035,
+ "loss": 0.0229,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2572418.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034326619934290648,
+ "skip_count": 0.0,
+ "step": 1592,
+ "text_loss": 0.6259906888008118
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 7.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08642578125,
+ "learning_rate": 0.0009716598576860676,
+ "loss": 0.0278,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2575235.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004557516425848007,
+ "skip_count": 0.0,
+ "step": 1594,
+ "text_loss": 0.6638736724853516
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 7.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.193359375,
+ "learning_rate": 0.0009715570433108378,
+ "loss": 0.0198,
+ "macro_f1": 1.0,
+ "num_tokens": 2578157.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015363055281341076,
+ "skip_count": 1.0,
+ "step": 1596,
+ "text_loss": 0.6530464887619019
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 7.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1484375,
+ "learning_rate": 0.0009714540482340595,
+ "loss": 0.0268,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2581801.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01257144846022129,
+ "skip_count": 0.0,
+ "step": 1598,
+ "text_loss": 0.5916110277175903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.5118872908717345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.058837890625,
+ "learning_rate": 0.0009713508724952006,
+ "loss": 0.0177,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2585204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003175645601004362,
+ "skip_count": 0.0,
+ "step": 1600,
+ "text_loss": 0.27901601791381836
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.521279718227179,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12353515625,
+ "learning_rate": 0.0009712475161337981,
+ "loss": 0.0261,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2588286.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004122321493923664,
+ "skip_count": 0.0,
+ "step": 1602,
+ "text_loss": 0.42420244216918945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.0009711439791894585,
+ "loss": 0.0341,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2591476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011215819045901299,
+ "skip_count": 1.0,
+ "step": 1604,
+ "text_loss": 0.5549933910369873
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 7.540064572938069,
+ "f1_execute": 0.9599999785423279,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0009710402617018574,
+ "loss": 0.0172,
+ "macro_f1": 0.8200000524520874,
+ "num_tokens": 2594336.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02916567400097847,
+ "skip_count": 2.0,
+ "step": 1606,
+ "text_loss": 0.3263779282569885
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.549457000293513,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0009709363637107393,
+ "loss": 0.0209,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2597462.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015897957608103752,
+ "skip_count": 1.0,
+ "step": 1608,
+ "text_loss": 0.20917139947414398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.558849427648958,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009708322852559184,
+ "loss": 0.0229,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2601543.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002211357234045863,
+ "skip_count": 0.0,
+ "step": 1610,
+ "text_loss": 0.450550377368927
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 7.568241855004403,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 0.0009707280263772776,
+ "loss": 0.0277,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2604462.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01615734025835991,
+ "skip_count": 2.0,
+ "step": 1612,
+ "text_loss": 0.6908381581306458
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 7.577634282359847,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0888671875,
+ "learning_rate": 0.0009706235871147688,
+ "loss": 0.0241,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2607484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022048067301511765,
+ "skip_count": 2.0,
+ "step": 1614,
+ "text_loss": 0.36691340804100037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.587026709715292,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10546875,
+ "learning_rate": 0.0009705189675084138,
+ "loss": 0.0176,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2610204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008503952994942665,
+ "skip_count": 1.0,
+ "step": 1616,
+ "text_loss": 0.5226598381996155
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.596419137070737,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09228515625,
+ "learning_rate": 0.0009704141675983029,
+ "loss": 0.0248,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2613128.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019020626787096262,
+ "skip_count": 0.0,
+ "step": 1618,
+ "text_loss": 0.6465088725090027
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5714285969734192,
+ "avg_layers": 24.0,
+ "epoch": 7.6058115644261814,
+ "f1_execute": 0.9333333373069763,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.7272727489471436,
+ "grad_norm": 0.107421875,
+ "learning_rate": 0.0009703091874245956,
+ "loss": 0.032,
+ "macro_f1": 0.5535354018211365,
+ "num_tokens": 2616360.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.11837691068649292,
+ "skip_count": 7.0,
+ "step": 1620,
+ "text_loss": 0.2987039089202881
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.615203991781626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009702040270275204,
+ "loss": 0.0181,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2619606.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0065958453342318535,
+ "skip_count": 0.0,
+ "step": 1622,
+ "text_loss": 0.6262096166610718
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.103515625,
+ "learning_rate": 0.000970098686447375,
+ "loss": 0.0257,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2622499.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013632026500999928,
+ "skip_count": 1.0,
+ "step": 1624,
+ "text_loss": 0.2392602562904358
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 7.633988846492516,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.125,
+ "learning_rate": 0.0009699931657245264,
+ "loss": 0.0245,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2626002.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012147823348641396,
+ "skip_count": 2.0,
+ "step": 1626,
+ "text_loss": 0.4742976129055023
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 7.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0849609375,
+ "learning_rate": 0.0009698874648994098,
+ "loss": 0.0285,
+ "macro_f1": 1.0,
+ "num_tokens": 2629847.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010692884214222431,
+ "skip_count": 3.0,
+ "step": 1628,
+ "text_loss": 0.5090685486793518
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.6527737012034045,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1240234375,
+ "learning_rate": 0.0009697815840125304,
+ "loss": 0.0265,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2633529.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011442207731306553,
+ "skip_count": 0.0,
+ "step": 1630,
+ "text_loss": 0.1874329298734665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2119140625,
+ "learning_rate": 0.0009696755231044618,
+ "loss": 0.0207,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2636321.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026681360322982073,
+ "skip_count": 0.0,
+ "step": 1632,
+ "text_loss": 0.7650400400161743
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.671558555914294,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10498046875,
+ "learning_rate": 0.0009695692822158466,
+ "loss": 0.0242,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2638840.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.033965807408094406,
+ "skip_count": 0.0,
+ "step": 1634,
+ "text_loss": 0.6175784468650818
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.680950983269739,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0009694628613873968,
+ "loss": 0.018,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2641886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007568214554339647,
+ "skip_count": 0.0,
+ "step": 1636,
+ "text_loss": 0.43139931559562683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.690343410625183,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.193359375,
+ "learning_rate": 0.0009693562606598929,
+ "loss": 0.025,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2645028.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004973865579813719,
+ "skip_count": 0.0,
+ "step": 1638,
+ "text_loss": 0.6430339217185974
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.699735837980628,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06982421875,
+ "learning_rate": 0.0009692494800741844,
+ "loss": 0.0313,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2648209.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.049863800406455994,
+ "skip_count": 0.0,
+ "step": 1640,
+ "text_loss": 0.28138160705566406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 7.709128265336073,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08544921875,
+ "learning_rate": 0.0009691425196711901,
+ "loss": 0.0398,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2651171.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02112230286002159,
+ "skip_count": 0.0,
+ "step": 1642,
+ "text_loss": 0.3745322525501251
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.718520692691517,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0009690353794918971,
+ "loss": 0.0275,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2654093.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024304776452481747,
+ "skip_count": 0.0,
+ "step": 1644,
+ "text_loss": 0.4275154173374176
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.727913120046962,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0771484375,
+ "learning_rate": 0.000968928059577362,
+ "loss": 0.0244,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2657079.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009320619516074657,
+ "skip_count": 1.0,
+ "step": 1646,
+ "text_loss": 0.46650025248527527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 7.737305547402407,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09814453125,
+ "learning_rate": 0.0009688205599687099,
+ "loss": 0.0209,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2660951.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011913162656128407,
+ "skip_count": 0.0,
+ "step": 1648,
+ "text_loss": 0.46644100546836853
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.7466979747578515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1083984375,
+ "learning_rate": 0.0009687128807071347,
+ "loss": 0.0284,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2663823.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013754756189882755,
+ "skip_count": 0.0,
+ "step": 1650,
+ "text_loss": 0.40808847546577454
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.756090402113296,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.103515625,
+ "learning_rate": 0.0009686050218338996,
+ "loss": 0.0286,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2667079.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009099726565182209,
+ "skip_count": 0.0,
+ "step": 1652,
+ "text_loss": 0.2389989197254181
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.765482829468741,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08837890625,
+ "learning_rate": 0.0009684969833903359,
+ "loss": 0.0283,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2670162.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034928603563457727,
+ "skip_count": 1.0,
+ "step": 1654,
+ "text_loss": 0.6930749416351318
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.774875256824186,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10888671875,
+ "learning_rate": 0.0009683887654178445,
+ "loss": 0.0261,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2673031.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008340462110936642,
+ "skip_count": 1.0,
+ "step": 1656,
+ "text_loss": 0.277752548456192
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.0009682803679578947,
+ "loss": 0.0259,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2676092.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004337446764111519,
+ "skip_count": 0.0,
+ "step": 1658,
+ "text_loss": 0.5176776051521301
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.7936601115350745,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.169921875,
+ "learning_rate": 0.0009681717910520244,
+ "loss": 0.0242,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2679479.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.034611742943525314,
+ "skip_count": 2.0,
+ "step": 1660,
+ "text_loss": 0.21485982835292816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 7.80305253889052,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.07958984375,
+ "learning_rate": 0.0009680630347418406,
+ "loss": 0.022,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2683289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03297121450304985,
+ "skip_count": 2.0,
+ "step": 1662,
+ "text_loss": 0.33801013231277466
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.812444966245964,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 0.000967954099069019,
+ "loss": 0.0411,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2685879.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04551183059811592,
+ "skip_count": 1.0,
+ "step": 1664,
+ "text_loss": 0.41123488545417786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.821837393601409,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1240234375,
+ "learning_rate": 0.0009678449840753038,
+ "loss": 0.0324,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2688910.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05866450071334839,
+ "skip_count": 2.0,
+ "step": 1666,
+ "text_loss": 0.1740892380475998
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09228515625,
+ "learning_rate": 0.0009677356898025082,
+ "loss": 0.023,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2691680.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009243223816156387,
+ "skip_count": 0.0,
+ "step": 1668,
+ "text_loss": 0.2512350380420685
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.8406222483122985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09619140625,
+ "learning_rate": 0.000967626216292514,
+ "loss": 0.0195,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2694895.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005576452240347862,
+ "skip_count": 0.0,
+ "step": 1670,
+ "text_loss": 0.43294376134872437
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 26.0,
+ "epoch": 7.850014675667743,
+ "f1_execute": 0.9411764740943909,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.09130859375,
+ "learning_rate": 0.0009675165635872715,
+ "loss": 0.0306,
+ "macro_f1": 0.44705885648727417,
+ "num_tokens": 2697806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05372785031795502,
+ "skip_count": 3.0,
+ "step": 1672,
+ "text_loss": 0.1614082306623459
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 7.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11669921875,
+ "learning_rate": 0.0009674067317288,
+ "loss": 0.0296,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2700529.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.018131591379642487,
+ "skip_count": 0.0,
+ "step": 1674,
+ "text_loss": 0.2093173861503601
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.868799530378633,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.0009672967207591869,
+ "loss": 0.0257,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2703650.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0673515796661377,
+ "skip_count": 1.0,
+ "step": 1676,
+ "text_loss": 0.3029400110244751
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 7.878191957734077,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11669921875,
+ "learning_rate": 0.0009671865307205892,
+ "loss": 0.021,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 2707615.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03821169584989548,
+ "skip_count": 1.0,
+ "step": 1678,
+ "text_loss": 0.2262786477804184
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 25.0,
+ "epoch": 7.8875843850895215,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.1396484375,
+ "learning_rate": 0.0009670761616552315,
+ "loss": 0.0465,
+ "macro_f1": 0.9615669250488281,
+ "num_tokens": 2710894.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.042625464498996735,
+ "skip_count": 6.0,
+ "step": 1680,
+ "text_loss": 0.29623574018478394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.896976812444966,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.169921875,
+ "learning_rate": 0.0009669656136054074,
+ "loss": 0.0289,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2714330.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037571541033685207,
+ "skip_count": 0.0,
+ "step": 1682,
+ "text_loss": 0.7510389089584351
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.906369239800411,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07421875,
+ "learning_rate": 0.0009668548866134795,
+ "loss": 0.0256,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2717176.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004142968449741602,
+ "skip_count": 0.0,
+ "step": 1684,
+ "text_loss": 0.3273485600948334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 7.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.0009667439807218783,
+ "loss": 0.0233,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2720628.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008753842674195766,
+ "skip_count": 2.0,
+ "step": 1686,
+ "text_loss": 0.4314708709716797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 7.9251540945113,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0732421875,
+ "learning_rate": 0.0009666328959731033,
+ "loss": 0.0211,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 2723739.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.022674910724163055,
+ "skip_count": 1.0,
+ "step": 1688,
+ "text_loss": 0.25734150409698486
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 7.934546521866745,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009665216324097222,
+ "loss": 0.0324,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 2726644.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03932750225067139,
+ "skip_count": 3.0,
+ "step": 1690,
+ "text_loss": 0.24511034786701202
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.94393894922219,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09765625,
+ "learning_rate": 0.0009664101900743714,
+ "loss": 0.0255,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2729662.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012672754004597664,
+ "skip_count": 1.0,
+ "step": 1692,
+ "text_loss": 0.39431414008140564
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 7.953331376577634,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.076171875,
+ "learning_rate": 0.000966298569009756,
+ "loss": 0.0231,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2732578.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01548632513731718,
+ "skip_count": 2.0,
+ "step": 1694,
+ "text_loss": 0.12439999729394913
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.962723803933079,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0849609375,
+ "learning_rate": 0.0009661867692586494,
+ "loss": 0.0153,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 2735887.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05622401833534241,
+ "skip_count": 2.0,
+ "step": 1696,
+ "text_loss": 0.29024389386177063
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.972116231288524,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.087890625,
+ "learning_rate": 0.0009660747908638933,
+ "loss": 0.0205,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2739293.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.041060201823711395,
+ "skip_count": 1.0,
+ "step": 1698,
+ "text_loss": 0.39461007714271545
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.9815086586439685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1767578125,
+ "learning_rate": 0.0009659626338683981,
+ "loss": 0.0369,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2742468.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007251353468745947,
+ "skip_count": 0.0,
+ "step": 1700,
+ "text_loss": 0.2751767635345459
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 7.990901085999413,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07763671875,
+ "learning_rate": 0.0009658502983151427,
+ "loss": 0.0186,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2745123.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012847424484789371,
+ "skip_count": 1.0,
+ "step": 1702,
+ "text_loss": 0.4756404757499695
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11767578125,
+ "learning_rate": 0.0009657377842471742,
+ "loss": 0.0313,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2748016.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007060411386191845,
+ "skip_count": 1.0,
+ "step": 1704,
+ "text_loss": 0.9571210145950317
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 8.009392427355445,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10009765625,
+ "learning_rate": 0.0009656250917076081,
+ "loss": 0.0188,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2750717.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016748681664466858,
+ "skip_count": 2.0,
+ "step": 1706,
+ "text_loss": 0.14542843401432037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.018784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060302734375,
+ "learning_rate": 0.0009655122207396285,
+ "loss": 0.017,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2753635.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013607042841613293,
+ "skip_count": 0.0,
+ "step": 1708,
+ "text_loss": 0.21836471557617188
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0732421875,
+ "learning_rate": 0.0009653991713864878,
+ "loss": 0.0205,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2756643.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012097888393327594,
+ "skip_count": 0.0,
+ "step": 1710,
+ "text_loss": 0.635187029838562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1171875,
+ "learning_rate": 0.0009652859436915066,
+ "loss": 0.0231,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2759432.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006196760106831789,
+ "skip_count": 0.0,
+ "step": 1712,
+ "text_loss": 0.5629420876502991
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.046962136777223,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0009651725376980743,
+ "loss": 0.0177,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2762538.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0042513771913945675,
+ "skip_count": 0.0,
+ "step": 1714,
+ "text_loss": 0.39522525668144226
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 29.0,
+ "epoch": 8.056354564132668,
+ "f1_execute": 0.9583333134651184,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.1494140625,
+ "learning_rate": 0.0009650589534496479,
+ "loss": 0.0194,
+ "macro_f1": 0.8194444179534912,
+ "num_tokens": 2765571.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03596706688404083,
+ "skip_count": 3.0,
+ "step": 1716,
+ "text_loss": 0.6252416968345642
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04833984375,
+ "learning_rate": 0.0009649451909897532,
+ "loss": 0.0178,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2769206.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025788163766264915,
+ "skip_count": 0.0,
+ "step": 1718,
+ "text_loss": 0.8851634860038757
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.075139418843557,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10791015625,
+ "learning_rate": 0.0009648312503619843,
+ "loss": 0.0265,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2772488.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004443451762199402,
+ "skip_count": 0.0,
+ "step": 1720,
+ "text_loss": 0.8568580746650696
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 8.084531846199003,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0009647171316100034,
+ "loss": 0.0265,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 2776482.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.022948263213038445,
+ "skip_count": 3.0,
+ "step": 1722,
+ "text_loss": 0.13431036472320557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1572265625,
+ "learning_rate": 0.0009646028347775409,
+ "loss": 0.0204,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2778966.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011328035034239292,
+ "skip_count": 1.0,
+ "step": 1724,
+ "text_loss": 0.2085491120815277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.103316700909891,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08984375,
+ "learning_rate": 0.0009644883599083958,
+ "loss": 0.0238,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2781968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002208018908277154,
+ "skip_count": 0.0,
+ "step": 1726,
+ "text_loss": 0.4948323965072632
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.112709128265337,
+ "f1_execute": 0.9411764740943909,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0009643737070464349,
+ "loss": 0.0158,
+ "macro_f1": 0.6470588445663452,
+ "num_tokens": 2784666.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04391832649707794,
+ "skip_count": 2.0,
+ "step": 1728,
+ "text_loss": 0.39060094952583313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 0.0009642588762355935,
+ "loss": 0.0212,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2787558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004497280344367027,
+ "skip_count": 1.0,
+ "step": 1730,
+ "text_loss": 0.34908708930015564
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.131493982976226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.0009641438675198748,
+ "loss": 0.0175,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2790474.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00583475548774004,
+ "skip_count": 0.0,
+ "step": 1732,
+ "text_loss": 0.5720033049583435
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.140886410331671,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08154296875,
+ "learning_rate": 0.0009640286809433508,
+ "loss": 0.0235,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2793272.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007826375775039196,
+ "skip_count": 0.0,
+ "step": 1734,
+ "text_loss": 0.32181721925735474
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0009639133165501606,
+ "loss": 0.0192,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2797726.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019055595621466637,
+ "skip_count": 0.0,
+ "step": 1736,
+ "text_loss": 0.620936393737793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.134765625,
+ "learning_rate": 0.0009637977743845124,
+ "loss": 0.0229,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2800706.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028302327264100313,
+ "skip_count": 0.0,
+ "step": 1738,
+ "text_loss": 0.6473138332366943
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.169063692398003,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0009636820544906823,
+ "loss": 0.0146,
+ "macro_f1": 1.0,
+ "num_tokens": 2803847.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01105099730193615,
+ "skip_count": 2.0,
+ "step": 1740,
+ "text_loss": 0.4401201903820038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 8.178456119753449,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.1455078125,
+ "learning_rate": 0.0009635661569130141,
+ "loss": 0.0195,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 2807235.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02619045600295067,
+ "skip_count": 3.0,
+ "step": 1742,
+ "text_loss": 0.459264874458313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.187848547108894,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0009634500816959202,
+ "loss": 0.0162,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2810396.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007915694266557693,
+ "skip_count": 2.0,
+ "step": 1744,
+ "text_loss": 0.5084020495414734
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 8.197240974464338,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1748046875,
+ "learning_rate": 0.0009633338288838805,
+ "loss": 0.0271,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2813215.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.08364596217870712,
+ "skip_count": 0.0,
+ "step": 1746,
+ "text_loss": 0.27681824564933777
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 30.0,
+ "epoch": 8.206633401819783,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.0009632173985214438,
+ "loss": 0.0156,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 2816452.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.028805451467633247,
+ "skip_count": 2.0,
+ "step": 1748,
+ "text_loss": 0.4678419530391693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.216025829175228,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0625,
+ "learning_rate": 0.000963100790653226,
+ "loss": 0.0188,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2819364.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03056817688047886,
+ "skip_count": 1.0,
+ "step": 1750,
+ "text_loss": 0.3078109920024872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.225418256530672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009629840053239116,
+ "loss": 0.0205,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2823469.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019477814203128219,
+ "skip_count": 0.0,
+ "step": 1752,
+ "text_loss": 0.45501336455345154
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.057373046875,
+ "learning_rate": 0.000962867042578253,
+ "loss": 0.0173,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2826716.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032963966950774193,
+ "skip_count": 0.0,
+ "step": 1754,
+ "text_loss": 0.49234694242477417
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.244203111241562,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0947265625,
+ "learning_rate": 0.0009627499024610707,
+ "loss": 0.0239,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2829733.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010289114899933338,
+ "skip_count": 1.0,
+ "step": 1756,
+ "text_loss": 0.22335539758205414
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.253595538597006,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0888671875,
+ "learning_rate": 0.0009626325850172527,
+ "loss": 0.0174,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2833350.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03249066323041916,
+ "skip_count": 1.0,
+ "step": 1758,
+ "text_loss": 0.6581931114196777
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.262987965952451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0009625150902917555,
+ "loss": 0.0185,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2836558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00870000571012497,
+ "skip_count": 0.0,
+ "step": 1760,
+ "text_loss": 0.22938725352287292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1259765625,
+ "learning_rate": 0.0009623974183296031,
+ "loss": 0.0192,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2840560.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007767196744680405,
+ "skip_count": 0.0,
+ "step": 1762,
+ "text_loss": 0.24473799765110016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09228515625,
+ "learning_rate": 0.0009622795691758876,
+ "loss": 0.0244,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2843548.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021693643648177385,
+ "skip_count": 0.0,
+ "step": 1764,
+ "text_loss": 0.3084608018398285
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.0009621615428757693,
+ "loss": 0.0149,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2847076.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024727333802729845,
+ "skip_count": 0.0,
+ "step": 1766,
+ "text_loss": 0.5251734852790833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.300557675374229,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 0.000962043339474476,
+ "loss": 0.0194,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2849751.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005174890160560608,
+ "skip_count": 0.0,
+ "step": 1768,
+ "text_loss": 0.4410129189491272
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.309950102729674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06103515625,
+ "learning_rate": 0.0009619249590173032,
+ "loss": 0.016,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2853916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006785830482840538,
+ "skip_count": 2.0,
+ "step": 1770,
+ "text_loss": 0.550076425075531
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 8.31934253008512,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.06591796875,
+ "learning_rate": 0.0009618064015496149,
+ "loss": 0.0192,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 2857372.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021370256319642067,
+ "skip_count": 3.0,
+ "step": 1772,
+ "text_loss": 0.1988629847764969
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.072265625,
+ "learning_rate": 0.0009616876671168423,
+ "loss": 0.0162,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2861028.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004313841462135315,
+ "skip_count": 1.0,
+ "step": 1774,
+ "text_loss": 0.42581331729888916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.338127384796008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1103515625,
+ "learning_rate": 0.0009615687557644847,
+ "loss": 0.0268,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2864847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025742491707205772,
+ "skip_count": 0.0,
+ "step": 1776,
+ "text_loss": 0.46510905027389526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1494140625,
+ "learning_rate": 0.0009614496675381093,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2867392.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016813480760902166,
+ "skip_count": 0.0,
+ "step": 1778,
+ "text_loss": 0.5922174453735352
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0810546875,
+ "learning_rate": 0.0009613304024833507,
+ "loss": 0.0166,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2871273.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004948933608829975,
+ "skip_count": 0.0,
+ "step": 1780,
+ "text_loss": 0.6776977777481079
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.366304666862343,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.0009612109606459117,
+ "loss": 0.0186,
+ "macro_f1": 1.0,
+ "num_tokens": 2874172.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016950147226452827,
+ "skip_count": 2.0,
+ "step": 1782,
+ "text_loss": 0.48758944869041443
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.375697094217786,
+ "f1_execute": 0.9599999785423279,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.08251953125,
+ "learning_rate": 0.0009610913420715623,
+ "loss": 0.0237,
+ "macro_f1": 0.7644444704055786,
+ "num_tokens": 2877528.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04880943149328232,
+ "skip_count": 1.0,
+ "step": 1784,
+ "text_loss": 0.4404778480529785
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.385089521573232,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 0.0009609715468061411,
+ "loss": 0.0205,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2880627.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004678630735725164,
+ "skip_count": 0.0,
+ "step": 1786,
+ "text_loss": 0.7295402884483337
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.394481948928677,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07958984375,
+ "learning_rate": 0.0009608515748955535,
+ "loss": 0.0205,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2883333.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026695074047893286,
+ "skip_count": 0.0,
+ "step": 1788,
+ "text_loss": 0.9697831273078918
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 8.40387437628412,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.107421875,
+ "learning_rate": 0.000960731426385773,
+ "loss": 0.0157,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 2887444.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.029743613675236702,
+ "skip_count": 2.0,
+ "step": 1790,
+ "text_loss": 0.4737568199634552
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10107421875,
+ "learning_rate": 0.0009606111013228407,
+ "loss": 0.0207,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2890221.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016153788892552257,
+ "skip_count": 0.0,
+ "step": 1792,
+ "text_loss": 0.6693558096885681
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.422659230995011,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08349609375,
+ "learning_rate": 0.0009604905997528655,
+ "loss": 0.02,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2893262.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01965433731675148,
+ "skip_count": 1.0,
+ "step": 1794,
+ "text_loss": 0.45227760076522827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.432051658350455,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08642578125,
+ "learning_rate": 0.0009603699217220239,
+ "loss": 0.0117,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 2896823.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.024017298594117165,
+ "skip_count": 2.0,
+ "step": 1796,
+ "text_loss": 0.48865509033203125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08837890625,
+ "learning_rate": 0.0009602490672765597,
+ "loss": 0.0182,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2899707.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012420224957168102,
+ "skip_count": 0.0,
+ "step": 1798,
+ "text_loss": 0.43292415142059326
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07861328125,
+ "learning_rate": 0.0009601280364627848,
+ "loss": 0.0196,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2902795.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020389219280332327,
+ "skip_count": 0.0,
+ "step": 1800,
+ "text_loss": 0.41021591424942017
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.460228940416789,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009600068293270783,
+ "loss": 0.0142,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2905769.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002006303984671831,
+ "skip_count": 0.0,
+ "step": 1802,
+ "text_loss": 0.46892106533050537
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08740234375,
+ "learning_rate": 0.000959885445915887,
+ "loss": 0.017,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2909475.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003734810510650277,
+ "skip_count": 0.0,
+ "step": 1804,
+ "text_loss": 0.45364710688591003
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 8.479013795127678,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.11669921875,
+ "learning_rate": 0.0009597638862757254,
+ "loss": 0.0182,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 2914348.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.038971323519945145,
+ "skip_count": 2.0,
+ "step": 1806,
+ "text_loss": 0.42913779616355896
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.488406222483123,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.080078125,
+ "learning_rate": 0.0009596421504531751,
+ "loss": 0.0249,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2917467.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04800829663872719,
+ "skip_count": 0.0,
+ "step": 1808,
+ "text_loss": 0.17332297563552856
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 8.497798649838568,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1083984375,
+ "learning_rate": 0.0009595202384948858,
+ "loss": 0.0227,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2920223.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009164143353700638,
+ "skip_count": 0.0,
+ "step": 1810,
+ "text_loss": 0.33740702271461487
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0947265625,
+ "learning_rate": 0.0009593981504475742,
+ "loss": 0.0275,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2923780.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011236993595957756,
+ "skip_count": 2.0,
+ "step": 1812,
+ "text_loss": 0.1609916388988495
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 8.516583504549457,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0009592758863580248,
+ "loss": 0.0259,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2926259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019026532769203186,
+ "skip_count": 2.0,
+ "step": 1814,
+ "text_loss": 0.6460903882980347
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 8.525975931904902,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09814453125,
+ "learning_rate": 0.0009591534462730894,
+ "loss": 0.0206,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 2929173.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0608333982527256,
+ "skip_count": 0.0,
+ "step": 1816,
+ "text_loss": 0.476126492023468
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.000959030830239687,
+ "loss": 0.0175,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2932703.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0093300249427557,
+ "skip_count": 0.0,
+ "step": 1818,
+ "text_loss": 0.5471875667572021
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.544760786615791,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.2001953125,
+ "learning_rate": 0.0009589080383048048,
+ "loss": 0.0235,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2936195.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010434109717607498,
+ "skip_count": 0.0,
+ "step": 1820,
+ "text_loss": 0.5068115592002869
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0986328125,
+ "learning_rate": 0.0009587850705154964,
+ "loss": 0.0291,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2939412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004347751382738352,
+ "skip_count": 0.0,
+ "step": 1822,
+ "text_loss": 0.4241984784603119
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 8.56354564132668,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0859375,
+ "learning_rate": 0.0009586619269188836,
+ "loss": 0.0224,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 2942318.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.034238871186971664,
+ "skip_count": 1.0,
+ "step": 1824,
+ "text_loss": 0.2328975349664688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.572938068682125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11181640625,
+ "learning_rate": 0.0009585386075621553,
+ "loss": 0.027,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2945731.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006097695790231228,
+ "skip_count": 0.0,
+ "step": 1826,
+ "text_loss": 0.22816994786262512
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.582330496037569,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0908203125,
+ "learning_rate": 0.0009584151124925676,
+ "loss": 0.0208,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2948944.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007790776435285807,
+ "skip_count": 1.0,
+ "step": 1828,
+ "text_loss": 0.5009413361549377
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.0009582914417574438,
+ "loss": 0.0145,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2951723.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009144559502601624,
+ "skip_count": 2.0,
+ "step": 1830,
+ "text_loss": 0.1402502954006195
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 8.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 0.0009581675954041751,
+ "loss": 0.0166,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2954726.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006593191530555487,
+ "skip_count": 0.0,
+ "step": 1832,
+ "text_loss": 0.4871736466884613
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.610507778103903,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0869140625,
+ "learning_rate": 0.0009580435734802196,
+ "loss": 0.0206,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2957853.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01241068821400404,
+ "skip_count": 0.0,
+ "step": 1834,
+ "text_loss": 0.30100154876708984
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.619900205459349,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1298828125,
+ "learning_rate": 0.0009579193760331027,
+ "loss": 0.022,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2960783.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002219218760728836,
+ "skip_count": 0.0,
+ "step": 1836,
+ "text_loss": 0.4961516559123993
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.629292632814794,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.12255859375,
+ "learning_rate": 0.0009577950031104169,
+ "loss": 0.0166,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 2963328.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.029363535344600677,
+ "skip_count": 2.0,
+ "step": 1838,
+ "text_loss": 0.42814353108406067
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.25,
+ "avg_layers": 28.0,
+ "epoch": 8.638685060170237,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.1044921875,
+ "learning_rate": 0.0009576704547598226,
+ "loss": 0.0257,
+ "macro_f1": 0.7795917987823486,
+ "num_tokens": 2966108.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0579402856528759,
+ "skip_count": 4.0,
+ "step": 1840,
+ "text_loss": 0.20523512363433838
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 8.648077487525683,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0625,
+ "learning_rate": 0.0009575457310290463,
+ "loss": 0.0121,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2969137.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008810589089989662,
+ "skip_count": 0.0,
+ "step": 1842,
+ "text_loss": 0.6199528574943542
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0009574208319658831,
+ "loss": 0.0208,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 2972407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012295129708945751,
+ "skip_count": 1.0,
+ "step": 1844,
+ "text_loss": 0.66938316822052
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 8.666862342236572,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.000957295757618194,
+ "loss": 0.0152,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 2976045.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06162935495376587,
+ "skip_count": 2.0,
+ "step": 1846,
+ "text_loss": 0.5381782650947571
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0830078125,
+ "learning_rate": 0.0009571705080339079,
+ "loss": 0.0144,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2979025.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003950524143874645,
+ "skip_count": 0.0,
+ "step": 1848,
+ "text_loss": 0.5831671357154846
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11376953125,
+ "learning_rate": 0.0009570450832610208,
+ "loss": 0.0209,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2982276.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010354886762797832,
+ "skip_count": 0.0,
+ "step": 1850,
+ "text_loss": 0.27448201179504395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 8.695039624302906,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.061279296875,
+ "learning_rate": 0.0009569194833475956,
+ "loss": 0.0199,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2985691.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010167439468204975,
+ "skip_count": 0.0,
+ "step": 1852,
+ "text_loss": 0.5264663696289062
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.704432051658351,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1328125,
+ "learning_rate": 0.0009567937083417624,
+ "loss": 0.0194,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2989126.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0371871180832386,
+ "skip_count": 1.0,
+ "step": 1854,
+ "text_loss": 0.2008018046617508
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 0.0009566677582917185,
+ "loss": 0.0184,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 2992814.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010190588422119617,
+ "skip_count": 0.0,
+ "step": 1856,
+ "text_loss": 0.749717116355896
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.72321690636924,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.080078125,
+ "learning_rate": 0.0009565416332457282,
+ "loss": 0.0132,
+ "macro_f1": 0.6538461446762085,
+ "num_tokens": 2995729.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.022285036742687225,
+ "skip_count": 1.0,
+ "step": 1858,
+ "text_loss": 0.5870219469070435
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.732609333724685,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0009564153332521228,
+ "loss": 0.0224,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 2998812.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011050296947360039,
+ "skip_count": 1.0,
+ "step": 1860,
+ "text_loss": 0.8444408774375916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.742001761080129,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06005859375,
+ "learning_rate": 0.0009562888583593005,
+ "loss": 0.0163,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3001799.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007125461008399725,
+ "skip_count": 0.0,
+ "step": 1862,
+ "text_loss": 0.41510361433029175
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.751394188435574,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.0009561622086157272,
+ "loss": 0.0236,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3005088.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0049054501578211784,
+ "skip_count": 0.0,
+ "step": 1864,
+ "text_loss": 0.3801248073577881
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 8.760786615791018,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.054443359375,
+ "learning_rate": 0.000956035384069935,
+ "loss": 0.0238,
+ "macro_f1": 1.0,
+ "num_tokens": 3008178.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005162427201867104,
+ "skip_count": 1.0,
+ "step": 1866,
+ "text_loss": 0.2687684893608093
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.770179043146463,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10400390625,
+ "learning_rate": 0.0009559083847705233,
+ "loss": 0.0214,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3010923.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.028984658420085907,
+ "skip_count": 1.0,
+ "step": 1868,
+ "text_loss": 0.6277349591255188
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 8.779571470501908,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08349609375,
+ "learning_rate": 0.0009557812107661584,
+ "loss": 0.0208,
+ "macro_f1": 1.0,
+ "num_tokens": 3015030.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012200530618429184,
+ "skip_count": 1.0,
+ "step": 1870,
+ "text_loss": 0.6293368339538574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.788963897857352,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11962890625,
+ "learning_rate": 0.0009556538621055739,
+ "loss": 0.0268,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3019067.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06365182995796204,
+ "skip_count": 1.0,
+ "step": 1872,
+ "text_loss": 0.39046618342399597
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 8.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.115234375,
+ "learning_rate": 0.0009555263388375699,
+ "loss": 0.014,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3022166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041703456081449986,
+ "skip_count": 1.0,
+ "step": 1874,
+ "text_loss": 0.42232340574264526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.807748752568243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11572265625,
+ "learning_rate": 0.0009553986410110134,
+ "loss": 0.016,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3025865.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005841755773872137,
+ "skip_count": 0.0,
+ "step": 1876,
+ "text_loss": 0.37600573897361755
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.817141179923686,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09228515625,
+ "learning_rate": 0.0009552707686748388,
+ "loss": 0.0219,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3029950.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05165952071547508,
+ "skip_count": 1.0,
+ "step": 1878,
+ "text_loss": 0.33717799186706543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.826533607279131,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0849609375,
+ "learning_rate": 0.0009551427218780467,
+ "loss": 0.0219,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3033649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.020680008456110954,
+ "skip_count": 2.0,
+ "step": 1880,
+ "text_loss": 0.5011783838272095
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.835926034634575,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.15625,
+ "learning_rate": 0.0009550145006697048,
+ "loss": 0.0217,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3036847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.07626450061798096,
+ "skip_count": 2.0,
+ "step": 1882,
+ "text_loss": 0.3066408336162567
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 8.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 0.0009548861050989482,
+ "loss": 0.0136,
+ "macro_f1": 1.0,
+ "num_tokens": 3040353.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010884666815400124,
+ "skip_count": 1.0,
+ "step": 1884,
+ "text_loss": 0.49779415130615234
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0908203125,
+ "learning_rate": 0.0009547575352149778,
+ "loss": 0.0213,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3043504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006704333238303661,
+ "skip_count": 2.0,
+ "step": 1886,
+ "text_loss": 0.12284614145755768
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 8.86410331670091,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.11474609375,
+ "learning_rate": 0.0009546287910670621,
+ "loss": 0.0211,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 3046422.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04799000173807144,
+ "skip_count": 2.0,
+ "step": 1888,
+ "text_loss": 0.1824081838130951
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.873495744056354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1484375,
+ "learning_rate": 0.0009544998727045361,
+ "loss": 0.0306,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3049819.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008139612153172493,
+ "skip_count": 0.0,
+ "step": 1890,
+ "text_loss": 0.18929053843021393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 8.8828881714118,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.09375,
+ "learning_rate": 0.0009543707801768015,
+ "loss": 0.0175,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 3052766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02966771461069584,
+ "skip_count": 3.0,
+ "step": 1892,
+ "text_loss": 0.247748002409935
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 25.0,
+ "epoch": 8.892280598767243,
+ "f1_execute": 0.9411764740943909,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009542415135333267,
+ "loss": 0.0193,
+ "macro_f1": 0.44705885648727417,
+ "num_tokens": 3056427.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03637036308646202,
+ "skip_count": 2.0,
+ "step": 1894,
+ "text_loss": 0.2583999037742615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.901673026122689,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0595703125,
+ "learning_rate": 0.0009541120728236472,
+ "loss": 0.0136,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3059497.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007026574574410915,
+ "skip_count": 0.0,
+ "step": 1896,
+ "text_loss": 0.5222375988960266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.076171875,
+ "learning_rate": 0.0009539824580973646,
+ "loss": 0.0219,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3062187.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003449335927143693,
+ "skip_count": 0.0,
+ "step": 1898,
+ "text_loss": 0.5736427307128906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0009538526694041477,
+ "loss": 0.0163,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3066100.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035463871899992228,
+ "skip_count": 0.0,
+ "step": 1900,
+ "text_loss": 0.5471583604812622
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 8.929850308189023,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.080078125,
+ "learning_rate": 0.0009537227067937318,
+ "loss": 0.0233,
+ "macro_f1": 1.0,
+ "num_tokens": 3068737.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.00597514258697629,
+ "skip_count": 3.0,
+ "step": 1902,
+ "text_loss": 0.36644190549850464
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.939242735544468,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.166015625,
+ "learning_rate": 0.0009535925703159186,
+ "loss": 0.0301,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3071686.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025420479476451874,
+ "skip_count": 2.0,
+ "step": 1904,
+ "text_loss": 0.535789966583252
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.948635162899912,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07568359375,
+ "learning_rate": 0.0009534622600205769,
+ "loss": 0.0145,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3074954.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014377486892044544,
+ "skip_count": 0.0,
+ "step": 1906,
+ "text_loss": 0.19009549915790558
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.958027590255357,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11083984375,
+ "learning_rate": 0.0009533317759576416,
+ "loss": 0.0197,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3077540.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004848944488912821,
+ "skip_count": 0.0,
+ "step": 1908,
+ "text_loss": 0.5022001266479492
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 8.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.0009532011181771148,
+ "loss": 0.0217,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3080445.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009480170905590057,
+ "skip_count": 2.0,
+ "step": 1910,
+ "text_loss": 0.35135936737060547
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10400390625,
+ "learning_rate": 0.0009530702867290644,
+ "loss": 0.0185,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3083657.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019353039097040892,
+ "skip_count": 0.0,
+ "step": 1912,
+ "text_loss": 0.5123994946479797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.986204872321691,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1455078125,
+ "learning_rate": 0.0009529392816636256,
+ "loss": 0.0249,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3086837.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010921972570940852,
+ "skip_count": 0.0,
+ "step": 1914,
+ "text_loss": 0.44477662444114685
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 8.995597299677135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.19140625,
+ "learning_rate": 0.0009528081030309995,
+ "loss": 0.0351,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3089892.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018027103506028652,
+ "skip_count": 0.0,
+ "step": 1916,
+ "text_loss": 0.7356183528900146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.004696213677722,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07568359375,
+ "learning_rate": 0.0009526767508814542,
+ "loss": 0.0236,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3093058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003243023296818137,
+ "skip_count": 0.0,
+ "step": 1918,
+ "text_loss": 0.48823556303977966
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.014088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.080078125,
+ "learning_rate": 0.0009525452252653239,
+ "loss": 0.0175,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3096404.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009360014460980892,
+ "skip_count": 0.0,
+ "step": 1920,
+ "text_loss": 0.21498437225818634
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 9.023481068388612,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.140625,
+ "learning_rate": 0.0009524135262330098,
+ "loss": 0.0224,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 3099520.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017444295808672905,
+ "skip_count": 3.0,
+ "step": 1922,
+ "text_loss": 0.27608850598335266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 9.032873495744056,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.0009522816538349789,
+ "loss": 0.0162,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3102956.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.06424452364444733,
+ "skip_count": 2.0,
+ "step": 1924,
+ "text_loss": 0.21558666229248047
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 9.042265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0009521496081217651,
+ "loss": 0.0112,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3106565.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002270506462082267,
+ "skip_count": 0.0,
+ "step": 1926,
+ "text_loss": 0.5641813278198242
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.051658350454945,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009520173891439684,
+ "loss": 0.0216,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3109314.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011512448079884052,
+ "skip_count": 1.0,
+ "step": 1928,
+ "text_loss": 0.6351624727249146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0830078125,
+ "learning_rate": 0.0009518849969522556,
+ "loss": 0.0198,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3112956.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003883908037096262,
+ "skip_count": 0.0,
+ "step": 1930,
+ "text_loss": 0.35160085558891296
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.070443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10888671875,
+ "learning_rate": 0.0009517524315973595,
+ "loss": 0.019,
+ "macro_f1": 1.0,
+ "num_tokens": 3115593.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009479222819209099,
+ "skip_count": 3.0,
+ "step": 1932,
+ "text_loss": 0.2900560200214386
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.079835632521279,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0771484375,
+ "learning_rate": 0.0009516196931300794,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3118516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017834696918725967,
+ "skip_count": 2.0,
+ "step": 1934,
+ "text_loss": 0.20094378292560577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12890625,
+ "learning_rate": 0.0009514867816012809,
+ "loss": 0.02,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3122242.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017964740982279181,
+ "skip_count": 0.0,
+ "step": 1936,
+ "text_loss": 0.6498590707778931
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0009513536970618961,
+ "loss": 0.013,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3125645.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007437168620526791,
+ "skip_count": 2.0,
+ "step": 1938,
+ "text_loss": 0.25863033533096313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.108012914587613,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0625,
+ "learning_rate": 0.0009512204395629232,
+ "loss": 0.0184,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3128740.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008759932243265212,
+ "skip_count": 1.0,
+ "step": 1940,
+ "text_loss": 0.5638351440429688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.117405341943059,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.0009510870091554264,
+ "loss": 0.0153,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3131742.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.019906625151634216,
+ "skip_count": 0.0,
+ "step": 1942,
+ "text_loss": 0.8410717844963074
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.126797769298504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12255859375,
+ "learning_rate": 0.0009509534058905369,
+ "loss": 0.016,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3134407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009229081333614886,
+ "skip_count": 0.0,
+ "step": 1944,
+ "text_loss": 0.47506049275398254
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.136190196653947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0009508196298194517,
+ "loss": 0.0123,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3137053.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003630586201325059,
+ "skip_count": 0.0,
+ "step": 1946,
+ "text_loss": 0.32225799560546875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08349609375,
+ "learning_rate": 0.0009506856809934338,
+ "loss": 0.0119,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3140943.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007580445148050785,
+ "skip_count": 0.0,
+ "step": 1948,
+ "text_loss": 0.3120577931404114
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0009505515594638127,
+ "loss": 0.0126,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3144298.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004471861757338047,
+ "skip_count": 0.0,
+ "step": 1950,
+ "text_loss": 0.22052447497844696
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 9.164367478720282,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.09130859375,
+ "learning_rate": 0.0009504172652819843,
+ "loss": 0.023,
+ "macro_f1": 1.0,
+ "num_tokens": 3147069.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009606664068996906,
+ "skip_count": 1.0,
+ "step": 1952,
+ "text_loss": 0.34773921966552734
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0625,
+ "learning_rate": 0.0009502827984994099,
+ "loss": 0.0148,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3149992.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006443799939006567,
+ "skip_count": 1.0,
+ "step": 1954,
+ "text_loss": 0.6442171335220337
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 0.0009501481591676177,
+ "loss": 0.0188,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3153167.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003219039412215352,
+ "skip_count": 0.0,
+ "step": 1956,
+ "text_loss": 0.43369221687316895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.192544760786616,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.000950013347338202,
+ "loss": 0.0152,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3156590.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025551019236445427,
+ "skip_count": 1.0,
+ "step": 1958,
+ "text_loss": 0.294479101896286
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.201937188142061,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1630859375,
+ "learning_rate": 0.0009498783630628225,
+ "loss": 0.0158,
+ "macro_f1": 1.0,
+ "num_tokens": 3159451.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013802438974380493,
+ "skip_count": 2.0,
+ "step": 1960,
+ "text_loss": 0.20888492465019226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.211329615497505,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0009497432063932057,
+ "loss": 0.0137,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 3162889.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02852988988161087,
+ "skip_count": 2.0,
+ "step": 1962,
+ "text_loss": 0.5027125477790833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 0.0009496078773811437,
+ "loss": 0.0136,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3165979.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01784522272646427,
+ "skip_count": 2.0,
+ "step": 1964,
+ "text_loss": 0.1696339100599289
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060302734375,
+ "learning_rate": 0.000949472376078495,
+ "loss": 0.016,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3168683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017019887454807758,
+ "skip_count": 0.0,
+ "step": 1966,
+ "text_loss": 0.48905447125434875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.239506897563839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.000949336702537184,
+ "loss": 0.0108,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3171968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004817947279661894,
+ "skip_count": 2.0,
+ "step": 1968,
+ "text_loss": 0.20984773337841034
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.248899324919284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0009492008568092007,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3175947.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012963006738573313,
+ "skip_count": 0.0,
+ "step": 1970,
+ "text_loss": 0.5215106010437012
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 9.258291752274728,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.044921875,
+ "learning_rate": 0.0009490648389466019,
+ "loss": 0.0135,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 3179348.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03950481489300728,
+ "skip_count": 2.0,
+ "step": 1972,
+ "text_loss": 0.24640929698944092
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.09326171875,
+ "learning_rate": 0.0009489286490015097,
+ "loss": 0.0183,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3182640.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0043345349840819836,
+ "skip_count": 2.0,
+ "step": 1974,
+ "text_loss": 0.6362852454185486
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.277076606985618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07958984375,
+ "learning_rate": 0.0009487922870261122,
+ "loss": 0.0155,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3185657.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015687479171901941,
+ "skip_count": 0.0,
+ "step": 1976,
+ "text_loss": 0.8977144360542297
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.286469034341062,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.061279296875,
+ "learning_rate": 0.0009486557530726638,
+ "loss": 0.0139,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3188772.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010977238416671753,
+ "skip_count": 0.0,
+ "step": 1978,
+ "text_loss": 0.38512736558914185
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 9.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11279296875,
+ "learning_rate": 0.0009485190471934844,
+ "loss": 0.0196,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3193131.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.002264744369313121,
+ "skip_count": 0.0,
+ "step": 1980,
+ "text_loss": 0.4171289801597595
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.305253889051952,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09033203125,
+ "learning_rate": 0.00094838216944096,
+ "loss": 0.0219,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3196668.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.042320676147937775,
+ "skip_count": 1.0,
+ "step": 1982,
+ "text_loss": 0.19008000195026398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 9.314646316407396,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 0.0009482451198675424,
+ "loss": 0.0151,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 3200282.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01796630397439003,
+ "skip_count": 1.0,
+ "step": 1984,
+ "text_loss": 0.5009249448776245
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.324038743762841,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061767578125,
+ "learning_rate": 0.0009481078985257494,
+ "loss": 0.0147,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3204439.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01052347756922245,
+ "skip_count": 1.0,
+ "step": 1986,
+ "text_loss": 0.15319275856018066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.333431171118287,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0732421875,
+ "learning_rate": 0.0009479705054681644,
+ "loss": 0.015,
+ "macro_f1": 0.3076923191547394,
+ "num_tokens": 3207590.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.09640293568372726,
+ "skip_count": 3.0,
+ "step": 1988,
+ "text_loss": 0.3654652535915375
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.34282359847373,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009478329407474366,
+ "loss": 0.0183,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3211172.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012670112773776054,
+ "skip_count": 1.0,
+ "step": 1990,
+ "text_loss": 0.5817596316337585
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 9.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05859375,
+ "learning_rate": 0.000947695204416281,
+ "loss": 0.0121,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3214050.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005263707600533962,
+ "skip_count": 0.0,
+ "step": 1992,
+ "text_loss": 0.5985888242721558
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.361608453184619,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0009475572965274787,
+ "loss": 0.0144,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3217318.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0682850033044815,
+ "skip_count": 0.0,
+ "step": 1994,
+ "text_loss": 0.316506564617157
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.371000880540064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0595703125,
+ "learning_rate": 0.000947419217133876,
+ "loss": 0.019,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3220012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008508823812007904,
+ "skip_count": 2.0,
+ "step": 1996,
+ "text_loss": 0.09665893763303757
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053466796875,
+ "learning_rate": 0.0009472809662883852,
+ "loss": 0.0155,
+ "macro_f1": 1.0,
+ "num_tokens": 3223019.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01100847590714693,
+ "skip_count": 2.0,
+ "step": 1998,
+ "text_loss": 0.4938808083534241
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.389785735250953,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0009471425440439844,
+ "loss": 0.0135,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 3226013.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04953207075595856,
+ "skip_count": 3.0,
+ "step": 2000,
+ "text_loss": 0.22258254885673523
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 9.399178162606399,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07568359375,
+ "learning_rate": 0.0009470039504537173,
+ "loss": 0.0186,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 3230031.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.052884332835674286,
+ "skip_count": 2.0,
+ "step": 2002,
+ "text_loss": 0.1741616576910019
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 9.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0869140625,
+ "learning_rate": 0.0009468651855706931,
+ "loss": 0.0204,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3232991.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008056716993451118,
+ "skip_count": 0.0,
+ "step": 2004,
+ "text_loss": 0.3173636198043823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0009467262494480868,
+ "loss": 0.0136,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3236390.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0053409393876791,
+ "skip_count": 0.0,
+ "step": 2006,
+ "text_loss": 0.5806330442428589
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.427355444672733,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.000946587142139139,
+ "loss": 0.0147,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3239267.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015652200672775507,
+ "skip_count": 0.0,
+ "step": 2008,
+ "text_loss": 0.6214317679405212
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.436747872028178,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11376953125,
+ "learning_rate": 0.000946447863697156,
+ "loss": 0.0151,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 3242569.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011673987843096256,
+ "skip_count": 2.0,
+ "step": 2010,
+ "text_loss": 0.532565712928772
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.446140299383622,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0009463084141755093,
+ "loss": 0.0159,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3245669.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.028480790555477142,
+ "skip_count": 1.0,
+ "step": 2012,
+ "text_loss": 0.25210800766944885
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.455532726739067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0869140625,
+ "learning_rate": 0.0009461687936276364,
+ "loss": 0.0132,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3248751.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007234727032482624,
+ "skip_count": 0.0,
+ "step": 2014,
+ "text_loss": 0.35922971367836
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 9.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0009460290021070402,
+ "loss": 0.0195,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3252614.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.014691276475787163,
+ "skip_count": 0.0,
+ "step": 2016,
+ "text_loss": 0.2747853398323059
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0009458890396672888,
+ "loss": 0.0186,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3256374.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002385235857218504,
+ "skip_count": 0.0,
+ "step": 2018,
+ "text_loss": 0.5268719792366028
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 9.483710008805401,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0009457489063620164,
+ "loss": 0.0133,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 3259792.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.047268565744161606,
+ "skip_count": 2.0,
+ "step": 2020,
+ "text_loss": 0.7785539627075195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.493102436160845,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1494140625,
+ "learning_rate": 0.0009456086022449221,
+ "loss": 0.0218,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3262833.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015878718346357346,
+ "skip_count": 1.0,
+ "step": 2022,
+ "text_loss": 0.42270028591156006
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.50249486351629,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08935546875,
+ "learning_rate": 0.0009454681273697711,
+ "loss": 0.0117,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3265718.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.030749641358852386,
+ "skip_count": 0.0,
+ "step": 2024,
+ "text_loss": 0.18668225407600403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.511887290871735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.0009453274817903931,
+ "loss": 0.012,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3268158.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011538166552782059,
+ "skip_count": 1.0,
+ "step": 2026,
+ "text_loss": 0.34090787172317505
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.521279718227179,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.099609375,
+ "learning_rate": 0.000945186665560684,
+ "loss": 0.0218,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3271082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009527760557830334,
+ "skip_count": 0.0,
+ "step": 2028,
+ "text_loss": 0.2110334187746048
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.530672145582624,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.119140625,
+ "learning_rate": 0.000945045678734605,
+ "loss": 0.0175,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 3273488.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03317151218652725,
+ "skip_count": 3.0,
+ "step": 2030,
+ "text_loss": 0.2233227640390396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.540064572938068,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12451171875,
+ "learning_rate": 0.0009449045213661822,
+ "loss": 0.0201,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3276646.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018510591238737106,
+ "skip_count": 1.0,
+ "step": 2032,
+ "text_loss": 0.16100332140922546
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 9.549457000293513,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.1318359375,
+ "learning_rate": 0.0009447631935095077,
+ "loss": 0.0185,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 3279441.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.028113311156630516,
+ "skip_count": 4.0,
+ "step": 2034,
+ "text_loss": 0.29208317399024963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.558849427648958,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.0009446216952187384,
+ "loss": 0.0164,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3282697.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008379172533750534,
+ "skip_count": 0.0,
+ "step": 2036,
+ "text_loss": 0.16026398539543152
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0009444800265480967,
+ "loss": 0.0178,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3285574.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00941354501992464,
+ "skip_count": 0.0,
+ "step": 2038,
+ "text_loss": 0.29523080587387085
+ },
+ {
+ "acc_repeat": 0.75,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 9.577634282359847,
+ "f1_execute": 0.9230769276618958,
+ "f1_repeat": 0.8571428656578064,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.076171875,
+ "learning_rate": 0.0009443381875518703,
+ "loss": 0.0197,
+ "macro_f1": 0.8600732684135437,
+ "num_tokens": 3289159.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.04974055662751198,
+ "skip_count": 6.0,
+ "step": 2040,
+ "text_loss": 0.23033179342746735
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.587026709715293,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0537109375,
+ "learning_rate": 0.0009441961782844123,
+ "loss": 0.0146,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3293598.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022241825237870216,
+ "skip_count": 1.0,
+ "step": 2042,
+ "text_loss": 0.8299165368080139
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 0.0009440539988001408,
+ "loss": 0.0159,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3296648.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011019332334399223,
+ "skip_count": 0.0,
+ "step": 2044,
+ "text_loss": 0.18207129836082458
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.605811564426181,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0009439116491535394,
+ "loss": 0.0118,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3300058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002889640862122178,
+ "skip_count": 0.0,
+ "step": 2046,
+ "text_loss": 0.7051978707313538
+ },
+ {
+ "acc_repeat": 0.3333333432674408,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 9.615203991781627,
+ "f1_execute": 0.9333333373069763,
+ "f1_repeat": 0.5,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.078125,
+ "learning_rate": 0.0009437691293991563,
+ "loss": 0.0192,
+ "macro_f1": 0.7634921073913574,
+ "num_tokens": 3303296.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.07741832733154297,
+ "skip_count": 4.0,
+ "step": 2048,
+ "text_loss": 0.15563532710075378
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.09521484375,
+ "learning_rate": 0.0009436264395916061,
+ "loss": 0.0209,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3306204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014225383289158344,
+ "skip_count": 2.0,
+ "step": 2050,
+ "text_loss": 0.18117287755012512
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.633988846492516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1416015625,
+ "learning_rate": 0.0009434835797855672,
+ "loss": 0.0165,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3309444.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023932650219649076,
+ "skip_count": 0.0,
+ "step": 2052,
+ "text_loss": 0.4645874798297882
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.643381273847961,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0009433405500357839,
+ "loss": 0.0153,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3312488.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03193361684679985,
+ "skip_count": 1.0,
+ "step": 2054,
+ "text_loss": 0.5291082859039307
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0009431973503970655,
+ "loss": 0.0134,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3315765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020529816392809153,
+ "skip_count": 0.0,
+ "step": 2056,
+ "text_loss": 0.5877931118011475
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.66216612855885,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.0009430539809242864,
+ "loss": 0.0185,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3318877.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.07907948642969131,
+ "skip_count": 0.0,
+ "step": 2058,
+ "text_loss": 0.3836737871170044
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 9.671558555914293,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009429104416723862,
+ "loss": 0.0163,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3322576.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.003006070153787732,
+ "skip_count": 0.0,
+ "step": 2060,
+ "text_loss": 0.3480920195579529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.680950983269739,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 0.0009427667326963689,
+ "loss": 0.0127,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3325974.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005013179033994675,
+ "skip_count": 0.0,
+ "step": 2062,
+ "text_loss": 0.931358814239502
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.690343410625184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0986328125,
+ "learning_rate": 0.0009426228540513047,
+ "loss": 0.0206,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3329398.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0059848143719136715,
+ "skip_count": 0.0,
+ "step": 2064,
+ "text_loss": 0.47568953037261963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.699735837980628,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0830078125,
+ "learning_rate": 0.0009424788057923277,
+ "loss": 0.0131,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3332029.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00783882662653923,
+ "skip_count": 0.0,
+ "step": 2066,
+ "text_loss": 0.22887596487998962
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 9.709128265336073,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0712890625,
+ "learning_rate": 0.0009423345879746376,
+ "loss": 0.0128,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3334858.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01866884157061577,
+ "skip_count": 2.0,
+ "step": 2068,
+ "text_loss": 0.17724967002868652
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.718520692691518,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06591796875,
+ "learning_rate": 0.000942190200653499,
+ "loss": 0.0162,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3338094.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.028636593371629715,
+ "skip_count": 2.0,
+ "step": 2070,
+ "text_loss": 0.34344956278800964
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 9.727913120046962,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.07568359375,
+ "learning_rate": 0.0009420456438842413,
+ "loss": 0.0165,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3340526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023245645686984062,
+ "skip_count": 2.0,
+ "step": 2072,
+ "text_loss": 0.7276164293289185
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.737305547402407,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11328125,
+ "learning_rate": 0.000941900917722259,
+ "loss": 0.0143,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3343303.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01565689593553543,
+ "skip_count": 0.0,
+ "step": 2074,
+ "text_loss": 0.5665070414543152
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1201171875,
+ "learning_rate": 0.0009417560222230115,
+ "loss": 0.0245,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3346409.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035056080669164658,
+ "skip_count": 0.0,
+ "step": 2076,
+ "text_loss": 0.5112795233726501
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.756090402113296,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06982421875,
+ "learning_rate": 0.0009416109574420229,
+ "loss": 0.0132,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3349220.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027565446216613054,
+ "skip_count": 0.0,
+ "step": 2078,
+ "text_loss": 0.5240910053253174
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 9.765482829468741,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.0009414657234348823,
+ "loss": 0.0186,
+ "macro_f1": 1.0,
+ "num_tokens": 3352627.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.01652451977133751,
+ "skip_count": 2.0,
+ "step": 2080,
+ "text_loss": 1.0217112302780151
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.774875256824185,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1630859375,
+ "learning_rate": 0.0009413203202572438,
+ "loss": 0.0179,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3355392.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.1012420505285263,
+ "skip_count": 2.0,
+ "step": 2082,
+ "text_loss": 0.4085482358932495
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08251953125,
+ "learning_rate": 0.000941174747964826,
+ "loss": 0.0154,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3358425.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004962718114256859,
+ "skip_count": 0.0,
+ "step": 2084,
+ "text_loss": 0.5833504796028137
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 9.793660111535075,
+ "f1_execute": 0.9583333134651184,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.11376953125,
+ "learning_rate": 0.0009410290066134124,
+ "loss": 0.0211,
+ "macro_f1": 0.8083333373069763,
+ "num_tokens": 3361925.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.07889176905155182,
+ "skip_count": 3.0,
+ "step": 2086,
+ "text_loss": 0.38126569986343384
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.803052538890519,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0009408830962588517,
+ "loss": 0.0195,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 3365963.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.033715736120939255,
+ "skip_count": 2.0,
+ "step": 2088,
+ "text_loss": 0.23213914036750793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.812444966245964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0732421875,
+ "learning_rate": 0.0009407370169570567,
+ "loss": 0.0169,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3369422.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014188943896442652,
+ "skip_count": 0.0,
+ "step": 2090,
+ "text_loss": 0.4648318886756897
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.82183739360141,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0712890625,
+ "learning_rate": 0.0009405907687640054,
+ "loss": 0.013,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3372506.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015339684672653675,
+ "skip_count": 1.0,
+ "step": 2092,
+ "text_loss": 0.2563800811767578
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 9.831229820956853,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.054443359375,
+ "learning_rate": 0.0009404443517357404,
+ "loss": 0.0146,
+ "macro_f1": 0.542222261428833,
+ "num_tokens": 3375653.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.06562861055135727,
+ "skip_count": 0.0,
+ "step": 2094,
+ "text_loss": 0.797835111618042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.840622248312298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.000940297765928369,
+ "loss": 0.0136,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3379018.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005745889153331518,
+ "skip_count": 0.0,
+ "step": 2096,
+ "text_loss": 0.4238114655017853
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0712890625,
+ "learning_rate": 0.0009401510113980631,
+ "loss": 0.0207,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3382855.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026634482201188803,
+ "skip_count": 0.0,
+ "step": 2098,
+ "text_loss": 0.4967166483402252
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0791015625,
+ "learning_rate": 0.0009400040882010592,
+ "loss": 0.0166,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3386386.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020642587915062904,
+ "skip_count": 0.0,
+ "step": 2100,
+ "text_loss": 0.44390562176704407
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.868799530378633,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056640625,
+ "learning_rate": 0.0009398569963936589,
+ "loss": 0.017,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3389958.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013722737319767475,
+ "skip_count": 1.0,
+ "step": 2102,
+ "text_loss": 0.7207565903663635
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.878191957734076,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08837890625,
+ "learning_rate": 0.0009397097360322276,
+ "loss": 0.017,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3392892.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002051608171314001,
+ "skip_count": 0.0,
+ "step": 2104,
+ "text_loss": 0.3196398913860321
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.887584385089522,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.000939562307173196,
+ "loss": 0.022,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3396636.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007085663266479969,
+ "skip_count": 0.0,
+ "step": 2106,
+ "text_loss": 0.5663776397705078
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 9.896976812444967,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.11328125,
+ "learning_rate": 0.0009394147098730592,
+ "loss": 0.02,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3399475.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019473131746053696,
+ "skip_count": 2.0,
+ "step": 2108,
+ "text_loss": 0.7708223462104797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0009392669441883767,
+ "loss": 0.0134,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3402350.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028328890912234783,
+ "skip_count": 0.0,
+ "step": 2110,
+ "text_loss": 0.5888006091117859
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10693359375,
+ "learning_rate": 0.0009391190101757724,
+ "loss": 0.0166,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3405561.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023098422214388847,
+ "skip_count": 2.0,
+ "step": 2112,
+ "text_loss": 0.09865197539329529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.925154094511301,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10107421875,
+ "learning_rate": 0.000938970907891935,
+ "loss": 0.0247,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3408513.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002896632067859173,
+ "skip_count": 0.0,
+ "step": 2114,
+ "text_loss": 0.6613234281539917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.934546521866745,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0947265625,
+ "learning_rate": 0.0009388226373936179,
+ "loss": 0.0211,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3411195.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015814457088708878,
+ "skip_count": 0.0,
+ "step": 2116,
+ "text_loss": 0.17363053560256958
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.94393894922219,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12451171875,
+ "learning_rate": 0.0009386741987376381,
+ "loss": 0.015,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 3414875.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02676783688366413,
+ "skip_count": 0.0,
+ "step": 2118,
+ "text_loss": 0.674056887626648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 9.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0009385255919808778,
+ "loss": 0.0203,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3418410.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01022857241332531,
+ "skip_count": 1.0,
+ "step": 2120,
+ "text_loss": 0.235092431306839
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 9.962723803933079,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0888671875,
+ "learning_rate": 0.0009383768171802836,
+ "loss": 0.0244,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3421289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013572212308645248,
+ "skip_count": 2.0,
+ "step": 2122,
+ "text_loss": 0.5992844104766846
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 9.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0009382278743928659,
+ "loss": 0.0201,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3424781.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0051873656921088696,
+ "skip_count": 2.0,
+ "step": 2124,
+ "text_loss": 0.29915499687194824
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 9.981508658643968,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.07421875,
+ "learning_rate": 0.0009380787636757001,
+ "loss": 0.0155,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 3427942.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030079292133450508,
+ "skip_count": 4.0,
+ "step": 2126,
+ "text_loss": 0.24181491136550903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 9.990901085999413,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0009379294850859256,
+ "loss": 0.0141,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3431314.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002675612922757864,
+ "skip_count": 0.0,
+ "step": 2128,
+ "text_loss": 0.4669873118400574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0009377800386807465,
+ "loss": 0.0177,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3435020.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009334275498986244,
+ "skip_count": 0.0,
+ "step": 2130,
+ "text_loss": 0.6478219628334045
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 10.009392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.134765625,
+ "learning_rate": 0.0009376304245174306,
+ "loss": 0.0137,
+ "macro_f1": 0.6000000238418579,
+ "num_tokens": 3438276.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.038227908313274384,
+ "skip_count": 2.0,
+ "step": 2132,
+ "text_loss": 0.4401201903820038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.018784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0009374806426533104,
+ "loss": 0.0113,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3440938.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006901399698108435,
+ "skip_count": 0.0,
+ "step": 2134,
+ "text_loss": 0.5948942303657532
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.0009373306931457827,
+ "loss": 0.0121,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3444028.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037061909679323435,
+ "skip_count": 0.0,
+ "step": 2136,
+ "text_loss": 0.5349751114845276
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056884765625,
+ "learning_rate": 0.0009371805760523086,
+ "loss": 0.0111,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3448331.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025877030566334724,
+ "skip_count": 0.0,
+ "step": 2138,
+ "text_loss": 0.4591051936149597
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 10.046962136777223,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.0009370302914304129,
+ "loss": 0.0144,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 3451434.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018742674961686134,
+ "skip_count": 3.0,
+ "step": 2140,
+ "text_loss": 0.23470863699913025
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.056354564132668,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0009368798393376851,
+ "loss": 0.0122,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3454375.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02382594160735607,
+ "skip_count": 1.0,
+ "step": 2142,
+ "text_loss": 0.6077954769134521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 10.065746991488112,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.05517578125,
+ "learning_rate": 0.0009367292198317787,
+ "loss": 0.0164,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3457591.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03331060707569122,
+ "skip_count": 2.0,
+ "step": 2144,
+ "text_loss": 0.3691073954105377
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.075139418843557,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0009365784329704115,
+ "loss": 0.0186,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3460895.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016955457394942641,
+ "skip_count": 0.0,
+ "step": 2146,
+ "text_loss": 0.3947436511516571
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 10.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.0009364274788113651,
+ "loss": 0.0096,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3464101.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006169239990413189,
+ "skip_count": 0.0,
+ "step": 2148,
+ "text_loss": 0.3348555266857147
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 10.093924273554446,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0009362763574124858,
+ "loss": 0.019,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 3467417.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.024033790454268456,
+ "skip_count": 1.0,
+ "step": 2150,
+ "text_loss": 0.496633380651474
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.103316700909891,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0009361250688316829,
+ "loss": 0.0142,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3470917.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024986129719763994,
+ "skip_count": 0.0,
+ "step": 2152,
+ "text_loss": 0.6857671737670898
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 10.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0546875,
+ "learning_rate": 0.0009359736131269312,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3473624.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008183322846889496,
+ "skip_count": 1.0,
+ "step": 2154,
+ "text_loss": 0.13883116841316223
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 10.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0009358219903562684,
+ "loss": 0.0106,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3476472.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011198793537914753,
+ "skip_count": 3.0,
+ "step": 2156,
+ "text_loss": 0.24243666231632233
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.131493982976226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0009356702005777969,
+ "loss": 0.0125,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3479688.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002520184963941574,
+ "skip_count": 0.0,
+ "step": 2158,
+ "text_loss": 0.6407818794250488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.140886410331671,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0791015625,
+ "learning_rate": 0.0009355182438496825,
+ "loss": 0.0142,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3482598.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011065017897635698,
+ "skip_count": 0.0,
+ "step": 2160,
+ "text_loss": 0.7214245796203613
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0009353661202301557,
+ "loss": 0.0144,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3486271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017824085662141442,
+ "skip_count": 0.0,
+ "step": 2162,
+ "text_loss": 0.5140969157218933
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.053466796875,
+ "learning_rate": 0.0009352138297775101,
+ "loss": 0.0145,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3489206.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001542879967018962,
+ "skip_count": 0.0,
+ "step": 2164,
+ "text_loss": 0.7956416606903076
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 10.169063692398003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0771484375,
+ "learning_rate": 0.000935061372550104,
+ "loss": 0.0134,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3492003.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01420794241130352,
+ "skip_count": 3.0,
+ "step": 2166,
+ "text_loss": 0.27489882707595825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 10.178456119753449,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0009349087486063594,
+ "loss": 0.0166,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3494784.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003614309709519148,
+ "skip_count": 1.0,
+ "step": 2168,
+ "text_loss": 0.2962227761745453
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 10.187848547108894,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1259765625,
+ "learning_rate": 0.0009347559580047618,
+ "loss": 0.0175,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 3497886.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.02122853323817253,
+ "skip_count": 4.0,
+ "step": 2170,
+ "text_loss": 0.5919580459594727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 10.197240974464338,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.000934603000803861,
+ "loss": 0.0135,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3500939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02042219042778015,
+ "skip_count": 1.0,
+ "step": 2172,
+ "text_loss": 0.28722381591796875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0009344498770622704,
+ "loss": 0.013,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3504852.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004345106892287731,
+ "skip_count": 0.0,
+ "step": 2174,
+ "text_loss": 0.603236734867096
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.216025829175228,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1064453125,
+ "learning_rate": 0.0009342965868386673,
+ "loss": 0.0101,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3508320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00368050136603415,
+ "skip_count": 0.0,
+ "step": 2176,
+ "text_loss": 0.6020491719245911
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.225418256530672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060302734375,
+ "learning_rate": 0.000934143130191793,
+ "loss": 0.0108,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3511278.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013425769284367561,
+ "skip_count": 0.0,
+ "step": 2178,
+ "text_loss": 0.5954724550247192
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060546875,
+ "learning_rate": 0.000933989507180452,
+ "loss": 0.0149,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3514361.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002896249992772937,
+ "skip_count": 0.0,
+ "step": 2180,
+ "text_loss": 0.39175131916999817
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 10.244203111241562,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 0.0009338357178635135,
+ "loss": 0.0147,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 3517962.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011538350023329258,
+ "skip_count": 1.0,
+ "step": 2182,
+ "text_loss": 0.4482830762863159
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.253595538597006,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0869140625,
+ "learning_rate": 0.0009336817622999093,
+ "loss": 0.011,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3521299.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.022787930443882942,
+ "skip_count": 0.0,
+ "step": 2184,
+ "text_loss": 0.35177817940711975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.262987965952451,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0009335276405486357,
+ "loss": 0.0139,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3524611.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011597735807299614,
+ "skip_count": 1.0,
+ "step": 2186,
+ "text_loss": 0.24868851900100708
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11181640625,
+ "learning_rate": 0.0009333733526687524,
+ "loss": 0.0196,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3528012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014253967441618443,
+ "skip_count": 0.0,
+ "step": 2188,
+ "text_loss": 0.3970910310745239
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.054931640625,
+ "learning_rate": 0.000933218898719383,
+ "loss": 0.0162,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3530908.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001659149187617004,
+ "skip_count": 0.0,
+ "step": 2190,
+ "text_loss": 0.7618573307991028
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0009330642787597141,
+ "loss": 0.0159,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3533993.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005574346985667944,
+ "skip_count": 0.0,
+ "step": 2192,
+ "text_loss": 0.16470147669315338
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.300557675374229,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0791015625,
+ "learning_rate": 0.0009329094928489969,
+ "loss": 0.0121,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3537310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026400673668831587,
+ "skip_count": 0.0,
+ "step": 2194,
+ "text_loss": 0.3400416374206543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 10.309950102729674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0849609375,
+ "learning_rate": 0.0009327545410465452,
+ "loss": 0.0124,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3540045.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008448398672044277,
+ "skip_count": 3.0,
+ "step": 2196,
+ "text_loss": 0.3110542297363281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.31934253008512,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0009325994234117372,
+ "loss": 0.0122,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3544097.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.037553198635578156,
+ "skip_count": 2.0,
+ "step": 2198,
+ "text_loss": 0.36126700043678284
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 10.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09716796875,
+ "learning_rate": 0.000932444140004014,
+ "loss": 0.0124,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3547054.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006464479025453329,
+ "skip_count": 0.0,
+ "step": 2200,
+ "text_loss": 0.4947047233581543
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 10.338127384796008,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1015625,
+ "learning_rate": 0.0009322886908828805,
+ "loss": 0.0138,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3549903.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005384812597185373,
+ "skip_count": 0.0,
+ "step": 2202,
+ "text_loss": 0.5923738479614258
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 10.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0009321330761079052,
+ "loss": 0.0149,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3553745.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015346619300544262,
+ "skip_count": 2.0,
+ "step": 2204,
+ "text_loss": 0.1904175877571106
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 10.356912239506897,
+ "f1_execute": 0.9268292784690857,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 0.00093197729573872,
+ "loss": 0.0203,
+ "macro_f1": 0.8422764539718628,
+ "num_tokens": 3557235.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.1207597479224205,
+ "skip_count": 6.0,
+ "step": 2206,
+ "text_loss": 0.3904837667942047
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.366304666862343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0771484375,
+ "learning_rate": 0.0009318213498350202,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3560795.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003334777895361185,
+ "skip_count": 0.0,
+ "step": 2208,
+ "text_loss": 0.4268290102481842
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.375697094217786,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0537109375,
+ "learning_rate": 0.0009316652384565645,
+ "loss": 0.0123,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3563754.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004230072256177664,
+ "skip_count": 0.0,
+ "step": 2210,
+ "text_loss": 0.40049710869789124
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.385089521573232,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046875,
+ "learning_rate": 0.0009315089616631751,
+ "loss": 0.0106,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3567173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006645230459980667,
+ "skip_count": 0.0,
+ "step": 2212,
+ "text_loss": 0.42568323016166687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.394481948928677,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.0009313525195147376,
+ "loss": 0.0126,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3570831.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0097877848893404,
+ "skip_count": 0.0,
+ "step": 2214,
+ "text_loss": 0.45808279514312744
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 28.0,
+ "epoch": 10.40387437628412,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.5,
+ "grad_norm": 0.076171875,
+ "learning_rate": 0.000931195912071201,
+ "loss": 0.0187,
+ "macro_f1": 0.7018141150474548,
+ "num_tokens": 3573745.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.07351134717464447,
+ "skip_count": 3.0,
+ "step": 2216,
+ "text_loss": 0.285696804523468
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0009310391393925775,
+ "loss": 0.0125,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3576785.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033160944003611803,
+ "skip_count": 0.0,
+ "step": 2218,
+ "text_loss": 0.17516443133354187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 10.422659230995011,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 0.0009308822015389424,
+ "loss": 0.0241,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 3580695.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.052930232137441635,
+ "skip_count": 1.0,
+ "step": 2220,
+ "text_loss": 0.5918155908584595
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 10.432051658350455,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.072265625,
+ "learning_rate": 0.0009307250985704352,
+ "loss": 0.0128,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 3583729.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.025454653427004814,
+ "skip_count": 4.0,
+ "step": 2222,
+ "text_loss": 0.2652169466018677
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.0009305678305472575,
+ "loss": 0.0158,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3586775.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011279845610260963,
+ "skip_count": 0.0,
+ "step": 2224,
+ "text_loss": 0.3511691987514496
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10791015625,
+ "learning_rate": 0.000930410397529675,
+ "loss": 0.017,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3589676.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002700264798477292,
+ "skip_count": 0.0,
+ "step": 2226,
+ "text_loss": 0.24045433104038239
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 10.460228940416789,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 0.000930252799578016,
+ "loss": 0.0146,
+ "macro_f1": 1.0,
+ "num_tokens": 3593242.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00826631672680378,
+ "skip_count": 2.0,
+ "step": 2228,
+ "text_loss": 0.3777645528316498
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 10.469621367772234,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0009300950367526728,
+ "loss": 0.0131,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 3596807.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.036221496760845184,
+ "skip_count": 2.0,
+ "step": 2230,
+ "text_loss": 0.502962589263916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0009299371091141001,
+ "loss": 0.0131,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3600150.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006449893582612276,
+ "skip_count": 0.0,
+ "step": 2232,
+ "text_loss": 0.20256924629211426
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 10.488406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0009297790167228161,
+ "loss": 0.012,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3602988.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007872486487030983,
+ "skip_count": 2.0,
+ "step": 2234,
+ "text_loss": 0.42476826906204224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.497798649838568,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0009296207596394022,
+ "loss": 0.0101,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3606071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.027397040277719498,
+ "skip_count": 2.0,
+ "step": 2236,
+ "text_loss": 0.23432791233062744
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0595703125,
+ "learning_rate": 0.0009294623379245028,
+ "loss": 0.0117,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3609389.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01042645052075386,
+ "skip_count": 0.0,
+ "step": 2238,
+ "text_loss": 0.16665785014629364
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.516583504549457,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 0.0009293037516388252,
+ "loss": 0.0161,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3612105.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012458425480872393,
+ "skip_count": 0.0,
+ "step": 2240,
+ "text_loss": 0.59421306848526
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 10.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0009291450008431404,
+ "loss": 0.0185,
+ "macro_f1": 1.0,
+ "num_tokens": 3615439.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005781981628388166,
+ "skip_count": 1.0,
+ "step": 2242,
+ "text_loss": 0.510798454284668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 10.535368359260346,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.0966796875,
+ "learning_rate": 0.0009289860855982814,
+ "loss": 0.0166,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 3618842.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.031195320188999176,
+ "skip_count": 3.0,
+ "step": 2244,
+ "text_loss": 0.7574363350868225
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.544760786615791,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.0009288270059651454,
+ "loss": 0.0133,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3621823.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001746491645462811,
+ "skip_count": 0.0,
+ "step": 2246,
+ "text_loss": 0.5125683546066284
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 10.554153213971237,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.220703125,
+ "learning_rate": 0.0009286677620046918,
+ "loss": 0.0159,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3624502.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03792348504066467,
+ "skip_count": 2.0,
+ "step": 2248,
+ "text_loss": 0.7533677220344543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07763671875,
+ "learning_rate": 0.0009285083537779429,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3627057.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009684451506473124,
+ "skip_count": 0.0,
+ "step": 2250,
+ "text_loss": 0.2219279706478119
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 10.572938068682125,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.11767578125,
+ "learning_rate": 0.0009283487813459845,
+ "loss": 0.0148,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3629720.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022757573053240776,
+ "skip_count": 2.0,
+ "step": 2252,
+ "text_loss": 0.6903313994407654
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 10.582330496037569,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1376953125,
+ "learning_rate": 0.0009281890447699652,
+ "loss": 0.015,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3633234.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003613058477640152,
+ "skip_count": 0.0,
+ "step": 2254,
+ "text_loss": 0.6278893351554871
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0009280291441110961,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3636289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006214062683284283,
+ "skip_count": 0.0,
+ "step": 2256,
+ "text_loss": 0.3011114001274109
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 10.60111535074846,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0009278690794306517,
+ "loss": 0.014,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3640251.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.052556321024894714,
+ "skip_count": 2.0,
+ "step": 2258,
+ "text_loss": 0.19894185662269592
+ },
+ {
+ "acc_repeat": 0.75,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 10.610507778103903,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.8571428656578064,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08251953125,
+ "learning_rate": 0.0009277088507899689,
+ "loss": 0.0163,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 3643527.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.0572301521897316,
+ "skip_count": 1.0,
+ "step": 2260,
+ "text_loss": 0.5593410134315491
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.619900205459349,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.0009275484582504475,
+ "loss": 0.0104,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3646959.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008010074496269226,
+ "skip_count": 0.0,
+ "step": 2262,
+ "text_loss": 0.2128177285194397
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 28.0,
+ "epoch": 10.629292632814794,
+ "f1_execute": 0.95652174949646,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0009273879018735505,
+ "loss": 0.0138,
+ "macro_f1": 0.8521739840507507,
+ "num_tokens": 3651298.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.035729870200157166,
+ "skip_count": 3.0,
+ "step": 2264,
+ "text_loss": 0.2987811267375946
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.638685060170237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1474609375,
+ "learning_rate": 0.0009272271817208031,
+ "loss": 0.0182,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3655609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002379779238253832,
+ "skip_count": 0.0,
+ "step": 2266,
+ "text_loss": 0.6024088263511658
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0009270662978537939,
+ "loss": 0.0098,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3658444.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008943650871515274,
+ "skip_count": 0.0,
+ "step": 2268,
+ "text_loss": 0.1741207242012024
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 10.657469914881126,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0009269052503341736,
+ "loss": 0.0161,
+ "macro_f1": 0.6595745086669922,
+ "num_tokens": 3662282.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.030201267451047897,
+ "skip_count": 4.0,
+ "step": 2270,
+ "text_loss": 0.7300035953521729
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.666862342236572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0009267440392236562,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3665531.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026635683607310057,
+ "skip_count": 0.0,
+ "step": 2272,
+ "text_loss": 0.31535038352012634
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0009265826645840178,
+ "loss": 0.0151,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3668407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004258926957845688,
+ "skip_count": 0.0,
+ "step": 2274,
+ "text_loss": 0.7272579073905945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 10.68564719694746,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.125,
+ "learning_rate": 0.0009264211264770976,
+ "loss": 0.0154,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 3671503.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.038987524807453156,
+ "skip_count": 4.0,
+ "step": 2276,
+ "text_loss": 0.7488982677459717
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 10.695039624302906,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.099609375,
+ "learning_rate": 0.0009262594249647975,
+ "loss": 0.0164,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3674107.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007211760152131319,
+ "skip_count": 1.0,
+ "step": 2278,
+ "text_loss": 0.1992369294166565
+ },
+ {
+ "acc_repeat": 0.75,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 10.704432051658351,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.8571428656578064,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0546875,
+ "learning_rate": 0.0009260975601090815,
+ "loss": 0.0112,
+ "macro_f1": 0.9446290731430054,
+ "num_tokens": 3677184.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.02538592554628849,
+ "skip_count": 3.0,
+ "step": 2280,
+ "text_loss": 0.46402135491371155
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0009259355319719768,
+ "loss": 0.0162,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3680683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038464947137981653,
+ "skip_count": 0.0,
+ "step": 2282,
+ "text_loss": 0.5804527401924133
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1611328125,
+ "learning_rate": 0.0009257733406155726,
+ "loss": 0.0169,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3683928.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004841136280447245,
+ "skip_count": 0.0,
+ "step": 2284,
+ "text_loss": 0.4834538400173187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0009256109861020212,
+ "loss": 0.0115,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3687101.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002191900508478284,
+ "skip_count": 0.0,
+ "step": 2286,
+ "text_loss": 0.8199604749679565
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 10.742001761080129,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.000925448468493537,
+ "loss": 0.0162,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 3690490.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03488675877451897,
+ "skip_count": 2.0,
+ "step": 2288,
+ "text_loss": 0.33263635635375977
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 10.751394188435574,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0009252857878523971,
+ "loss": 0.0134,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3694109.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002897309372201562,
+ "skip_count": 0.0,
+ "step": 2290,
+ "text_loss": 0.47494807839393616
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 10.760786615791018,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05810546875,
+ "learning_rate": 0.000925122944240941,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3697233.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01842675730586052,
+ "skip_count": 2.0,
+ "step": 2292,
+ "text_loss": 0.14693495631217957
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 10.770179043146463,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0009249599377215707,
+ "loss": 0.0146,
+ "macro_f1": 0.5866667032241821,
+ "num_tokens": 3700376.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04169808700680733,
+ "skip_count": 3.0,
+ "step": 2294,
+ "text_loss": 0.38051268458366394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.779571470501908,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.0009247967683567507,
+ "loss": 0.0112,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3703212.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012183113023638725,
+ "skip_count": 1.0,
+ "step": 2296,
+ "text_loss": 0.23789077997207642
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 10.788963897857352,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.05712890625,
+ "learning_rate": 0.0009246334362090077,
+ "loss": 0.0137,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 3706490.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01880069635808468,
+ "skip_count": 2.0,
+ "step": 2298,
+ "text_loss": 0.29067978262901306
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.798356325212797,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.000924469941340931,
+ "loss": 0.0173,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3709804.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.027359159663319588,
+ "skip_count": 0.0,
+ "step": 2300,
+ "text_loss": 0.67828369140625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.807748752568243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.000924306283815172,
+ "loss": 0.0153,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3712824.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003152279881760478,
+ "skip_count": 0.0,
+ "step": 2302,
+ "text_loss": 0.8333184719085693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 10.817141179923686,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0009241424636944445,
+ "loss": 0.0159,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3715385.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0442950464785099,
+ "skip_count": 2.0,
+ "step": 2304,
+ "text_loss": 0.41893699765205383
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 10.826533607279131,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.058837890625,
+ "learning_rate": 0.0009239784810415249,
+ "loss": 0.0137,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 3719080.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015729321166872978,
+ "skip_count": 2.0,
+ "step": 2306,
+ "text_loss": 0.13360483944416046
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 10.835926034634575,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.06787109375,
+ "learning_rate": 0.0009238143359192514,
+ "loss": 0.0136,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 3722439.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.028816604986786842,
+ "skip_count": 3.0,
+ "step": 2308,
+ "text_loss": 0.39594101905822754
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 10.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.000923650028390525,
+ "loss": 0.0166,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3725092.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036455015651881695,
+ "skip_count": 2.0,
+ "step": 2310,
+ "text_loss": 0.6169708371162415
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 10.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.09814453125,
+ "learning_rate": 0.0009234855585183086,
+ "loss": 0.014,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3728412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007565604057163,
+ "skip_count": 1.0,
+ "step": 2312,
+ "text_loss": 0.21257059276103973
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 28.0,
+ "epoch": 10.86410331670091,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.0009233209263656273,
+ "loss": 0.0184,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 3731467.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.02510629966855049,
+ "skip_count": 3.0,
+ "step": 2314,
+ "text_loss": 0.21639840304851532
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.873495744056354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.057861328125,
+ "learning_rate": 0.0009231561319955684,
+ "loss": 0.0154,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3734906.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00872227642685175,
+ "skip_count": 0.0,
+ "step": 2316,
+ "text_loss": 0.35639774799346924
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08349609375,
+ "learning_rate": 0.0009229911754712815,
+ "loss": 0.0176,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3737943.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004695790819823742,
+ "skip_count": 0.0,
+ "step": 2318,
+ "text_loss": 0.5269573330879211
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.892280598767243,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0009228260568559781,
+ "loss": 0.0115,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3741833.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0217357836663723,
+ "skip_count": 0.0,
+ "step": 2320,
+ "text_loss": 0.5110208988189697
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.901673026122689,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1953125,
+ "learning_rate": 0.0009226607762129322,
+ "loss": 0.0201,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3744642.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05595960095524788,
+ "skip_count": 1.0,
+ "step": 2322,
+ "text_loss": 0.6291998624801636
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056884765625,
+ "learning_rate": 0.0009224953336054796,
+ "loss": 0.0161,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3748127.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0071634589694440365,
+ "skip_count": 0.0,
+ "step": 2324,
+ "text_loss": 0.7404762506484985
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.000922329729097018,
+ "loss": 0.0169,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3751373.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011676300782710314,
+ "skip_count": 0.0,
+ "step": 2326,
+ "text_loss": 0.2915459871292114
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.929850308189023,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.061279296875,
+ "learning_rate": 0.0009221639627510075,
+ "loss": 0.0126,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3754518.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01039792038500309,
+ "skip_count": 0.0,
+ "step": 2328,
+ "text_loss": 0.22066321969032288
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0009219980346309702,
+ "loss": 0.0128,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3757621.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032070958986878395,
+ "skip_count": 0.0,
+ "step": 2330,
+ "text_loss": 0.5558560490608215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.948635162899912,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.076171875,
+ "learning_rate": 0.0009218319448004899,
+ "loss": 0.0118,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3760885.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007085457909852266,
+ "skip_count": 0.0,
+ "step": 2332,
+ "text_loss": 0.4348253607749939
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 10.958027590255357,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1103515625,
+ "learning_rate": 0.0009216656933232129,
+ "loss": 0.016,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3764462.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005504854489117861,
+ "skip_count": 1.0,
+ "step": 2334,
+ "text_loss": 0.35828644037246704
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0009214992802628463,
+ "loss": 0.0131,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3767159.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013970810687169433,
+ "skip_count": 0.0,
+ "step": 2336,
+ "text_loss": 0.2956557869911194
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.976812444966246,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.0009213327056831607,
+ "loss": 0.0181,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3770408.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0427570566534996,
+ "skip_count": 1.0,
+ "step": 2338,
+ "text_loss": 0.14883014559745789
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.986204872321691,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0009211659696479875,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3773474.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011273405980318785,
+ "skip_count": 0.0,
+ "step": 2340,
+ "text_loss": 0.26011669635772705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 10.995597299677135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.059814453125,
+ "learning_rate": 0.00092099907222122,
+ "loss": 0.0148,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3776909.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016178421210497618,
+ "skip_count": 0.0,
+ "step": 2342,
+ "text_loss": 0.49078530073165894
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.004696213677722,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.000920832013466814,
+ "loss": 0.0129,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3780741.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005510095041245222,
+ "skip_count": 0.0,
+ "step": 2344,
+ "text_loss": 0.4870249927043915
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.014088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0009206647934487866,
+ "loss": 0.0114,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3784673.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0047357892617583275,
+ "skip_count": 0.0,
+ "step": 2346,
+ "text_loss": 0.3251725733280182
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0009204974122312167,
+ "loss": 0.0142,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3787503.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00795028731226921,
+ "skip_count": 1.0,
+ "step": 2348,
+ "text_loss": 0.18282145261764526
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060546875,
+ "learning_rate": 0.0009203298698782452,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3790528.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0009506374481134117,
+ "skip_count": 0.0,
+ "step": 2350,
+ "text_loss": 0.4093080461025238
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.042265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0009201621664540747,
+ "loss": 0.0155,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3794134.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005159572698175907,
+ "skip_count": 0.0,
+ "step": 2352,
+ "text_loss": 0.5451981425285339
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.051658350454945,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0009199943020229694,
+ "loss": 0.0148,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3797414.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002356168581172824,
+ "skip_count": 0.0,
+ "step": 2354,
+ "text_loss": 0.3070453405380249
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0810546875,
+ "learning_rate": 0.0009198262766492554,
+ "loss": 0.0141,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3800094.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0051761893555521965,
+ "skip_count": 1.0,
+ "step": 2356,
+ "text_loss": 0.5880904197692871
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.070443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.00091965809039732,
+ "loss": 0.0132,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3803280.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025952060241252184,
+ "skip_count": 0.0,
+ "step": 2358,
+ "text_loss": 0.5210731625556946
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.079835632521279,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06787109375,
+ "learning_rate": 0.0009194897433316127,
+ "loss": 0.0125,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3805866.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0042560105212032795,
+ "skip_count": 2.0,
+ "step": 2360,
+ "text_loss": 0.6472984552383423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07568359375,
+ "learning_rate": 0.0009193212355166446,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3808952.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026232977397739887,
+ "skip_count": 0.0,
+ "step": 2362,
+ "text_loss": 0.450063556432724
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0009191525670169881,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3812080.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034355956595391035,
+ "skip_count": 0.0,
+ "step": 2364,
+ "text_loss": 0.49727216362953186
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.108012914587613,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.000918983737897277,
+ "loss": 0.0112,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3815282.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0055653867311775684,
+ "skip_count": 1.0,
+ "step": 2366,
+ "text_loss": 0.6336377859115601
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 11.117405341943059,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0009188147482222071,
+ "loss": 0.008,
+ "macro_f1": 1.0,
+ "num_tokens": 3818106.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.011016021482646465,
+ "skip_count": 2.0,
+ "step": 2368,
+ "text_loss": 0.22513329982757568
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.126797769298504,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0009186455980565358,
+ "loss": 0.0105,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3821228.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.014039464294910431,
+ "skip_count": 0.0,
+ "step": 2370,
+ "text_loss": 0.21331638097763062
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.136190196653947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0009184762874650816,
+ "loss": 0.0128,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3825048.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001088051125407219,
+ "skip_count": 0.0,
+ "step": 2372,
+ "text_loss": 0.6031543612480164
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.095703125,
+ "learning_rate": 0.0009183068165127245,
+ "loss": 0.013,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3828781.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006263940595090389,
+ "skip_count": 1.0,
+ "step": 2374,
+ "text_loss": 0.6249601244926453
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06982421875,
+ "learning_rate": 0.0009181371852644062,
+ "loss": 0.0133,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3832507.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001987969037145376,
+ "skip_count": 0.0,
+ "step": 2376,
+ "text_loss": 0.37972065806388855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.164367478720282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0908203125,
+ "learning_rate": 0.0009179673937851299,
+ "loss": 0.0158,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3835644.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007635094691067934,
+ "skip_count": 1.0,
+ "step": 2378,
+ "text_loss": 0.46319663524627686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0830078125,
+ "learning_rate": 0.0009177974421399598,
+ "loss": 0.0137,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3838700.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01617279462516308,
+ "skip_count": 2.0,
+ "step": 2380,
+ "text_loss": 0.32141056656837463
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 0.0009176273303940217,
+ "loss": 0.011,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3841953.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022273799404501915,
+ "skip_count": 2.0,
+ "step": 2382,
+ "text_loss": 0.5908139944076538
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.192544760786616,
+ "f1_execute": 0.9629629850387573,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0009174570586125026,
+ "loss": 0.0122,
+ "macro_f1": 0.32098767161369324,
+ "num_tokens": 3845763.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.030915161594748497,
+ "skip_count": 0.0,
+ "step": 2384,
+ "text_loss": 0.41400137543678284
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.201937188142061,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.0009172866268606513,
+ "loss": 0.0122,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3848984.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010480951517820358,
+ "skip_count": 2.0,
+ "step": 2386,
+ "text_loss": 0.2560874819755554
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 0.0009171160352037775,
+ "loss": 0.0124,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3852118.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00809961836785078,
+ "skip_count": 1.0,
+ "step": 2388,
+ "text_loss": 0.28236693143844604
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 11.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0009169452837072521,
+ "loss": 0.0105,
+ "macro_f1": 1.0,
+ "num_tokens": 3855314.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005569872446358204,
+ "skip_count": 1.0,
+ "step": 2390,
+ "text_loss": 0.4578137695789337
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1123046875,
+ "learning_rate": 0.0009167743724365073,
+ "loss": 0.0105,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3858301.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038610948249697685,
+ "skip_count": 1.0,
+ "step": 2392,
+ "text_loss": 0.14082716405391693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.239506897563839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1376953125,
+ "learning_rate": 0.0009166033014570368,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3861296.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017607157351449132,
+ "skip_count": 0.0,
+ "step": 2394,
+ "text_loss": 0.384442001581192
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 11.248899324919284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.0009164320708343954,
+ "loss": 0.0131,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3863985.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009627950377762318,
+ "skip_count": 0.0,
+ "step": 2396,
+ "text_loss": 0.6969521045684814
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.258291752274728,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0009162606806341989,
+ "loss": 0.0107,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3866636.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006915586534887552,
+ "skip_count": 0.0,
+ "step": 2398,
+ "text_loss": 0.48069697618484497
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.0009160891309221242,
+ "loss": 0.0149,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3870867.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013031222624704242,
+ "skip_count": 0.0,
+ "step": 2400,
+ "text_loss": 0.3882075846195221
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.277076606985618,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0009159174217639096,
+ "loss": 0.0112,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 3873663.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.06621067970991135,
+ "skip_count": 1.0,
+ "step": 2402,
+ "text_loss": 0.5740041136741638
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.286469034341062,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0009157455532253547,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3876788.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005957918707281351,
+ "skip_count": 0.0,
+ "step": 2404,
+ "text_loss": 0.26025933027267456
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 11.295861461696507,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.08642578125,
+ "learning_rate": 0.0009155735253723191,
+ "loss": 0.0126,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 3879942.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.039429809898138046,
+ "skip_count": 4.0,
+ "step": 2406,
+ "text_loss": 1.1349908113479614
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.305253889051952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0009154013382707251,
+ "loss": 0.0113,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3882682.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012570557883009315,
+ "skip_count": 0.0,
+ "step": 2408,
+ "text_loss": 0.5611135363578796
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.314646316407396,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0009152289919865543,
+ "loss": 0.0123,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3886425.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017455556662753224,
+ "skip_count": 0.0,
+ "step": 2410,
+ "text_loss": 0.7523751854896545
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.324038743762841,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0009150564865858506,
+ "loss": 0.0114,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3889273.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011178011074662209,
+ "skip_count": 1.0,
+ "step": 2412,
+ "text_loss": 0.26942551136016846
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 11.333431171118287,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.0009148838221347182,
+ "loss": 0.0107,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 3892199.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.019628092646598816,
+ "skip_count": 0.0,
+ "step": 2414,
+ "text_loss": 0.5492315888404846
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0009147109986993225,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3895362.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012255983427166939,
+ "skip_count": 0.0,
+ "step": 2416,
+ "text_loss": 0.23798216879367828
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11669921875,
+ "learning_rate": 0.0009145380163458899,
+ "loss": 0.0178,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3898476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007018954027444124,
+ "skip_count": 0.0,
+ "step": 2418,
+ "text_loss": 0.1923145055770874
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.361608453184619,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0009143648751407074,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3901817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008574824314564466,
+ "skip_count": 0.0,
+ "step": 2420,
+ "text_loss": 0.4001806974411011
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 11.371000880540064,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.11328125,
+ "learning_rate": 0.0009141915751501231,
+ "loss": 0.0102,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3905461.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01572350226342678,
+ "skip_count": 2.0,
+ "step": 2422,
+ "text_loss": 0.19519129395484924
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0009140181164405458,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3908878.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005503420252352953,
+ "skip_count": 0.0,
+ "step": 2424,
+ "text_loss": 0.6937088370323181
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0009138444990784454,
+ "loss": 0.013,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3912053.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007556677330285311,
+ "skip_count": 0.0,
+ "step": 2426,
+ "text_loss": 0.35431069135665894
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.399178162606399,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 0.000913670723130352,
+ "loss": 0.0117,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3915192.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013609991874545813,
+ "skip_count": 0.0,
+ "step": 2428,
+ "text_loss": 0.5171207189559937
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 11.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0009134967886628573,
+ "loss": 0.0115,
+ "macro_f1": 1.0,
+ "num_tokens": 3917927.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010895746760070324,
+ "skip_count": 2.0,
+ "step": 2430,
+ "text_loss": 0.2852934002876282
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.417963017317287,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0009133226957426133,
+ "loss": 0.0132,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3921460.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04196908697485924,
+ "skip_count": 0.0,
+ "step": 2432,
+ "text_loss": 0.4864770770072937
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.427355444672733,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1025390625,
+ "learning_rate": 0.0009131484444363324,
+ "loss": 0.0155,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3924662.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004484197124838829,
+ "skip_count": 0.0,
+ "step": 2434,
+ "text_loss": 0.7568684220314026
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.0009129740348107882,
+ "loss": 0.0114,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3927337.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004351360257714987,
+ "skip_count": 2.0,
+ "step": 2436,
+ "text_loss": 0.5953161716461182
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 11.446140299383622,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 0.0009127994669328151,
+ "loss": 0.0085,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 3930407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01664198748767376,
+ "skip_count": 4.0,
+ "step": 2438,
+ "text_loss": 0.5320524573326111
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.455532726739067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0595703125,
+ "learning_rate": 0.0009126247408693071,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3933184.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017819046042859554,
+ "skip_count": 1.0,
+ "step": 2440,
+ "text_loss": 0.6051273345947266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0009124498566872204,
+ "loss": 0.0105,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3936620.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005519696045666933,
+ "skip_count": 0.0,
+ "step": 2442,
+ "text_loss": 0.12987950444221497
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.474317581449956,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 0.0009122748144535704,
+ "loss": 0.0111,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3940010.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04543351009488106,
+ "skip_count": 2.0,
+ "step": 2444,
+ "text_loss": 0.4642033576965332
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.483710008805401,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0009120996142354338,
+ "loss": 0.0121,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3943135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00550565542653203,
+ "skip_count": 0.0,
+ "step": 2446,
+ "text_loss": 0.5697627067565918
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.493102436160845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 0.0009119242560999477,
+ "loss": 0.0132,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3946650.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008842485956847668,
+ "skip_count": 0.0,
+ "step": 2448,
+ "text_loss": 0.17046524584293365
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08154296875,
+ "learning_rate": 0.0009117487401143095,
+ "loss": 0.0154,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3949470.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005900127813220024,
+ "skip_count": 0.0,
+ "step": 2450,
+ "text_loss": 0.37260866165161133
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 11.511887290871735,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0009115730663457773,
+ "loss": 0.0137,
+ "macro_f1": 1.0,
+ "num_tokens": 3952546.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003409258322790265,
+ "skip_count": 1.0,
+ "step": 2452,
+ "text_loss": 0.5308008193969727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.521279718227179,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0009113972348616698,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3955817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010098597034811974,
+ "skip_count": 1.0,
+ "step": 2454,
+ "text_loss": 0.39226648211479187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 11.530672145582624,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1640625,
+ "learning_rate": 0.0009112212457293658,
+ "loss": 0.0102,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3958911.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.08184818178415298,
+ "skip_count": 0.0,
+ "step": 2456,
+ "text_loss": 0.45411455631256104
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0009110450990163047,
+ "loss": 0.0127,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3962584.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009352223132736981,
+ "skip_count": 0.0,
+ "step": 2458,
+ "text_loss": 0.47292324900627136
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.549457000293513,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0009108687947899863,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 3965597.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008150188252329826,
+ "skip_count": 2.0,
+ "step": 2460,
+ "text_loss": 0.33208340406417847
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 11.558849427648958,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0009106923331179707,
+ "loss": 0.0125,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 3968664.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.050999004393815994,
+ "skip_count": 2.0,
+ "step": 2462,
+ "text_loss": 0.2459995150566101
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0009105157140678782,
+ "loss": 0.0126,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 3971772.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006196586415171623,
+ "skip_count": 1.0,
+ "step": 2464,
+ "text_loss": 0.23956991732120514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.577634282359847,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0009103389377073896,
+ "loss": 0.01,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3976224.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008181816898286343,
+ "skip_count": 0.0,
+ "step": 2466,
+ "text_loss": 0.3235875070095062
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.057373046875,
+ "learning_rate": 0.0009101620041042462,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3978876.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015451472718268633,
+ "skip_count": 0.0,
+ "step": 2468,
+ "text_loss": 0.4038759469985962
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.596419137070736,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09130859375,
+ "learning_rate": 0.000909984913326249,
+ "loss": 0.0131,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3981992.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021785033866763115,
+ "skip_count": 1.0,
+ "step": 2470,
+ "text_loss": 0.6346460580825806
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.605811564426181,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0712890625,
+ "learning_rate": 0.0009098076654412595,
+ "loss": 0.0094,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3984560.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011462471447885036,
+ "skip_count": 0.0,
+ "step": 2472,
+ "text_loss": 0.3449646532535553
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0009096302605171996,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3987548.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014367027906700969,
+ "skip_count": 0.0,
+ "step": 2474,
+ "text_loss": 0.5918350219726562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 0.0009094526986220513,
+ "loss": 0.0124,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 3990727.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008977655088528991,
+ "skip_count": 0.0,
+ "step": 2476,
+ "text_loss": 0.463350385427475
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.633988846492516,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0009092749798238563,
+ "loss": 0.015,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 3993757.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016712551936507225,
+ "skip_count": 0.0,
+ "step": 2478,
+ "text_loss": 0.5621229410171509
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.643381273847961,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.000909097104190717,
+ "loss": 0.0172,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 3997259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04134179651737213,
+ "skip_count": 2.0,
+ "step": 2480,
+ "text_loss": 0.375476598739624
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0009089190717907956,
+ "loss": 0.0117,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4000563.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003462378401309252,
+ "skip_count": 0.0,
+ "step": 2482,
+ "text_loss": 0.5553798675537109
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06982421875,
+ "learning_rate": 0.0009087408826923146,
+ "loss": 0.0182,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4004065.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008057428523898125,
+ "skip_count": 2.0,
+ "step": 2484,
+ "text_loss": 0.4329465329647064
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.671558555914293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0009085625369635564,
+ "loss": 0.0114,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4007119.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005759050603955984,
+ "skip_count": 0.0,
+ "step": 2486,
+ "text_loss": 0.501268744468689
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.680950983269739,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1240234375,
+ "learning_rate": 0.0009083840346728631,
+ "loss": 0.0122,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4010547.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.020763102918863297,
+ "skip_count": 0.0,
+ "step": 2488,
+ "text_loss": 0.480196475982666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.690343410625184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.0009082053758886374,
+ "loss": 0.0117,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4014600.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005801836494356394,
+ "skip_count": 1.0,
+ "step": 2490,
+ "text_loss": 0.18249782919883728
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 11.699735837980628,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0009080265606793416,
+ "loss": 0.0128,
+ "macro_f1": 1.0,
+ "num_tokens": 4017964.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004226063843816519,
+ "skip_count": 1.0,
+ "step": 2492,
+ "text_loss": 0.6573076248168945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.000907847589113498,
+ "loss": 0.0125,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4020694.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004281101748347282,
+ "skip_count": 2.0,
+ "step": 2494,
+ "text_loss": 0.3944586217403412
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.718520692691518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061279296875,
+ "learning_rate": 0.000907668461259689,
+ "loss": 0.0152,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4023757.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008786370046436787,
+ "skip_count": 1.0,
+ "step": 2496,
+ "text_loss": 0.6452898979187012
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.727913120046962,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0009074891771865566,
+ "loss": 0.0125,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4026601.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005209595896303654,
+ "skip_count": 0.0,
+ "step": 2498,
+ "text_loss": 0.9633619785308838
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 11.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0009073097369628028,
+ "loss": 0.013,
+ "macro_f1": 1.0,
+ "num_tokens": 4030321.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.00860709697008133,
+ "skip_count": 1.0,
+ "step": 2500,
+ "text_loss": 0.48566827178001404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0009071301406571893,
+ "loss": 0.0132,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4033234.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035277456045150757,
+ "skip_count": 0.0,
+ "step": 2502,
+ "text_loss": 0.3771554231643677
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.756090402113296,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.000906950388338538,
+ "loss": 0.0136,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4036417.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013424850767478347,
+ "skip_count": 0.0,
+ "step": 2504,
+ "text_loss": 0.8962806463241577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.765482829468741,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09912109375,
+ "learning_rate": 0.0009067704800757301,
+ "loss": 0.0095,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4039564.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010423909407109022,
+ "skip_count": 0.0,
+ "step": 2506,
+ "text_loss": 0.43170279264450073
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.774875256824185,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.000906590415937707,
+ "loss": 0.0094,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4043212.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021780289709568024,
+ "skip_count": 1.0,
+ "step": 2508,
+ "text_loss": 0.41495826840400696
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0009064101959934696,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4046687.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007261929102241993,
+ "skip_count": 1.0,
+ "step": 2510,
+ "text_loss": 0.21821187436580658
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.057861328125,
+ "learning_rate": 0.0009062298203120783,
+ "loss": 0.0102,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4050735.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007447180338203907,
+ "skip_count": 2.0,
+ "step": 2512,
+ "text_loss": 0.1818767935037613
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.803052538890519,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 0.0009060492889626535,
+ "loss": 0.0142,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4054426.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0718490406870842,
+ "skip_count": 0.0,
+ "step": 2514,
+ "text_loss": 0.22798970341682434
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.812444966245964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.099609375,
+ "learning_rate": 0.0009058686020143753,
+ "loss": 0.0183,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4057615.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0052676633931696415,
+ "skip_count": 0.0,
+ "step": 2516,
+ "text_loss": 0.1712338626384735
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0009056877595364832,
+ "loss": 0.0137,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4060338.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018052728846669197,
+ "skip_count": 0.0,
+ "step": 2518,
+ "text_loss": 0.6811438798904419
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.083984375,
+ "learning_rate": 0.0009055067615982761,
+ "loss": 0.0113,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4062887.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009029926732182503,
+ "skip_count": 0.0,
+ "step": 2520,
+ "text_loss": 0.5480356812477112
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.840622248312298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.0009053256082691133,
+ "loss": 0.0106,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4065357.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027515271212905645,
+ "skip_count": 0.0,
+ "step": 2522,
+ "text_loss": 0.5234101414680481
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.0009051442996184127,
+ "loss": 0.0174,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4068111.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002199822571128607,
+ "skip_count": 0.0,
+ "step": 2524,
+ "text_loss": 0.2418575882911682
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0625,
+ "learning_rate": 0.0009049628357156521,
+ "loss": 0.0143,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4071284.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006303096655756235,
+ "skip_count": 2.0,
+ "step": 2526,
+ "text_loss": 0.7948065996170044
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.868799530378633,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.000904781216630369,
+ "loss": 0.0068,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 4074750.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01791904680430889,
+ "skip_count": 2.0,
+ "step": 2528,
+ "text_loss": 0.809726357460022
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 11.878191957734076,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0009045994424321602,
+ "loss": 0.0102,
+ "macro_f1": 1.0,
+ "num_tokens": 4078617.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.016553178429603577,
+ "skip_count": 2.0,
+ "step": 2530,
+ "text_loss": 0.8755000829696655
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.887584385089522,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.061767578125,
+ "learning_rate": 0.0009044175131906817,
+ "loss": 0.0145,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4080936.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00884837657213211,
+ "skip_count": 0.0,
+ "step": 2532,
+ "text_loss": 0.795871913433075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.896976812444967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 0.0009042354289756491,
+ "loss": 0.0122,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4084459.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024387789890170097,
+ "skip_count": 0.0,
+ "step": 2534,
+ "text_loss": 0.18875400722026825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0625,
+ "learning_rate": 0.0009040531898568379,
+ "loss": 0.0171,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4088464.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00491489190608263,
+ "skip_count": 0.0,
+ "step": 2536,
+ "text_loss": 0.334369033575058
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 11.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.091796875,
+ "learning_rate": 0.000903870795904082,
+ "loss": 0.0145,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4091659.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004592662677168846,
+ "skip_count": 2.0,
+ "step": 2538,
+ "text_loss": 0.21298295259475708
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 11.925154094511301,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 0.000903688247187275,
+ "loss": 0.0137,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4095496.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011647242121398449,
+ "skip_count": 2.0,
+ "step": 2540,
+ "text_loss": 0.2985081672668457
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.934546521866745,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0009035055437763704,
+ "loss": 0.0124,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4098663.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021238960325717926,
+ "skip_count": 0.0,
+ "step": 2542,
+ "text_loss": 0.35359489917755127
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 11.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05859375,
+ "learning_rate": 0.0009033226857413803,
+ "loss": 0.0163,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4101588.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024701557122170925,
+ "skip_count": 0.0,
+ "step": 2544,
+ "text_loss": 1.1577601432800293
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.080078125,
+ "learning_rate": 0.000903139673152376,
+ "loss": 0.012,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4104643.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002499542199075222,
+ "skip_count": 0.0,
+ "step": 2546,
+ "text_loss": 1.0173401832580566
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.962723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.059814453125,
+ "learning_rate": 0.0009029565060794885,
+ "loss": 0.0165,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4109247.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034200598020106554,
+ "skip_count": 0.0,
+ "step": 2548,
+ "text_loss": 0.5690504312515259
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 11.972116231288524,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.0009027731845929079,
+ "loss": 0.0155,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 4112597.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015981333330273628,
+ "skip_count": 1.0,
+ "step": 2550,
+ "text_loss": 0.294549822807312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 11.981508658643968,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.06103515625,
+ "learning_rate": 0.0009025897087628829,
+ "loss": 0.0064,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4115844.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02606951631605625,
+ "skip_count": 2.0,
+ "step": 2552,
+ "text_loss": 0.22692419588565826
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 11.990901085999413,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.080078125,
+ "learning_rate": 0.0009024060786597222,
+ "loss": 0.0202,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4118634.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001026194542646408,
+ "skip_count": 0.0,
+ "step": 2554,
+ "text_loss": 0.6807059645652771
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.000902222294353793,
+ "loss": 0.0124,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4122024.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001974924933165312,
+ "skip_count": 0.0,
+ "step": 2556,
+ "text_loss": 0.7373668551445007
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.009392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04833984375,
+ "learning_rate": 0.0009020383559155219,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 4124803.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004662613850086927,
+ "skip_count": 2.0,
+ "step": 2558,
+ "text_loss": 0.21808166801929474
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.018784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0009018542634153943,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4127680.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006881687790155411,
+ "skip_count": 0.0,
+ "step": 2560,
+ "text_loss": 0.25192978978157043
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 12.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0009016700169239551,
+ "loss": 0.0105,
+ "macro_f1": 1.0,
+ "num_tokens": 4130431.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005977808032184839,
+ "skip_count": 1.0,
+ "step": 2562,
+ "text_loss": 0.4700816869735718
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0009014856165118075,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4133535.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007005698047578335,
+ "skip_count": 1.0,
+ "step": 2564,
+ "text_loss": 0.6558199524879456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.046962136777223,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0009013010622496144,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4136534.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007262171246111393,
+ "skip_count": 0.0,
+ "step": 2566,
+ "text_loss": 0.2565421462059021
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 12.056354564132668,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0009011163542080971,
+ "loss": 0.0088,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 4139762.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.05431923270225525,
+ "skip_count": 3.0,
+ "step": 2568,
+ "text_loss": 0.19896510243415833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0009009314924580363,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4143398.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003667369019240141,
+ "skip_count": 0.0,
+ "step": 2570,
+ "text_loss": 0.6581419110298157
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.075139418843557,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 0.0009007464770702712,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4146248.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00132099783513695,
+ "skip_count": 0.0,
+ "step": 2572,
+ "text_loss": 0.5316711068153381
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0009005613081157002,
+ "loss": 0.0132,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4149455.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020061524119228125,
+ "skip_count": 0.0,
+ "step": 2574,
+ "text_loss": 0.5400773882865906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05517578125,
+ "learning_rate": 0.0009003759856652802,
+ "loss": 0.0111,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4152774.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002621434163302183,
+ "skip_count": 1.0,
+ "step": 2576,
+ "text_loss": 0.3672606945037842
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.103316700909891,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0009001905097900273,
+ "loss": 0.0121,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4155835.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005290219560265541,
+ "skip_count": 0.0,
+ "step": 2578,
+ "text_loss": 0.8159038424491882
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0009000048805610161,
+ "loss": 0.0119,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4158874.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013576085912063718,
+ "skip_count": 0.0,
+ "step": 2580,
+ "text_loss": 0.5518951416015625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.138671875,
+ "learning_rate": 0.00089981909804938,
+ "loss": 0.0143,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4162076.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021483441814780235,
+ "skip_count": 0.0,
+ "step": 2582,
+ "text_loss": 0.43552228808403015
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.25,
+ "avg_layers": 28.0,
+ "epoch": 12.131493982976226,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0008996331623263114,
+ "loss": 0.0117,
+ "macro_f1": 0.7795917987823486,
+ "num_tokens": 4165041.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0544300302863121,
+ "skip_count": 4.0,
+ "step": 2584,
+ "text_loss": 0.24812501668930054
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.140886410331671,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0008994470734630611,
+ "loss": 0.0101,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4168290.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017150711501017213,
+ "skip_count": 0.0,
+ "step": 2586,
+ "text_loss": 0.6392097473144531
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 12.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0008992608315309388,
+ "loss": 0.015,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4171310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0046473173424601555,
+ "skip_count": 2.0,
+ "step": 2588,
+ "text_loss": 0.6534156799316406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.15967126504256,
+ "f1_execute": 0.943396270275116,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06591796875,
+ "learning_rate": 0.0008990744366013125,
+ "loss": 0.0105,
+ "macro_f1": 0.3144654333591461,
+ "num_tokens": 4174042.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.060913100838661194,
+ "skip_count": 1.0,
+ "step": 2590,
+ "text_loss": 0.5365690588951111
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 12.169063692398003,
+ "f1_execute": 0.9583333134651184,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.0008988878887456093,
+ "loss": 0.0118,
+ "macro_f1": 0.6051587462425232,
+ "num_tokens": 4177666.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06268956512212753,
+ "skip_count": 4.0,
+ "step": 2592,
+ "text_loss": 0.226226806640625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.178456119753449,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0008987011880353149,
+ "loss": 0.0089,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 4180490.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030141465365886688,
+ "skip_count": 2.0,
+ "step": 2594,
+ "text_loss": 0.2581401765346527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 12.187848547108894,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0008985143345419729,
+ "loss": 0.0082,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4183300.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018745863810181618,
+ "skip_count": 2.0,
+ "step": 2596,
+ "text_loss": 0.7778542637825012
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 12.197240974464338,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0008983273283371862,
+ "loss": 0.0096,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4186535.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026792079210281372,
+ "skip_count": 2.0,
+ "step": 2598,
+ "text_loss": 0.34700271487236023
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0008981401694926159,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4189082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001914160675369203,
+ "skip_count": 0.0,
+ "step": 2600,
+ "text_loss": 0.6879339218139648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 12.216025829175228,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0008979528580799815,
+ "loss": 0.0136,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4192330.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007978348061442375,
+ "skip_count": 2.0,
+ "step": 2602,
+ "text_loss": 0.3524550497531891
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 12.225418256530672,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.0008977653941710613,
+ "loss": 0.0134,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4196117.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0035376469604671,
+ "skip_count": 0.0,
+ "step": 2604,
+ "text_loss": 0.42356348037719727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05810546875,
+ "learning_rate": 0.0008975777778376916,
+ "loss": 0.0156,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4200423.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008262477815151215,
+ "skip_count": 1.0,
+ "step": 2606,
+ "text_loss": 0.5272893905639648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.244203111241562,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0732421875,
+ "learning_rate": 0.0008973900091517675,
+ "loss": 0.0114,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4203257.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022957922890782356,
+ "skip_count": 1.0,
+ "step": 2608,
+ "text_loss": 0.2713734805583954
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 12.253595538597006,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043701171875,
+ "learning_rate": 0.000897202088185242,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4206243.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006623407825827599,
+ "skip_count": 2.0,
+ "step": 2610,
+ "text_loss": 0.5920525789260864
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.262987965952451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.0008970140150101274,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4209264.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008602747693657875,
+ "skip_count": 0.0,
+ "step": 2612,
+ "text_loss": 0.33421996235847473
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0008968257896984932,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4212058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024653903674334288,
+ "skip_count": 1.0,
+ "step": 2614,
+ "text_loss": 0.37923356890678406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0008966374123224677,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4214929.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010878405533730984,
+ "skip_count": 0.0,
+ "step": 2616,
+ "text_loss": 0.4350503981113434
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.291165248018785,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0008964488829542376,
+ "loss": 0.0083,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4219170.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02864212542772293,
+ "skip_count": 1.0,
+ "step": 2618,
+ "text_loss": 0.26250728964805603
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 12.300557675374229,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0008962602016660478,
+ "loss": 0.0096,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4222077.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010444172658026218,
+ "skip_count": 2.0,
+ "step": 2620,
+ "text_loss": 0.4718937575817108
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.309950102729674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 0.0008960713685302011,
+ "loss": 0.0105,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4225383.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006409442983567715,
+ "skip_count": 1.0,
+ "step": 2622,
+ "text_loss": 0.30420538783073425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.31934253008512,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0008958823836190588,
+ "loss": 0.005,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4228349.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009996986016631126,
+ "skip_count": 1.0,
+ "step": 2624,
+ "text_loss": 0.5392362475395203
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0008956932470050404,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4232007.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014383369125425816,
+ "skip_count": 0.0,
+ "step": 2626,
+ "text_loss": 0.7112401127815247
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 12.338127384796008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0008955039587606233,
+ "loss": 0.0109,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4235122.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00781513936817646,
+ "skip_count": 3.0,
+ "step": 2628,
+ "text_loss": 0.17802883684635162
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 12.347519812151454,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.0008953145189583429,
+ "loss": 0.0126,
+ "macro_f1": 0.542222261428833,
+ "num_tokens": 4238248.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.062252625823020935,
+ "skip_count": 4.0,
+ "step": 2630,
+ "text_loss": 0.5551572442054749
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0008951249276707933,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4241042.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011421777307987213,
+ "skip_count": 0.0,
+ "step": 2632,
+ "text_loss": 0.7092233896255493
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.366304666862343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07177734375,
+ "learning_rate": 0.0008949351849706261,
+ "loss": 0.0117,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4243939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032689040526747704,
+ "skip_count": 0.0,
+ "step": 2634,
+ "text_loss": 0.19925718009471893
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.375697094217786,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0008947452909305509,
+ "loss": 0.0109,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4247535.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002066014800220728,
+ "skip_count": 0.0,
+ "step": 2636,
+ "text_loss": 0.5249715447425842
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 29.0,
+ "epoch": 12.385089521573232,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.09326171875,
+ "learning_rate": 0.0008945552456233356,
+ "loss": 0.0169,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 4251441.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.029332537204027176,
+ "skip_count": 2.0,
+ "step": 2638,
+ "text_loss": 0.19229578971862793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.394481948928677,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.078125,
+ "learning_rate": 0.0008943650491218058,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4254314.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0075911120511591434,
+ "skip_count": 0.0,
+ "step": 2640,
+ "text_loss": 0.27059751749038696
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.0008941747014988453,
+ "loss": 0.0156,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4257442.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009030844084918499,
+ "skip_count": 0.0,
+ "step": 2642,
+ "text_loss": 0.36747801303863525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.123046875,
+ "learning_rate": 0.0008939842028273956,
+ "loss": 0.0112,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4260386.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007844001986086369,
+ "skip_count": 1.0,
+ "step": 2644,
+ "text_loss": 0.6397647857666016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.422659230995011,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.0008937935531804562,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4263516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018789108144119382,
+ "skip_count": 0.0,
+ "step": 2646,
+ "text_loss": 0.4795534908771515
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.432051658350455,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 0.0008936027526310844,
+ "loss": 0.0098,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4266744.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0348590686917305,
+ "skip_count": 1.0,
+ "step": 2648,
+ "text_loss": 0.27691999077796936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.000893411801252395,
+ "loss": 0.015,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4269766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004543309565633535,
+ "skip_count": 1.0,
+ "step": 2650,
+ "text_loss": 0.18867231905460358
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0008932206991175615,
+ "loss": 0.0141,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4273513.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035277456045150757,
+ "skip_count": 1.0,
+ "step": 2652,
+ "text_loss": 0.45613357424736023
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.460228940416789,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055908203125,
+ "learning_rate": 0.0008930294462998143,
+ "loss": 0.015,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4276878.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011337592266499996,
+ "skip_count": 0.0,
+ "step": 2654,
+ "text_loss": 0.24733254313468933
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0869140625,
+ "learning_rate": 0.0008928380428724419,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4279915.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010295971296727657,
+ "skip_count": 1.0,
+ "step": 2656,
+ "text_loss": 0.41722849011421204
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 12.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0008926464889087903,
+ "loss": 0.0116,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4282888.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017198545392602682,
+ "skip_count": 2.0,
+ "step": 2658,
+ "text_loss": 0.738322377204895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.488406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0008924547844822634,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4285805.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001339946174994111,
+ "skip_count": 0.0,
+ "step": 2660,
+ "text_loss": 0.4802379906177521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.497798649838568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05322265625,
+ "learning_rate": 0.000892262929666323,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4290282.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022340165451169014,
+ "skip_count": 0.0,
+ "step": 2662,
+ "text_loss": 0.6503544449806213
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0008920709245344878,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4294106.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005288850050419569,
+ "skip_count": 1.0,
+ "step": 2664,
+ "text_loss": 0.12312037497758865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.516583504549457,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0008918787691603347,
+ "loss": 0.0121,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4298013.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004259659443050623,
+ "skip_count": 1.0,
+ "step": 2666,
+ "text_loss": 0.3070000112056732
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.000891686463617498,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4300799.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009489355608820915,
+ "skip_count": 1.0,
+ "step": 2668,
+ "text_loss": 0.18535588681697845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055908203125,
+ "learning_rate": 0.0008914940079796696,
+ "loss": 0.0114,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4304641.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025417013093829155,
+ "skip_count": 0.0,
+ "step": 2670,
+ "text_loss": 0.482585072517395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.544760786615791,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0008913014023205988,
+ "loss": 0.0108,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4307462.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006371749565005302,
+ "skip_count": 0.0,
+ "step": 2672,
+ "text_loss": 0.7064456939697266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0008911086467140925,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4310396.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027512952219694853,
+ "skip_count": 0.0,
+ "step": 2674,
+ "text_loss": 0.23532851040363312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05712890625,
+ "learning_rate": 0.000890915741234015,
+ "loss": 0.0133,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4314781.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008253013715147972,
+ "skip_count": 1.0,
+ "step": 2676,
+ "text_loss": 0.30950358510017395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 12.572938068682125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0008907226859542879,
+ "loss": 0.0105,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4317988.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005409995559602976,
+ "skip_count": 2.0,
+ "step": 2678,
+ "text_loss": 0.4930732846260071
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 12.582330496037569,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.060546875,
+ "learning_rate": 0.0008905294809488907,
+ "loss": 0.0084,
+ "macro_f1": 1.0,
+ "num_tokens": 4321014.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0029942214023321867,
+ "skip_count": 1.0,
+ "step": 2680,
+ "text_loss": 0.6224040389060974
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06982421875,
+ "learning_rate": 0.0008903361262918595,
+ "loss": 0.0115,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4324268.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008411120623350143,
+ "skip_count": 1.0,
+ "step": 2682,
+ "text_loss": 0.16296671330928802
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 12.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05126953125,
+ "learning_rate": 0.0008901426220572884,
+ "loss": 0.0138,
+ "macro_f1": 1.0,
+ "num_tokens": 4327494.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01039006095379591,
+ "skip_count": 4.0,
+ "step": 2684,
+ "text_loss": 0.43866512179374695
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.610507778103903,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060791015625,
+ "learning_rate": 0.0008899489683193286,
+ "loss": 0.0107,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4330936.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009329111780971289,
+ "skip_count": 0.0,
+ "step": 2686,
+ "text_loss": 0.44250962138175964
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.619900205459349,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07421875,
+ "learning_rate": 0.0008897551651521885,
+ "loss": 0.0111,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4334123.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003197216661646962,
+ "skip_count": 0.0,
+ "step": 2688,
+ "text_loss": 0.48313501477241516
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.629292632814794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09716796875,
+ "learning_rate": 0.0008895612126301339,
+ "loss": 0.0157,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4337610.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033548236824572086,
+ "skip_count": 0.0,
+ "step": 2690,
+ "text_loss": 0.4715327322483063
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.638685060170237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0008893671108274877,
+ "loss": 0.0118,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4341026.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024757643695920706,
+ "skip_count": 0.0,
+ "step": 2692,
+ "text_loss": 0.43402785062789917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0008891728598186302,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4344422.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003317243419587612,
+ "skip_count": 0.0,
+ "step": 2694,
+ "text_loss": 0.8498559594154358
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 12.657469914881126,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0008889784596779986,
+ "loss": 0.009,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 4347507.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01577926240861416,
+ "skip_count": 3.0,
+ "step": 2696,
+ "text_loss": 0.5646669864654541
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.666862342236572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11328125,
+ "learning_rate": 0.0008887839104800876,
+ "loss": 0.0124,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4350414.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002953822258859873,
+ "skip_count": 0.0,
+ "step": 2698,
+ "text_loss": 0.5145012140274048
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 0.0008885892122994486,
+ "loss": 0.0112,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4354110.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005849295295774937,
+ "skip_count": 0.0,
+ "step": 2700,
+ "text_loss": 0.580982506275177
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.0008883943652106903,
+ "loss": 0.0086,
+ "macro_f1": 1.0,
+ "num_tokens": 4357323.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012347398325800896,
+ "skip_count": 2.0,
+ "step": 2702,
+ "text_loss": 0.2234988808631897
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.695039624302906,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 0.0008881993692884787,
+ "loss": 0.0128,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4360228.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003574999049305916,
+ "skip_count": 1.0,
+ "step": 2704,
+ "text_loss": 0.4261806607246399
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.704432051658351,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0008880042246075365,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4363905.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031574300955981016,
+ "skip_count": 0.0,
+ "step": 2706,
+ "text_loss": 0.691118061542511
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.0008878089312426433,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4366736.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003195564029738307,
+ "skip_count": 0.0,
+ "step": 2708,
+ "text_loss": 0.613926112651825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6000000238418579,
+ "avg_layers": 25.0,
+ "epoch": 12.72321690636924,
+ "f1_execute": 0.9583333134651184,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.75,
+ "grad_norm": 0.054443359375,
+ "learning_rate": 0.0008876134892686363,
+ "loss": 0.011,
+ "macro_f1": 0.5694444179534912,
+ "num_tokens": 4370146.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.038784291595220566,
+ "skip_count": 5.0,
+ "step": 2710,
+ "text_loss": 0.2723451852798462
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0830078125,
+ "learning_rate": 0.000887417898760409,
+ "loss": 0.0126,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4373653.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006457131239585578,
+ "skip_count": 0.0,
+ "step": 2712,
+ "text_loss": 0.31667640805244446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.742001761080129,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10498046875,
+ "learning_rate": 0.000887222159792912,
+ "loss": 0.0155,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 4376993.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.045078590512275696,
+ "skip_count": 1.0,
+ "step": 2714,
+ "text_loss": 0.5872798562049866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.751394188435574,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0008870262724411528,
+ "loss": 0.012,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4380160.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003628545207902789,
+ "skip_count": 0.0,
+ "step": 2716,
+ "text_loss": 0.7468157410621643
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 12.760786615791018,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11181640625,
+ "learning_rate": 0.0008868302367801962,
+ "loss": 0.0118,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 4383100.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05404464527964592,
+ "skip_count": 3.0,
+ "step": 2718,
+ "text_loss": 0.2970244884490967
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0008866340528851629,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4386700.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007000274024903774,
+ "skip_count": 0.0,
+ "step": 2720,
+ "text_loss": 0.34521186351776123
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 12.779571470501908,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 0.0008864377208312313,
+ "loss": 0.0082,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 4390299.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02025366574525833,
+ "skip_count": 2.0,
+ "step": 2722,
+ "text_loss": 1.0536936521530151
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.788963897857352,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.000886241240693636,
+ "loss": 0.0098,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4393353.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00251673418097198,
+ "skip_count": 0.0,
+ "step": 2724,
+ "text_loss": 0.5678093433380127
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.0008860446125476686,
+ "loss": 0.0135,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4396446.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009532532654702663,
+ "skip_count": 0.0,
+ "step": 2726,
+ "text_loss": 0.23775041103363037
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.807748752568243,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.091796875,
+ "learning_rate": 0.0008858478364686776,
+ "loss": 0.0099,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4399977.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008062181062996387,
+ "skip_count": 0.0,
+ "step": 2728,
+ "text_loss": 0.18888695538043976
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.817141179923686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0008856509125320678,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4404406.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007731119985692203,
+ "skip_count": 0.0,
+ "step": 2730,
+ "text_loss": 0.47331541776657104
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.826533607279131,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.0008854538408133006,
+ "loss": 0.0114,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4407165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003115242812782526,
+ "skip_count": 1.0,
+ "step": 2732,
+ "text_loss": 0.491370290517807
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0008852566213878947,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4410101.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008958528051152825,
+ "skip_count": 0.0,
+ "step": 2734,
+ "text_loss": 0.42188262939453125
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 12.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07763671875,
+ "learning_rate": 0.0008850592543314246,
+ "loss": 0.0118,
+ "macro_f1": 1.0,
+ "num_tokens": 4413015.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01139112375676632,
+ "skip_count": 1.0,
+ "step": 2736,
+ "text_loss": 0.4716498553752899
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.854710889345466,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0008848617397195218,
+ "loss": 0.0084,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 4416404.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01609630137681961,
+ "skip_count": 1.0,
+ "step": 2738,
+ "text_loss": 0.19490821659564972
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0008846640776278745,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4419408.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001489170710556209,
+ "skip_count": 0.0,
+ "step": 2740,
+ "text_loss": 0.6443108320236206
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.873495744056354,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0008844662681322269,
+ "loss": 0.0144,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4422067.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0014755792217329144,
+ "skip_count": 0.0,
+ "step": 2742,
+ "text_loss": 0.9150356650352478
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.0008842683113083801,
+ "loss": 0.0149,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4425647.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008962674997746944,
+ "skip_count": 1.0,
+ "step": 2744,
+ "text_loss": 0.7103227972984314
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 12.892280598767243,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0008840702072321915,
+ "loss": 0.0104,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 4428855.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02554207295179367,
+ "skip_count": 3.0,
+ "step": 2746,
+ "text_loss": 0.27141591906547546
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.901673026122689,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.0008838719559795751,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4432838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011747616808861494,
+ "skip_count": 0.0,
+ "step": 2748,
+ "text_loss": 0.4007738530635834
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 12.911065453478134,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0008836735576265009,
+ "loss": 0.0073,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4435793.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017564335837960243,
+ "skip_count": 2.0,
+ "step": 2750,
+ "text_loss": 0.5972410440444946
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044921875,
+ "learning_rate": 0.0008834750122489956,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4438871.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007004009559750557,
+ "skip_count": 0.0,
+ "step": 2752,
+ "text_loss": 0.2294853925704956
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.929850308189023,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0008832763199231423,
+ "loss": 0.0107,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4441846.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014562139986082911,
+ "skip_count": 0.0,
+ "step": 2754,
+ "text_loss": 0.722432017326355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.939242735544468,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0008830774807250802,
+ "loss": 0.013,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4444786.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.024773593991994858,
+ "skip_count": 0.0,
+ "step": 2756,
+ "text_loss": 0.507905125617981
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 12.948635162899912,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0008828784947310049,
+ "loss": 0.0129,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 4448442.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04959975928068161,
+ "skip_count": 2.0,
+ "step": 2758,
+ "text_loss": 0.3617522418498993
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 12.958027590255357,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1025390625,
+ "learning_rate": 0.000882679362017168,
+ "loss": 0.0149,
+ "macro_f1": 1.0,
+ "num_tokens": 4451401.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005783245898783207,
+ "skip_count": 2.0,
+ "step": 2760,
+ "text_loss": 0.49187400937080383
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0791015625,
+ "learning_rate": 0.0008824800826598778,
+ "loss": 0.0127,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4454537.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00656260596588254,
+ "skip_count": 0.0,
+ "step": 2762,
+ "text_loss": 0.6823583245277405
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 12.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0546875,
+ "learning_rate": 0.0008822806567354983,
+ "loss": 0.0111,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4457706.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005298966076225042,
+ "skip_count": 0.0,
+ "step": 2764,
+ "text_loss": 0.554322361946106
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.986204872321691,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 0.0008820810843204501,
+ "loss": 0.0096,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4460710.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03164982795715332,
+ "skip_count": 1.0,
+ "step": 2766,
+ "text_loss": 0.1656961441040039
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 12.995597299677135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.072265625,
+ "learning_rate": 0.0008818813654912095,
+ "loss": 0.0162,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4464001.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000715116853825748,
+ "skip_count": 0.0,
+ "step": 2768,
+ "text_loss": 0.5818144083023071
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.004696213677722,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 0.0008816815003243093,
+ "loss": 0.0133,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4467364.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002851625671610236,
+ "skip_count": 0.0,
+ "step": 2770,
+ "text_loss": 0.6068631410598755
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.014088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0008814814888963383,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4470681.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004729873035103083,
+ "skip_count": 1.0,
+ "step": 2772,
+ "text_loss": 0.5386646389961243
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.000881281331283941,
+ "loss": 0.0091,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4473734.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031853127293288708,
+ "skip_count": 1.0,
+ "step": 2774,
+ "text_loss": 0.5695263147354126
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0008810810275638182,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4478404.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008977465913631022,
+ "skip_count": 0.0,
+ "step": 2776,
+ "text_loss": 0.4750773310661316
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.042265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0008808805778127269,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4481287.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00469845999032259,
+ "skip_count": 0.0,
+ "step": 2778,
+ "text_loss": 0.14078612625598907
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 13.051658350454945,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0008806799821074796,
+ "loss": 0.0079,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4483929.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01789761893451214,
+ "skip_count": 2.0,
+ "step": 2780,
+ "text_loss": 0.2167191207408905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 0.0008804792405249451,
+ "loss": 0.0123,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4487468.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001018838956952095,
+ "skip_count": 0.0,
+ "step": 2782,
+ "text_loss": 0.5424665212631226
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 28.0,
+ "epoch": 13.070443205165835,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.000880278353142048,
+ "loss": 0.0077,
+ "macro_f1": 0.8200000524520874,
+ "num_tokens": 4490942.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03260354697704315,
+ "skip_count": 3.0,
+ "step": 2784,
+ "text_loss": 0.20994654297828674
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.079835632521279,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05322265625,
+ "learning_rate": 0.0008800773200357683,
+ "loss": 0.0122,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4493986.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003019835101440549,
+ "skip_count": 0.0,
+ "step": 2786,
+ "text_loss": 0.5709528923034668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0008798761412831429,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4498232.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00285192858427763,
+ "skip_count": 0.0,
+ "step": 2788,
+ "text_loss": 0.5103896260261536
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044921875,
+ "learning_rate": 0.0008796748169612634,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4501231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012469831854104996,
+ "skip_count": 0.0,
+ "step": 2790,
+ "text_loss": 0.43669697642326355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.108012914587613,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0008794733471472778,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4504208.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011512776836752892,
+ "skip_count": 1.0,
+ "step": 2792,
+ "text_loss": 0.2299770563840866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.117405341943059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0008792717319183899,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4507013.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00834917277097702,
+ "skip_count": 0.0,
+ "step": 2794,
+ "text_loss": 0.2130603939294815
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.126797769298504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0008790699713518587,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4510286.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008616939187049866,
+ "skip_count": 2.0,
+ "step": 2796,
+ "text_loss": 0.4377101957798004
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.136190196653947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0008788680655249994,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4513762.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003408568911254406,
+ "skip_count": 0.0,
+ "step": 2798,
+ "text_loss": 0.435138463973999
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 13.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0008786660145151826,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4516696.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0029398901388049126,
+ "skip_count": 0.0,
+ "step": 2800,
+ "text_loss": 0.3195655047893524
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0008784638183998348,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4519760.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013777425047010183,
+ "skip_count": 0.0,
+ "step": 2802,
+ "text_loss": 0.8129430413246155
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.164367478720282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0008782614772564379,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4522106.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031694830395281315,
+ "skip_count": 0.0,
+ "step": 2804,
+ "text_loss": 0.18083660304546356
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0008780589911625293,
+ "loss": 0.0114,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4525743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002161208540201187,
+ "skip_count": 0.0,
+ "step": 2806,
+ "text_loss": 0.8228182792663574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07177734375,
+ "learning_rate": 0.0008778563601957021,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4529573.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028444856870919466,
+ "skip_count": 1.0,
+ "step": 2808,
+ "text_loss": 0.3715563118457794
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.192544760786616,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0008776535844336049,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4532452.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003807213855907321,
+ "skip_count": 0.0,
+ "step": 2810,
+ "text_loss": 0.6012523174285889
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.201937188142061,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0008774506639539417,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4536077.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006698979996144772,
+ "skip_count": 0.0,
+ "step": 2812,
+ "text_loss": 0.27097949385643005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.099609375,
+ "learning_rate": 0.0008772475988344722,
+ "loss": 0.013,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4539057.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004849409218877554,
+ "skip_count": 1.0,
+ "step": 2814,
+ "text_loss": 1.026973843574524
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 13.22072204285295,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0008770443891530109,
+ "loss": 0.0115,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 4542253.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019148651510477066,
+ "skip_count": 3.0,
+ "step": 2816,
+ "text_loss": 0.2717585563659668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.230114470208395,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 0.0008768410349874286,
+ "loss": 0.0098,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 4545047.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02231316640973091,
+ "skip_count": 2.0,
+ "step": 2818,
+ "text_loss": 0.274346262216568
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.239506897563839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0008766375364156508,
+ "loss": 0.0091,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4548371.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008014129474759102,
+ "skip_count": 2.0,
+ "step": 2820,
+ "text_loss": 0.22850871086120605
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.248899324919284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.0008764338935156586,
+ "loss": 0.0095,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4551276.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014544493751600385,
+ "skip_count": 0.0,
+ "step": 2822,
+ "text_loss": 0.6308462023735046
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 13.258291752274728,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.000876230106365488,
+ "loss": 0.0123,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4554143.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00818584579974413,
+ "skip_count": 3.0,
+ "step": 2824,
+ "text_loss": 0.3484207093715668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 13.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0008760261750432312,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4557256.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006275608204305172,
+ "skip_count": 3.0,
+ "step": 2826,
+ "text_loss": 0.1927330046892166
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 13.277076606985618,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0008758220996270348,
+ "loss": 0.0103,
+ "macro_f1": 1.0,
+ "num_tokens": 4560202.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0055974251590669155,
+ "skip_count": 2.0,
+ "step": 2828,
+ "text_loss": 0.7796496748924255
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.286469034341062,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.0008756178801951007,
+ "loss": 0.0129,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4563508.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019799957517534494,
+ "skip_count": 0.0,
+ "step": 2830,
+ "text_loss": 0.49633297324180603
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 0.0008754135168256865,
+ "loss": 0.0095,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4566776.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004538947716355324,
+ "skip_count": 0.0,
+ "step": 2832,
+ "text_loss": 0.5346745252609253
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.305253889051952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0008752090095971044,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4569787.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001663343166001141,
+ "skip_count": 0.0,
+ "step": 2834,
+ "text_loss": 0.5524004697799683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.314646316407396,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.000875004358587722,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4572813.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022988212294876575,
+ "skip_count": 0.0,
+ "step": 2836,
+ "text_loss": 0.4232870042324066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.324038743762841,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.000874799563875962,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4575563.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007781553082168102,
+ "skip_count": 1.0,
+ "step": 2838,
+ "text_loss": 0.19239822030067444
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 13.333431171118287,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0008745946255403021,
+ "loss": 0.0072,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4578117.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01872488670051098,
+ "skip_count": 2.0,
+ "step": 2840,
+ "text_loss": 0.2148810178041458
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 13.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0008743895436592749,
+ "loss": 0.0078,
+ "macro_f1": 1.0,
+ "num_tokens": 4582330.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005634195636957884,
+ "skip_count": 1.0,
+ "step": 2842,
+ "text_loss": 0.4929640591144562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048583984375,
+ "learning_rate": 0.0008741843183114685,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4585765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008928569150157273,
+ "skip_count": 0.0,
+ "step": 2844,
+ "text_loss": 0.32702967524528503
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 13.361608453184619,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.0008739789495755253,
+ "loss": 0.0094,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4589000.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014715569093823433,
+ "skip_count": 4.0,
+ "step": 2846,
+ "text_loss": 0.25125816464424133
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.371000880540064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0008737734375301433,
+ "loss": 0.0135,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4592391.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017551190685480833,
+ "skip_count": 0.0,
+ "step": 2848,
+ "text_loss": 0.6595172882080078
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0008735677822540749,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4596662.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006456313421949744,
+ "skip_count": 0.0,
+ "step": 2850,
+ "text_loss": 0.6290773153305054
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0008733619838261276,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4599682.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00765060493722558,
+ "skip_count": 2.0,
+ "step": 2852,
+ "text_loss": 0.3268161416053772
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.399178162606399,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0008731560423251637,
+ "loss": 0.01,
+ "macro_f1": 1.0,
+ "num_tokens": 4603324.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01161442045122385,
+ "skip_count": 2.0,
+ "step": 2854,
+ "text_loss": 0.3029932975769043
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 25.0,
+ "epoch": 13.408570589961844,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.0008729499578301005,
+ "loss": 0.0098,
+ "macro_f1": 0.9555556178092957,
+ "num_tokens": 4606975.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02055389992892742,
+ "skip_count": 5.0,
+ "step": 2856,
+ "text_loss": 0.6268532872200012
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.00087274373041991,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4609629.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013911726418882608,
+ "skip_count": 0.0,
+ "step": 2858,
+ "text_loss": 0.534355640411377
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 13.427355444672733,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0008725373601736188,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4612913.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01010701060295105,
+ "skip_count": 0.0,
+ "step": 2860,
+ "text_loss": 0.3391380310058594
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.0008723308471703085,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4616718.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005969462916254997,
+ "skip_count": 1.0,
+ "step": 2862,
+ "text_loss": 0.47250816226005554
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.446140299383622,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 0.0008721241914891152,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4619680.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027780034579336643,
+ "skip_count": 0.0,
+ "step": 2864,
+ "text_loss": 0.3249278664588928
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.455532726739067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.0008719173932092295,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4622700.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015912104863673449,
+ "skip_count": 0.0,
+ "step": 2866,
+ "text_loss": 0.7789985537528992
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05126953125,
+ "learning_rate": 0.0008717104524098973,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4626637.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036539011634886265,
+ "skip_count": 0.0,
+ "step": 2868,
+ "text_loss": 0.619088351726532
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10400390625,
+ "learning_rate": 0.0008715033691704187,
+ "loss": 0.0118,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4629863.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008402476087212563,
+ "skip_count": 1.0,
+ "step": 2870,
+ "text_loss": 0.5550018548965454
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.483710008805401,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0008712961435701479,
+ "loss": 0.0161,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4632657.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01400839351117611,
+ "skip_count": 1.0,
+ "step": 2872,
+ "text_loss": 0.17368625104427338
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.493102436160845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.0008710887756884947,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4635885.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014573842054232955,
+ "skip_count": 0.0,
+ "step": 2874,
+ "text_loss": 0.5138643383979797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0008708812656049225,
+ "loss": 0.009,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4639341.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002810224425047636,
+ "skip_count": 1.0,
+ "step": 2876,
+ "text_loss": 0.70310378074646
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 27.0,
+ "epoch": 13.511887290871735,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0008706736133989497,
+ "loss": 0.0105,
+ "macro_f1": 0.9449735879898071,
+ "num_tokens": 4642163.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.029783209785819054,
+ "skip_count": 4.0,
+ "step": 2878,
+ "text_loss": 0.26898008584976196
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.521279718227179,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0008704658191501491,
+ "loss": 0.0095,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4645858.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009193966398015618,
+ "skip_count": 0.0,
+ "step": 2880,
+ "text_loss": 0.6047570705413818
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 13.530672145582624,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.0008702578829381475,
+ "loss": 0.0131,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 4649237.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.05698608607053757,
+ "skip_count": 4.0,
+ "step": 2882,
+ "text_loss": 0.10695219784975052
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0008700498048426269,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4652362.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011786938412114978,
+ "skip_count": 0.0,
+ "step": 2884,
+ "text_loss": 0.4442957937717438
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 13.549457000293513,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.0008698415849433229,
+ "loss": 0.0092,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4655616.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.02142646163702011,
+ "skip_count": 0.0,
+ "step": 2886,
+ "text_loss": 0.5820964574813843
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.558849427648958,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0008696332233200262,
+ "loss": 0.0121,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4659294.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004038636106997728,
+ "skip_count": 0.0,
+ "step": 2888,
+ "text_loss": 0.11847645789384842
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 0.0008694247200525806,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4662512.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013256469974294305,
+ "skip_count": 0.0,
+ "step": 2890,
+ "text_loss": 0.4873582720756531
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.577634282359847,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0008692160752208856,
+ "loss": 0.0129,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4666190.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04477972164750099,
+ "skip_count": 1.0,
+ "step": 2892,
+ "text_loss": 0.44243401288986206
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.09521484375,
+ "learning_rate": 0.0008690072889048941,
+ "loss": 0.0127,
+ "macro_f1": 1.0,
+ "num_tokens": 4668884.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004407547414302826,
+ "skip_count": 2.0,
+ "step": 2894,
+ "text_loss": 0.6847127079963684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0008687983611846133,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4672093.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005245382897555828,
+ "skip_count": 1.0,
+ "step": 2896,
+ "text_loss": 0.25583332777023315
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.605811564426181,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 0.0008685892921401049,
+ "loss": 0.0108,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4674917.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010470855049788952,
+ "skip_count": 0.0,
+ "step": 2898,
+ "text_loss": 0.41998377442359924
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0008683800818514844,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4677739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009026622399687767,
+ "skip_count": 2.0,
+ "step": 2900,
+ "text_loss": 0.303053081035614
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.09619140625,
+ "learning_rate": 0.0008681707303989215,
+ "loss": 0.0108,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4680721.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004500916693359613,
+ "skip_count": 0.0,
+ "step": 2902,
+ "text_loss": 0.5573288798332214
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.633988846492516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06982421875,
+ "learning_rate": 0.0008679612378626404,
+ "loss": 0.0098,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4683339.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005047840531915426,
+ "skip_count": 1.0,
+ "step": 2904,
+ "text_loss": 0.321353554725647
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.643381273847961,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0008677516043229187,
+ "loss": 0.0083,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4686453.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010256914421916008,
+ "skip_count": 1.0,
+ "step": 2906,
+ "text_loss": 0.4300784468650818
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 13.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 0.0008675418298600883,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4689645.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0022669637110084295,
+ "skip_count": 0.0,
+ "step": 2908,
+ "text_loss": 0.5064885020256042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0008673319145545358,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4692320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011188550852239132,
+ "skip_count": 0.0,
+ "step": 2910,
+ "text_loss": 0.7114819884300232
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.671558555914293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0008671218584867003,
+ "loss": 0.0102,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4695116.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002966561820358038,
+ "skip_count": 2.0,
+ "step": 2912,
+ "text_loss": 0.5662392973899841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.680950983269739,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0008669116617370762,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4698040.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012894890969619155,
+ "skip_count": 0.0,
+ "step": 2914,
+ "text_loss": 0.718977689743042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.690343410625184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1552734375,
+ "learning_rate": 0.0008667013243862111,
+ "loss": 0.0162,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4700963.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007232456118799746,
+ "skip_count": 0.0,
+ "step": 2916,
+ "text_loss": 0.3447718024253845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.699735837980628,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.000866490846514707,
+ "loss": 0.0075,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4704471.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015166680328547955,
+ "skip_count": 0.0,
+ "step": 2918,
+ "text_loss": 0.454946368932724
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 13.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 0.000866280228203219,
+ "loss": 0.0073,
+ "macro_f1": 1.0,
+ "num_tokens": 4707238.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0061312485486269,
+ "skip_count": 1.0,
+ "step": 2920,
+ "text_loss": 0.721788227558136
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.718520692691518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055908203125,
+ "learning_rate": 0.0008660694695324564,
+ "loss": 0.0125,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4711323.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00169933564029634,
+ "skip_count": 0.0,
+ "step": 2922,
+ "text_loss": 0.7562121748924255
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.727913120046962,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0008658585705831829,
+ "loss": 0.0128,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4714417.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022731393110007048,
+ "skip_count": 0.0,
+ "step": 2924,
+ "text_loss": 0.5726147890090942
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.737305547402407,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.0008656475314362148,
+ "loss": 0.0131,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 4717445.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.06477782875299454,
+ "skip_count": 3.0,
+ "step": 2926,
+ "text_loss": 0.4505867660045624
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 27.0,
+ "epoch": 13.74669797475785,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0008654363521724229,
+ "loss": 0.0129,
+ "macro_f1": 0.9449735879898071,
+ "num_tokens": 4722253.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.027405790984630585,
+ "skip_count": 4.0,
+ "step": 2928,
+ "text_loss": 0.24767601490020752
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.756090402113296,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0537109375,
+ "learning_rate": 0.0008652250328727315,
+ "loss": 0.0112,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4725465.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006544729229062796,
+ "skip_count": 2.0,
+ "step": 2930,
+ "text_loss": 0.4478724002838135
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 13.765482829468741,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.0008650135736181184,
+ "loss": 0.0134,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4729213.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0055119614116847515,
+ "skip_count": 0.0,
+ "step": 2932,
+ "text_loss": 0.6749323010444641
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 0.0008648019744896154,
+ "loss": 0.0101,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4732280.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008374541997909546,
+ "skip_count": 0.0,
+ "step": 2934,
+ "text_loss": 0.4647359251976013
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 13.78426768417963,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 0.0008645902355683077,
+ "loss": 0.0091,
+ "macro_f1": 0.6595745086669922,
+ "num_tokens": 4736244.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.068686343729496,
+ "skip_count": 4.0,
+ "step": 2936,
+ "text_loss": 0.5356017351150513
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 13.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0008643783569353339,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4739810.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.017954571172595024,
+ "skip_count": 0.0,
+ "step": 2938,
+ "text_loss": 0.3145926296710968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.803052538890519,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.054443359375,
+ "learning_rate": 0.0008641663386718863,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4742720.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006261351052671671,
+ "skip_count": 1.0,
+ "step": 2940,
+ "text_loss": 0.3200613856315613
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 13.812444966245964,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0008639541808592109,
+ "loss": 0.0093,
+ "macro_f1": 1.0,
+ "num_tokens": 4745870.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0025341357104480267,
+ "skip_count": 1.0,
+ "step": 2942,
+ "text_loss": 0.5020416378974915
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0008637418835786067,
+ "loss": 0.0094,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4748943.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008970048278570175,
+ "skip_count": 2.0,
+ "step": 2944,
+ "text_loss": 0.14517110586166382
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055908203125,
+ "learning_rate": 0.0008635294469114265,
+ "loss": 0.0112,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4751360.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002133632078766823,
+ "skip_count": 0.0,
+ "step": 2946,
+ "text_loss": 0.5367856025695801
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.840622248312298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08837890625,
+ "learning_rate": 0.0008633168709390766,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4754403.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011866620043292642,
+ "skip_count": 0.0,
+ "step": 2948,
+ "text_loss": 0.38302522897720337
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 13.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0008631041557430163,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4757867.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0026854004245251417,
+ "skip_count": 0.0,
+ "step": 2950,
+ "text_loss": 0.43433454632759094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05859375,
+ "learning_rate": 0.0008628913014047585,
+ "loss": 0.0102,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4761171.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002433479530736804,
+ "skip_count": 0.0,
+ "step": 2952,
+ "text_loss": 0.4725971519947052
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.868799530378633,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0008626783080058696,
+ "loss": 0.0066,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4764752.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017182493582367897,
+ "skip_count": 0.0,
+ "step": 2954,
+ "text_loss": 0.460641473531723
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.878191957734076,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.12353515625,
+ "learning_rate": 0.0008624651756279687,
+ "loss": 0.0198,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4767453.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018134774873033166,
+ "skip_count": 0.0,
+ "step": 2956,
+ "text_loss": 0.4091459810733795
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 13.887584385089522,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053466796875,
+ "learning_rate": 0.000862251904352729,
+ "loss": 0.0108,
+ "macro_f1": 0.9259259104728699,
+ "num_tokens": 4771110.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.0365753099322319,
+ "skip_count": 3.0,
+ "step": 2958,
+ "text_loss": 0.22408585250377655
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.896976812444967,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 0.000862038494261876,
+ "loss": 0.0109,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4774464.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.024343067780137062,
+ "skip_count": 1.0,
+ "step": 2960,
+ "text_loss": 0.16483014822006226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0008618249454371891,
+ "loss": 0.01,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4777894.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008310087723657489,
+ "skip_count": 0.0,
+ "step": 2962,
+ "text_loss": 0.5573428869247437
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0008616112579605006,
+ "loss": 0.0117,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4781116.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0065494864247739315,
+ "skip_count": 0.0,
+ "step": 2964,
+ "text_loss": 0.18816794455051422
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.925154094511301,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.0008613974319136957,
+ "loss": 0.009,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4784886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019726944155991077,
+ "skip_count": 0.0,
+ "step": 2966,
+ "text_loss": 0.5097305774688721
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.934546521866745,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0849609375,
+ "learning_rate": 0.0008611834673787134,
+ "loss": 0.0118,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4787563.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006327496841549873,
+ "skip_count": 0.0,
+ "step": 2968,
+ "text_loss": 0.6953814029693604
+ },
+ {
+ "acc_repeat": 0.3333333432674408,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 13.94393894922219,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 0.5,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056884765625,
+ "learning_rate": 0.0008609693644375449,
+ "loss": 0.0086,
+ "macro_f1": 0.8200000524520874,
+ "num_tokens": 4790421.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.042896661907434464,
+ "skip_count": 1.0,
+ "step": 2970,
+ "text_loss": 0.2573051154613495
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 13.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.14453125,
+ "learning_rate": 0.000860755123172235,
+ "loss": 0.0096,
+ "macro_f1": 1.0,
+ "num_tokens": 4793786.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.013228793628513813,
+ "skip_count": 1.0,
+ "step": 2972,
+ "text_loss": 0.46614497900009155
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 13.962723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.0008605407436648815,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4796864.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007294759154319763,
+ "skip_count": 2.0,
+ "step": 2974,
+ "text_loss": 0.21555091440677643
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 13.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.057861328125,
+ "learning_rate": 0.0008603262259976348,
+ "loss": 0.0129,
+ "macro_f1": 1.0,
+ "num_tokens": 4800080.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024024227168411016,
+ "skip_count": 5.0,
+ "step": 2976,
+ "text_loss": 0.7855485081672668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0008601115702526987,
+ "loss": 0.0113,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4802899.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001433031284250319,
+ "skip_count": 0.0,
+ "step": 2978,
+ "text_loss": 0.6777765154838562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 13.990901085999413,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.0008598967765123293,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4805835.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003073975909501314,
+ "skip_count": 0.0,
+ "step": 2980,
+ "text_loss": 0.5926910638809204
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 26.0,
+ "epoch": 14.0,
+ "f1_execute": 0.9333333373069763,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.05322265625,
+ "learning_rate": 0.0008596818448588364,
+ "loss": 0.0139,
+ "macro_f1": 0.8666667342185974,
+ "num_tokens": 4809028.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06438573449850082,
+ "skip_count": 6.0,
+ "step": 2982,
+ "text_loss": 0.23975612223148346
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.009392427355445,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0008594667753745821,
+ "loss": 0.0054,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 4812831.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014817612245678902,
+ "skip_count": 1.0,
+ "step": 2984,
+ "text_loss": 0.17292268574237823
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 14.018784854710889,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07421875,
+ "learning_rate": 0.0008592515681419813,
+ "loss": 0.0078,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4816005.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.025407327339053154,
+ "skip_count": 0.0,
+ "step": 2986,
+ "text_loss": 0.6403061151504517
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0008590362232435018,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4818901.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006826757453382015,
+ "skip_count": 0.0,
+ "step": 2988,
+ "text_loss": 0.2572069466114044
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0008588207407616644,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4823120.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009054148104041815,
+ "skip_count": 0.0,
+ "step": 2990,
+ "text_loss": 0.4827076196670532
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.046962136777223,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0008586051207790422,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4825774.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012294676853343844,
+ "skip_count": 0.0,
+ "step": 2992,
+ "text_loss": 0.40157821774482727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 14.056354564132668,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.052734375,
+ "learning_rate": 0.0008583893633782612,
+ "loss": 0.0084,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 4828841.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011474622413516045,
+ "skip_count": 2.0,
+ "step": 2994,
+ "text_loss": 0.14842072129249573
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.058837890625,
+ "learning_rate": 0.0008581734686419999,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4831458.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009154081344604492,
+ "skip_count": 2.0,
+ "step": 2996,
+ "text_loss": 0.365400105714798
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.075139418843557,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.00085795743665299,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4834609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002899336162954569,
+ "skip_count": 0.0,
+ "step": 2998,
+ "text_loss": 0.5574684143066406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.0008577412674940152,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4838324.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034664268605411053,
+ "skip_count": 0.0,
+ "step": 3000,
+ "text_loss": 0.6752855777740479
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0008575249612479117,
+ "loss": 0.0127,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4841877.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036425739526748657,
+ "skip_count": 2.0,
+ "step": 3002,
+ "text_loss": 0.6332980394363403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.103316700909891,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 0.0008573085179975685,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4845840.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013783496106043458,
+ "skip_count": 0.0,
+ "step": 3004,
+ "text_loss": 0.4219617545604706
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0008570919378259274,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4848766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004823608323931694,
+ "skip_count": 1.0,
+ "step": 3006,
+ "text_loss": 0.7987180948257446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.000856875220815982,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4852310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014760984340682626,
+ "skip_count": 0.0,
+ "step": 3008,
+ "text_loss": 0.35592713952064514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.131493982976226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0008566583670507788,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4856146.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031717263627797365,
+ "skip_count": 1.0,
+ "step": 3010,
+ "text_loss": 0.19379083812236786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.140886410331671,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.0008564413766134164,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4859386.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003361492184922099,
+ "skip_count": 0.0,
+ "step": 3012,
+ "text_loss": 0.39129266142845154
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048583984375,
+ "learning_rate": 0.0008562242495870463,
+ "loss": 0.0113,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4862661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010563990799710155,
+ "skip_count": 0.0,
+ "step": 3014,
+ "text_loss": 0.5966938734054565
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0008560069860548716,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4865410.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001233913702890277,
+ "skip_count": 0.0,
+ "step": 3016,
+ "text_loss": 0.3386077880859375
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.169063692398003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.0008557895861001484,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4868931.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018066301709041,
+ "skip_count": 0.0,
+ "step": 3018,
+ "text_loss": 0.5222050547599792
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.178456119753449,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0008555720498061845,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4873492.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0050385501235723495,
+ "skip_count": 1.0,
+ "step": 3020,
+ "text_loss": 0.4558849334716797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.187848547108894,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0008553543772563403,
+ "loss": 0.009,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4877026.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004828717093914747,
+ "skip_count": 0.0,
+ "step": 3022,
+ "text_loss": 0.36598992347717285
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 25.0,
+ "epoch": 14.197240974464338,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.06103515625,
+ "learning_rate": 0.0008551365685340285,
+ "loss": 0.0084,
+ "macro_f1": 0.9555556178092957,
+ "num_tokens": 4879655.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02049369551241398,
+ "skip_count": 5.0,
+ "step": 3024,
+ "text_loss": 0.5069093704223633
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 14.206633401819783,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0008549186237227138,
+ "loss": 0.0088,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 4882606.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03947242721915245,
+ "skip_count": 2.0,
+ "step": 3026,
+ "text_loss": 0.2600715458393097
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 14.216025829175228,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.0008547005429059128,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4885246.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0026363315992057323,
+ "skip_count": 0.0,
+ "step": 3028,
+ "text_loss": 0.37642326951026917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.225418256530672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0008544823261671948,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4888109.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003858231008052826,
+ "skip_count": 0.0,
+ "step": 3030,
+ "text_loss": 0.5875385999679565
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 14.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061279296875,
+ "learning_rate": 0.0008542639735901804,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 4891168.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004789089784026146,
+ "skip_count": 1.0,
+ "step": 3032,
+ "text_loss": 0.6417325139045715
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.244203111241562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0008540454852585434,
+ "loss": 0.0115,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4894355.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007334680762141943,
+ "skip_count": 2.0,
+ "step": 3034,
+ "text_loss": 0.23697198927402496
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
+ "epoch": 14.253595538597006,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0008538268612560084,
+ "loss": 0.0058,
+ "macro_f1": 0.4871794879436493,
+ "num_tokens": 4897543.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022096361964941025,
+ "skip_count": 3.0,
+ "step": 3036,
+ "text_loss": 0.1989550143480301
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.262987965952451,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 0.0008536081016663527,
+ "loss": 0.0101,
+ "macro_f1": 1.0,
+ "num_tokens": 4900752.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037680594250559807,
+ "skip_count": 2.0,
+ "step": 3038,
+ "text_loss": 0.5001366138458252
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0008533892065734055,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4903581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032373068388551474,
+ "skip_count": 1.0,
+ "step": 3040,
+ "text_loss": 0.5019411444664001
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0008531701760610476,
+ "loss": 0.0121,
+ "macro_f1": 1.0,
+ "num_tokens": 4907108.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0078013185411691666,
+ "skip_count": 2.0,
+ "step": 3042,
+ "text_loss": 0.3460627794265747
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 28.0,
+ "epoch": 14.291165248018785,
+ "f1_execute": 0.9600000381469727,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.5,
+ "grad_norm": 0.04833984375,
+ "learning_rate": 0.000852951010213212,
+ "loss": 0.0089,
+ "macro_f1": 0.8200000524520874,
+ "num_tokens": 4911269.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03576689213514328,
+ "skip_count": 3.0,
+ "step": 3044,
+ "text_loss": 0.268994003534317
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 14.300557675374229,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0008527317091138835,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 4914203.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0032140621915459633,
+ "skip_count": 1.0,
+ "step": 3046,
+ "text_loss": 0.9998719692230225
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.309950102729674,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0008525122728470987,
+ "loss": 0.0102,
+ "macro_f1": 1.0,
+ "num_tokens": 4918562.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008559177629649639,
+ "skip_count": 3.0,
+ "step": 3048,
+ "text_loss": 0.3062439560890198
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0008522927014969459,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4921940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008735597133636475,
+ "skip_count": 2.0,
+ "step": 3050,
+ "text_loss": 0.3637430965900421
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05517578125,
+ "learning_rate": 0.0008520729951475652,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4925416.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012709591537714005,
+ "skip_count": 0.0,
+ "step": 3052,
+ "text_loss": 0.542036235332489
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.338127384796008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0008518531538831488,
+ "loss": 0.0096,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4928695.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010660928674042225,
+ "skip_count": 1.0,
+ "step": 3054,
+ "text_loss": 0.43144503235816956
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.059326171875,
+ "learning_rate": 0.00085163317778794,
+ "loss": 0.0096,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4931504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004558971151709557,
+ "skip_count": 2.0,
+ "step": 3056,
+ "text_loss": 0.5257010459899902
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.0008514130669462341,
+ "loss": 0.0105,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4934935.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010774781927466393,
+ "skip_count": 2.0,
+ "step": 3058,
+ "text_loss": 0.26061776280403137
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.366304666862343,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0008511928214423782,
+ "loss": 0.0103,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 4938047.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.014763157814741135,
+ "skip_count": 2.0,
+ "step": 3060,
+ "text_loss": 0.2856905460357666
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 14.375697094217786,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0008509724413607705,
+ "loss": 0.0087,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4941041.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004613345488905907,
+ "skip_count": 0.0,
+ "step": 3062,
+ "text_loss": 0.2870287001132965
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.385089521573232,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0008507519267858612,
+ "loss": 0.015,
+ "macro_f1": 1.0,
+ "num_tokens": 4944708.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008584189228713512,
+ "skip_count": 2.0,
+ "step": 3064,
+ "text_loss": 0.15828095376491547
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.394481948928677,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0008505312778021519,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4948295.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014670816017314792,
+ "skip_count": 0.0,
+ "step": 3066,
+ "text_loss": 0.36697930097579956
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.0008503104944941958,
+ "loss": 0.0107,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4951983.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005348859820514917,
+ "skip_count": 2.0,
+ "step": 3068,
+ "text_loss": 0.21612997353076935
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0008500895769465972,
+ "loss": 0.0111,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4955023.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013203793205320835,
+ "skip_count": 0.0,
+ "step": 3070,
+ "text_loss": 0.9757798314094543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.422659230995011,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 0.0008498685252440124,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4957600.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006907356437295675,
+ "skip_count": 0.0,
+ "step": 3072,
+ "text_loss": 0.356107234954834
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.432051658350455,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061279296875,
+ "learning_rate": 0.0008496473394711487,
+ "loss": 0.0116,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4960746.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027704904787242413,
+ "skip_count": 1.0,
+ "step": 3074,
+ "text_loss": 0.6812908053398132
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0008494260197127649,
+ "loss": 0.0093,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4963845.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036796489730477333,
+ "skip_count": 2.0,
+ "step": 3076,
+ "text_loss": 0.7215370535850525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0556640625,
+ "learning_rate": 0.0008492045660536712,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4966887.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037137691397219896,
+ "skip_count": 1.0,
+ "step": 3078,
+ "text_loss": 0.8700299859046936
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 14.460228940416789,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0008489829785787291,
+ "loss": 0.0078,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 4969859.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016492314636707306,
+ "skip_count": 2.0,
+ "step": 3080,
+ "text_loss": 0.6520360112190247
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043701171875,
+ "learning_rate": 0.0008487612573728513,
+ "loss": 0.0094,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4972628.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004022917244583368,
+ "skip_count": 2.0,
+ "step": 3082,
+ "text_loss": 0.17498187720775604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0008485394025210016,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4975475.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009141159243881702,
+ "skip_count": 1.0,
+ "step": 3084,
+ "text_loss": 0.5975366234779358
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.488406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 0.0008483174141081956,
+ "loss": 0.0113,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4978858.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031561285723000765,
+ "skip_count": 0.0,
+ "step": 3086,
+ "text_loss": 0.18748866021633148
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.497798649838568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0008480952922194991,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4982142.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007894713780842721,
+ "skip_count": 0.0,
+ "step": 3088,
+ "text_loss": 0.42083197832107544
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.0008478730369400302,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4984872.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005908289458602667,
+ "skip_count": 0.0,
+ "step": 3090,
+ "text_loss": 0.45337188243865967
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.516583504549457,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0008476506483549573,
+ "loss": 0.0101,
+ "macro_f1": 1.0,
+ "num_tokens": 4988137.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016509373672306538,
+ "skip_count": 2.0,
+ "step": 3092,
+ "text_loss": 0.6397262811660767
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0008474281265495002,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 4991164.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004088304936885834,
+ "skip_count": 1.0,
+ "step": 3094,
+ "text_loss": 0.18352322280406952
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0008472054716089295,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 4993876.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005200014915317297,
+ "skip_count": 0.0,
+ "step": 3096,
+ "text_loss": 0.2776511013507843
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.544760786615791,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0008469826836185673,
+ "loss": 0.01,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 4997068.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012686059810221195,
+ "skip_count": 2.0,
+ "step": 3098,
+ "text_loss": 0.23209233582019806
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.0008467597626637858,
+ "loss": 0.0074,
+ "macro_f1": 1.0,
+ "num_tokens": 5000038.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006401528604328632,
+ "skip_count": 2.0,
+ "step": 3100,
+ "text_loss": 0.45936745405197144
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.56354564132668,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0008465367088300093,
+ "loss": 0.0075,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5002870.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016640547662973404,
+ "skip_count": 1.0,
+ "step": 3102,
+ "text_loss": 0.44502779841423035
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.572938068682125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0008463135222027124,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5006357.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008411331102252007,
+ "skip_count": 2.0,
+ "step": 3104,
+ "text_loss": 0.3414570391178131
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.582330496037569,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0008460902028674204,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5009059.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010406570509076118,
+ "skip_count": 0.0,
+ "step": 3106,
+ "text_loss": 0.5931221842765808
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0008458667509097098,
+ "loss": 0.0115,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5012327.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001959054498001933,
+ "skip_count": 0.0,
+ "step": 3108,
+ "text_loss": 0.5191171169281006
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.0008456431664152078,
+ "loss": 0.0127,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5015472.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000994380097836256,
+ "skip_count": 0.0,
+ "step": 3110,
+ "text_loss": 0.4455361068248749
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.610507778103903,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0008454194494695923,
+ "loss": 0.0109,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5018901.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037662344984710217,
+ "skip_count": 0.0,
+ "step": 3112,
+ "text_loss": 0.5335362553596497
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 14.619900205459349,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0008451956001585923,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5022520.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008664715103805065,
+ "skip_count": 3.0,
+ "step": 3114,
+ "text_loss": 0.16230148077011108
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.629292632814794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.000844971618567987,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5025505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015904927859082818,
+ "skip_count": 0.0,
+ "step": 3116,
+ "text_loss": 0.6989432573318481
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.638685060170237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0008447475047836068,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5028767.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005853322334587574,
+ "skip_count": 1.0,
+ "step": 3118,
+ "text_loss": 0.31420737504959106
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 14.648077487525683,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0008445232588913325,
+ "loss": 0.0115,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5032577.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012760105542838573,
+ "skip_count": 0.0,
+ "step": 3120,
+ "text_loss": 0.5534627437591553
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0008442988809770953,
+ "loss": 0.0095,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5035381.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022257440723478794,
+ "skip_count": 0.0,
+ "step": 3122,
+ "text_loss": 0.42492759227752686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.666862342236572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0008440743711268775,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5038743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004648433532565832,
+ "skip_count": 0.0,
+ "step": 3124,
+ "text_loss": 0.16404685378074646
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0008438497294267117,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5041492.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006313877180218697,
+ "skip_count": 0.0,
+ "step": 3126,
+ "text_loss": 0.23191484808921814
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 14.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07666015625,
+ "learning_rate": 0.0008436249559626807,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5043955.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0036270488053560257,
+ "skip_count": 0.0,
+ "step": 3128,
+ "text_loss": 0.5782018303871155
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.695039624302906,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0008434000508209187,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5047571.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003809858812019229,
+ "skip_count": 1.0,
+ "step": 3130,
+ "text_loss": 0.7129825949668884
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.704432051658351,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0008431750140876092,
+ "loss": 0.0128,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5051608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022369057405740023,
+ "skip_count": 0.0,
+ "step": 3132,
+ "text_loss": 0.4433445930480957
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.713824479013795,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.000842949845848987,
+ "loss": 0.0135,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 5054656.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0425117202103138,
+ "skip_count": 2.0,
+ "step": 3134,
+ "text_loss": 0.38721024990081787
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0712890625,
+ "learning_rate": 0.0008427245461913368,
+ "loss": 0.0121,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5059108.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018077283166348934,
+ "skip_count": 0.0,
+ "step": 3136,
+ "text_loss": 0.7496368885040283
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.12109375,
+ "learning_rate": 0.0008424991152009941,
+ "loss": 0.0111,
+ "macro_f1": 1.0,
+ "num_tokens": 5062371.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008801834657788277,
+ "skip_count": 2.0,
+ "step": 3138,
+ "text_loss": 0.5337086319923401
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 14.742001761080129,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0008422735529643444,
+ "loss": 0.0097,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5065593.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00548676960170269,
+ "skip_count": 3.0,
+ "step": 3140,
+ "text_loss": 0.2561623156070709
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.751394188435574,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0008420478595678233,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5068271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006389956455677748,
+ "skip_count": 0.0,
+ "step": 3142,
+ "text_loss": 0.15605193376541138
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.760786615791018,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07958984375,
+ "learning_rate": 0.0008418220350979175,
+ "loss": 0.0128,
+ "macro_f1": 1.0,
+ "num_tokens": 5071358.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012387622147798538,
+ "skip_count": 2.0,
+ "step": 3144,
+ "text_loss": 0.3085838258266449
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0008415960796411628,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5075584.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00311864772811532,
+ "skip_count": 1.0,
+ "step": 3146,
+ "text_loss": 0.4786977469921112
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.779571470501908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1591796875,
+ "learning_rate": 0.0008413699932841461,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5078388.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030679800547659397,
+ "skip_count": 0.0,
+ "step": 3148,
+ "text_loss": 0.5222916603088379
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.788963897857352,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0008411437761135039,
+ "loss": 0.011,
+ "macro_f1": 1.0,
+ "num_tokens": 5081584.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012907958589494228,
+ "skip_count": 2.0,
+ "step": 3150,
+ "text_loss": 0.5369884371757507
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0008409174282159232,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5084450.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012314042076468468,
+ "skip_count": 2.0,
+ "step": 3152,
+ "text_loss": 0.25685277581214905
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 14.807748752568243,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.000840690949678141,
+ "loss": 0.0091,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5087865.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00899206381291151,
+ "skip_count": 0.0,
+ "step": 3154,
+ "text_loss": 0.1717093288898468
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.817141179923686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06103515625,
+ "learning_rate": 0.0008404643405869441,
+ "loss": 0.0098,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5090857.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013312003575265408,
+ "skip_count": 0.0,
+ "step": 3156,
+ "text_loss": 0.27446436882019043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.826533607279131,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1533203125,
+ "learning_rate": 0.0008402376010291695,
+ "loss": 0.0126,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5093917.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002653320087119937,
+ "skip_count": 0.0,
+ "step": 3158,
+ "text_loss": 0.4237489402294159
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0008400107310917045,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5096656.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012976993806660175,
+ "skip_count": 2.0,
+ "step": 3160,
+ "text_loss": 0.42361980676651
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.000839783730861486,
+ "loss": 0.0097,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5099582.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006936746649444103,
+ "skip_count": 2.0,
+ "step": 3162,
+ "text_loss": 0.26656073331832886
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.0008395566004255008,
+ "loss": 0.0127,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5102908.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006619359832257032,
+ "skip_count": 1.0,
+ "step": 3164,
+ "text_loss": 0.590774416923523
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.0008393293398707858,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5105829.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010120268911123276,
+ "skip_count": 2.0,
+ "step": 3166,
+ "text_loss": 0.605930507183075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.873495744056354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.0008391019492844275,
+ "loss": 0.0108,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5109850.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004940980114042759,
+ "skip_count": 2.0,
+ "step": 3168,
+ "text_loss": 0.12973152101039886
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0008388744287535627,
+ "loss": 0.0094,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5113353.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031777634285390377,
+ "skip_count": 1.0,
+ "step": 3170,
+ "text_loss": 0.18577200174331665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052734375,
+ "learning_rate": 0.0008386467783653775,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5116421.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005431659985333681,
+ "skip_count": 0.0,
+ "step": 3172,
+ "text_loss": 0.2302747517824173
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 14.901673026122689,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.000838418998207108,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5119457.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0077286697924137115,
+ "skip_count": 4.0,
+ "step": 3174,
+ "text_loss": 0.19606637954711914
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.0008381910883660399,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5123201.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003982985392212868,
+ "skip_count": 0.0,
+ "step": 3176,
+ "text_loss": 0.716376006603241
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 14.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.09423828125,
+ "learning_rate": 0.0008379630489295089,
+ "loss": 0.0109,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5126035.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005626026075333357,
+ "skip_count": 1.0,
+ "step": 3178,
+ "text_loss": 0.5144625902175903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.929850308189023,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0008377348799849,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5129179.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015458245761692524,
+ "skip_count": 2.0,
+ "step": 3180,
+ "text_loss": 0.29887503385543823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 14.939242735544468,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0008375065816196479,
+ "loss": 0.0086,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 5132149.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012210468761622906,
+ "skip_count": 2.0,
+ "step": 3182,
+ "text_loss": 0.8981851935386658
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.948635162899912,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0008372781539212371,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5135287.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0052537876181304455,
+ "skip_count": 0.0,
+ "step": 3184,
+ "text_loss": 0.4245666563510895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 14.958027590255357,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0008370495969772014,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5138589.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012873421423137188,
+ "skip_count": 2.0,
+ "step": 3186,
+ "text_loss": 0.40581050515174866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 14.9674200176108,
+ "f1_execute": 0.95652174949646,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07470703125,
+ "learning_rate": 0.0008368209108751244,
+ "loss": 0.0127,
+ "macro_f1": 0.6521739363670349,
+ "num_tokens": 5141635.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.07720445841550827,
+ "skip_count": 4.0,
+ "step": 3188,
+ "text_loss": 0.3755173981189728
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0008365920957026389,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5144728.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001440995605662465,
+ "skip_count": 0.0,
+ "step": 3190,
+ "text_loss": 0.5067034363746643
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 14.986204872321691,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0008363631515474275,
+ "loss": 0.0089,
+ "macro_f1": 0.6538461446762085,
+ "num_tokens": 5147963.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.018752984702587128,
+ "skip_count": 2.0,
+ "step": 3192,
+ "text_loss": 0.20224551856517792
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 14.995597299677135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0008361340784972217,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5151184.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005360354552976787,
+ "skip_count": 0.0,
+ "step": 3194,
+ "text_loss": 0.4588058292865753
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.004696213677722,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0008359048766398031,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5153889.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009184491937048733,
+ "skip_count": 1.0,
+ "step": 3196,
+ "text_loss": 0.2980220317840576
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.014088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.000835675546063002,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5156758.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001252970308996737,
+ "skip_count": 0.0,
+ "step": 3198,
+ "text_loss": 0.6775755882263184
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 0.0008354460868546985,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5160247.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037315806839615107,
+ "skip_count": 0.0,
+ "step": 3200,
+ "text_loss": 0.35867011547088623
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 15.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0008352164991028217,
+ "loss": 0.0092,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5163456.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001497485558502376,
+ "skip_count": 0.0,
+ "step": 3202,
+ "text_loss": 0.690290093421936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.042265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0008349867828953501,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5166139.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001051135826855898,
+ "skip_count": 0.0,
+ "step": 3204,
+ "text_loss": 0.3340415954589844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.051658350454945,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0008347569383203113,
+ "loss": 0.0098,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5169009.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010544003453105688,
+ "skip_count": 0.0,
+ "step": 3206,
+ "text_loss": 0.8584878444671631
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0008345269654657823,
+ "loss": 0.0085,
+ "macro_f1": 1.0,
+ "num_tokens": 5172618.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007312417030334473,
+ "skip_count": 1.0,
+ "step": 3208,
+ "text_loss": 0.19500218331813812
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.070443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0008342968644198892,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5175857.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00276504410430789,
+ "skip_count": 0.0,
+ "step": 3210,
+ "text_loss": 0.5446314215660095
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.079835632521279,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0008340666352708068,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5178585.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002669303445145488,
+ "skip_count": 0.0,
+ "step": 3212,
+ "text_loss": 0.3687484860420227
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0008338362781067596,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5181777.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031585274264216423,
+ "skip_count": 0.0,
+ "step": 3214,
+ "text_loss": 0.27325859665870667
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.000833605793016021,
+ "loss": 0.009,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5184312.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008807534351944923,
+ "skip_count": 2.0,
+ "step": 3216,
+ "text_loss": 0.4466548562049866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.108012914587613,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0008333751800869133,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5187497.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003171310294419527,
+ "skip_count": 0.0,
+ "step": 3218,
+ "text_loss": 0.5423526763916016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.117405341943059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0008331444394078076,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5190982.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016481258207932115,
+ "skip_count": 2.0,
+ "step": 3220,
+ "text_loss": 0.48984917998313904
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.126797769298504,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.000832913571067124,
+ "loss": 0.0107,
+ "macro_f1": 1.0,
+ "num_tokens": 5194044.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003957313951104879,
+ "skip_count": 1.0,
+ "step": 3222,
+ "text_loss": 0.4533331096172333
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.136190196653947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0008326825751533322,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5197092.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016904744552448392,
+ "skip_count": 0.0,
+ "step": 3224,
+ "text_loss": 0.5538802742958069
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0008324514517549501,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5199941.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005608258303254843,
+ "skip_count": 1.0,
+ "step": 3226,
+ "text_loss": 0.416242778301239
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 15.154975051364836,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0008322202009605444,
+ "loss": 0.0072,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 5202618.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.020965175703167915,
+ "skip_count": 2.0,
+ "step": 3228,
+ "text_loss": 0.17496295273303986
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 15.164367478720282,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0008319888228587311,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 5206414.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.021259209141135216,
+ "skip_count": 5.0,
+ "step": 3230,
+ "text_loss": 0.22471418976783752
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0008317573175381745,
+ "loss": 0.0115,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5209768.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018647604156285524,
+ "skip_count": 0.0,
+ "step": 3232,
+ "text_loss": 0.4415269196033478
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.0008315256850875881,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5213257.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002345515415072441,
+ "skip_count": 0.0,
+ "step": 3234,
+ "text_loss": 0.347247838973999
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 15.192544760786616,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0008312939255957336,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5215800.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007112892810255289,
+ "skip_count": 3.0,
+ "step": 3236,
+ "text_loss": 0.31091734766960144
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.201937188142061,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0008310620391514219,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5219205.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00432228296995163,
+ "skip_count": 0.0,
+ "step": 3238,
+ "text_loss": 0.3421775996685028
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0008308300258435124,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5222422.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0076514314860105515,
+ "skip_count": 2.0,
+ "step": 3240,
+ "text_loss": 0.22378318011760712
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0008305978857609128,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5225625.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007617069641128182,
+ "skip_count": 0.0,
+ "step": 3242,
+ "text_loss": 0.5880323648452759
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0008303656189925799,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5229113.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017418119823560119,
+ "skip_count": 0.0,
+ "step": 3244,
+ "text_loss": 0.3302813768386841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.239506897563839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0008301332256275183,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5232061.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026667986530810595,
+ "skip_count": 0.0,
+ "step": 3246,
+ "text_loss": 0.5679706335067749
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.248899324919284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0008299007057547821,
+ "loss": 0.0106,
+ "macro_f1": 1.0,
+ "num_tokens": 5235279.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011016624979674816,
+ "skip_count": 2.0,
+ "step": 3248,
+ "text_loss": 0.5081504583358765
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 15.258291752274728,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0008296680594634731,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5239655.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005492044147104025,
+ "skip_count": 0.0,
+ "step": 3250,
+ "text_loss": 0.14675180613994598
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0008294352868427418,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5243579.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00404445780441165,
+ "skip_count": 1.0,
+ "step": 3252,
+ "text_loss": 0.4201085865497589
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.277076606985618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.0008292023879817871,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5247059.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006886140909045935,
+ "skip_count": 1.0,
+ "step": 3254,
+ "text_loss": 0.2289208322763443
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.286469034341062,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.057861328125,
+ "learning_rate": 0.0008289693629698564,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5249940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005736657767556608,
+ "skip_count": 0.0,
+ "step": 3256,
+ "text_loss": 0.5670450925827026
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.295861461696507,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0008287362118962452,
+ "loss": 0.006,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5253580.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011349895037710667,
+ "skip_count": 1.0,
+ "step": 3258,
+ "text_loss": 0.5042323470115662
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.305253889051952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0008285029348502973,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5257080.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013626761501654983,
+ "skip_count": 0.0,
+ "step": 3260,
+ "text_loss": 0.3227672874927521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.314646316407396,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.0008282695319214053,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5259951.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00471635302528739,
+ "skip_count": 0.0,
+ "step": 3262,
+ "text_loss": 0.20773714780807495
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.324038743762841,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0008280360031990093,
+ "loss": 0.0107,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5263314.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010472415015101433,
+ "skip_count": 2.0,
+ "step": 3264,
+ "text_loss": 0.34397366642951965
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.333431171118287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.000827802348772598,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5267358.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007814752752892673,
+ "skip_count": 0.0,
+ "step": 3266,
+ "text_loss": 0.747342586517334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.0008275685687317084,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5270400.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000902949133887887,
+ "skip_count": 0.0,
+ "step": 3268,
+ "text_loss": 0.43782034516334534
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0008273346631659252,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5273147.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00043462219764478505,
+ "skip_count": 0.0,
+ "step": 3270,
+ "text_loss": 0.6358205080032349
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.361608453184619,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0008271006321648816,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5277638.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002211218234151602,
+ "skip_count": 0.0,
+ "step": 3272,
+ "text_loss": 0.20220105350017548
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 15.371000880540064,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0008268664758182589,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5280638.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010536720044910908,
+ "skip_count": 0.0,
+ "step": 3274,
+ "text_loss": 0.7579061388969421
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.0008266321942157859,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5283847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017158017726615071,
+ "skip_count": 0.0,
+ "step": 3276,
+ "text_loss": 0.669302761554718
+ },
+ {
+ "acc_repeat": 0.800000011920929,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.389785735250953,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.888888955116272,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06005859375,
+ "learning_rate": 0.0008263977874472399,
+ "loss": 0.0088,
+ "macro_f1": 0.9544159770011902,
+ "num_tokens": 5286627.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.011220700107514858,
+ "skip_count": 4.0,
+ "step": 3278,
+ "text_loss": 0.8703984022140503
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.399178162606399,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0008261632556024461,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5289766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020442772656679153,
+ "skip_count": 0.0,
+ "step": 3280,
+ "text_loss": 0.5009346008300781
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10107421875,
+ "learning_rate": 0.0008259285987712774,
+ "loss": 0.0106,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5293010.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005645765457302332,
+ "skip_count": 0.0,
+ "step": 3282,
+ "text_loss": 0.2546011209487915
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0008256938170436549,
+ "loss": 0.0111,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5296732.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027385836001485586,
+ "skip_count": 2.0,
+ "step": 3284,
+ "text_loss": 0.5244000554084778
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.427355444672733,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.0008254589105095473,
+ "loss": 0.0061,
+ "macro_f1": 1.0,
+ "num_tokens": 5299926.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007451715879142284,
+ "skip_count": 1.0,
+ "step": 3286,
+ "text_loss": 0.28979742527008057
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0008252238792589711,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5303006.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004805843345820904,
+ "skip_count": 2.0,
+ "step": 3288,
+ "text_loss": 0.5131978392601013
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.446140299383622,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.000824988723381991,
+ "loss": 0.0091,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5306953.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010639613494277,
+ "skip_count": 1.0,
+ "step": 3290,
+ "text_loss": 0.4901447296142578
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 15.455532726739067,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.0008247534429687191,
+ "loss": 0.007,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 5310516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013625577092170715,
+ "skip_count": 2.0,
+ "step": 3292,
+ "text_loss": 0.2124534696340561
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0008245180381093152,
+ "loss": 0.0114,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5313959.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004958513658493757,
+ "skip_count": 1.0,
+ "step": 3294,
+ "text_loss": 0.46682238578796387
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0008242825088939867,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5316609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003962756600230932,
+ "skip_count": 0.0,
+ "step": 3296,
+ "text_loss": 0.7010108232498169
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.483710008805401,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0008240468554129892,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5319638.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006996620795689523,
+ "skip_count": 0.0,
+ "step": 3298,
+ "text_loss": 0.4966355860233307
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.493102436160845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0008238110777566255,
+ "loss": 0.0101,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5323019.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016031896229833364,
+ "skip_count": 0.0,
+ "step": 3300,
+ "text_loss": 0.38668957352638245
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0008235751760152459,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 5326099.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00344281829893589,
+ "skip_count": 2.0,
+ "step": 3302,
+ "text_loss": 0.5330720543861389
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.511887290871735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06005859375,
+ "learning_rate": 0.0008233391502792484,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5328993.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007886730134487152,
+ "skip_count": 1.0,
+ "step": 3304,
+ "text_loss": 0.5470269322395325
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.521279718227179,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0008231030006390786,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5331554.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008180000819265842,
+ "skip_count": 1.0,
+ "step": 3306,
+ "text_loss": 0.4023340344429016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.0008228667271852294,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5335712.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002942821884062141,
+ "skip_count": 0.0,
+ "step": 3308,
+ "text_loss": 0.5306711792945862
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.0008226303300082414,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5338701.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006134595023468137,
+ "skip_count": 0.0,
+ "step": 3310,
+ "text_loss": 0.5906263589859009
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.549457000293513,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0008223938091987022,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5342274.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016656654188409448,
+ "skip_count": 0.0,
+ "step": 3312,
+ "text_loss": 0.5201764106750488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.558849427648958,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.0008221571648472472,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5345185.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038612703792750835,
+ "skip_count": 0.0,
+ "step": 3314,
+ "text_loss": 0.36633720993995667
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.568241855004402,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0008219203970445589,
+ "loss": 0.011,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5348804.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009782899171113968,
+ "skip_count": 1.0,
+ "step": 3316,
+ "text_loss": 0.3117460012435913
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.577634282359847,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055908203125,
+ "learning_rate": 0.0008216835058813672,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5351896.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007713229861110449,
+ "skip_count": 0.0,
+ "step": 3318,
+ "text_loss": 0.253496378660202
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0008214464914484492,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5355058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006227815989404917,
+ "skip_count": 2.0,
+ "step": 3320,
+ "text_loss": 0.32693132758140564
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0008212093538366292,
+ "loss": 0.0099,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5358365.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002601418411359191,
+ "skip_count": 0.0,
+ "step": 3322,
+ "text_loss": 0.40394455194473267
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 15.605811564426181,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.000820972093136779,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5360981.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005545300897210836,
+ "skip_count": 3.0,
+ "step": 3324,
+ "text_loss": 0.6758295893669128
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 15.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.0008207347094398172,
+ "loss": 0.0096,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5364018.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001924700103700161,
+ "skip_count": 0.0,
+ "step": 3326,
+ "text_loss": 0.5196860432624817
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0008204972028367097,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5366986.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012254828587174416,
+ "skip_count": 1.0,
+ "step": 3328,
+ "text_loss": 0.24661913514137268
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.633988846492516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0008202595734184694,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5371463.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005094083491712809,
+ "skip_count": 0.0,
+ "step": 3330,
+ "text_loss": 0.2525769770145416
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 15.643381273847961,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0008200218212761566,
+ "loss": 0.0108,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5374823.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0025883198250085115,
+ "skip_count": 0.0,
+ "step": 3332,
+ "text_loss": 0.21849912405014038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.000819783946500878,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5377640.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008240507915616035,
+ "skip_count": 0.0,
+ "step": 3334,
+ "text_loss": 0.2662734091281891
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 15.66216612855885,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.000819545949183788,
+ "loss": 0.01,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 5380593.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.038378193974494934,
+ "skip_count": 3.0,
+ "step": 3336,
+ "text_loss": 0.2431795746088028
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.671558555914293,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0008193078294160874,
+ "loss": 0.0097,
+ "macro_f1": 1.0,
+ "num_tokens": 5384487.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005926199723035097,
+ "skip_count": 1.0,
+ "step": 3338,
+ "text_loss": 0.5663705468177795
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.680950983269739,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0008190695872890242,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5387511.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010842559859156609,
+ "skip_count": 2.0,
+ "step": 3340,
+ "text_loss": 0.11517292261123657
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.690343410625184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.0008188312228938933,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5390698.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001304097007960081,
+ "skip_count": 0.0,
+ "step": 3342,
+ "text_loss": 0.4827076196670532
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 15.699735837980628,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0008185927363220363,
+ "loss": 0.0087,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5393778.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005354117136448622,
+ "skip_count": 0.0,
+ "step": 3344,
+ "text_loss": 0.44467049837112427
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0008183541276648418,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5396925.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004800073802471161,
+ "skip_count": 2.0,
+ "step": 3346,
+ "text_loss": 0.2032834142446518
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.718520692691518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0008181153970137449,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5400522.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021674633026123047,
+ "skip_count": 0.0,
+ "step": 3348,
+ "text_loss": 0.4507528841495514
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.727913120046962,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0008178765444602278,
+ "loss": 0.0117,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 5403526.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04263930395245552,
+ "skip_count": 2.0,
+ "step": 3350,
+ "text_loss": 0.3606615960597992
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 15.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0008176375700958194,
+ "loss": 0.0087,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5407127.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006953123956918716,
+ "skip_count": 0.0,
+ "step": 3352,
+ "text_loss": 0.2290353775024414
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0008173984740120948,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5410829.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014363783411681652,
+ "skip_count": 0.0,
+ "step": 3354,
+ "text_loss": 0.4220392405986786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.756090402113296,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0008171592563006762,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5414152.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00202389364130795,
+ "skip_count": 1.0,
+ "step": 3356,
+ "text_loss": 0.37729766964912415
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.765482829468741,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0008169199170532323,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5417312.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006253739818930626,
+ "skip_count": 2.0,
+ "step": 3358,
+ "text_loss": 0.1304289996623993
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0008166804563614785,
+ "loss": 0.0084,
+ "macro_f1": 1.0,
+ "num_tokens": 5421227.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01622140221297741,
+ "skip_count": 2.0,
+ "step": 3360,
+ "text_loss": 0.298664391040802
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0008164408743171763,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 5424646.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037176944315433502,
+ "skip_count": 2.0,
+ "step": 3362,
+ "text_loss": 0.12147632241249084
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 0.0008162011710121339,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5427897.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020403533708304167,
+ "skip_count": 1.0,
+ "step": 3364,
+ "text_loss": 0.2656533420085907
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.803052538890519,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0008159613465382066,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5430474.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018634048756211996,
+ "skip_count": 0.0,
+ "step": 3366,
+ "text_loss": 0.9133086204528809
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.812444966245964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0008157214009872951,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5433113.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012944488786160946,
+ "skip_count": 2.0,
+ "step": 3368,
+ "text_loss": 0.24352453649044037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05712890625,
+ "learning_rate": 0.0008154813344513472,
+ "loss": 0.0143,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5436259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002347963862121105,
+ "skip_count": 2.0,
+ "step": 3370,
+ "text_loss": 0.7601244449615479
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0008152411470223568,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5439126.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016609140438959002,
+ "skip_count": 0.0,
+ "step": 3372,
+ "text_loss": 0.5551947355270386
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.840622248312298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0008150008387923643,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5442739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008321396075189114,
+ "skip_count": 0.0,
+ "step": 3374,
+ "text_loss": 0.25028282403945923
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 15.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08544921875,
+ "learning_rate": 0.000814760409853456,
+ "loss": 0.0105,
+ "macro_f1": 1.0,
+ "num_tokens": 5445247.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009738070890307426,
+ "skip_count": 1.0,
+ "step": 3376,
+ "text_loss": 0.37271201610565186
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0008145198602977651,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5449044.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028421466704458,
+ "skip_count": 0.0,
+ "step": 3378,
+ "text_loss": 0.1458655595779419
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.868799530378633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11474609375,
+ "learning_rate": 0.0008142791902174701,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5453063.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015170135302469134,
+ "skip_count": 0.0,
+ "step": 3380,
+ "text_loss": 0.5548722743988037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 15.878191957734076,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0008140383997047966,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5455814.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022444510832428932,
+ "skip_count": 1.0,
+ "step": 3382,
+ "text_loss": 0.8034513592720032
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.887584385089522,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.000813797488852016,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5459392.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00038578867679461837,
+ "skip_count": 0.0,
+ "step": 3384,
+ "text_loss": 0.6940088868141174
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.896976812444967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0008135564577514458,
+ "loss": 0.011,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5462413.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019727381877601147,
+ "skip_count": 0.0,
+ "step": 3386,
+ "text_loss": 0.5124650597572327
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.099609375,
+ "learning_rate": 0.0008133153064954495,
+ "loss": 0.0107,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5465552.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019896167796105146,
+ "skip_count": 0.0,
+ "step": 3388,
+ "text_loss": 0.4292517900466919
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 15.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0008130740351764367,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 5468573.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030118159484118223,
+ "skip_count": 1.0,
+ "step": 3390,
+ "text_loss": 0.48903173208236694
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 15.925154094511301,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.000812832643886863,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5471547.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005084246397018433,
+ "skip_count": 2.0,
+ "step": 3392,
+ "text_loss": 0.35789889097213745
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.934546521866745,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0008125911327192299,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5474331.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008874498889781535,
+ "skip_count": 0.0,
+ "step": 3394,
+ "text_loss": 0.6267408728599548
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0008123495017660851,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5477633.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001794386887922883,
+ "skip_count": 0.0,
+ "step": 3396,
+ "text_loss": 0.3701885938644409
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0008121077511200221,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5481277.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002140481723472476,
+ "skip_count": 0.0,
+ "step": 3398,
+ "text_loss": 0.6362857818603516
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.962723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0556640625,
+ "learning_rate": 0.00081186588087368,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5484237.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000867189432028681,
+ "skip_count": 0.0,
+ "step": 3400,
+ "text_loss": 1.0847382545471191
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.0008116238911197442,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5487423.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029817656613886356,
+ "skip_count": 0.0,
+ "step": 3402,
+ "text_loss": 0.3813740313053131
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0008113817819509454,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5490155.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035141287371516228,
+ "skip_count": 0.0,
+ "step": 3404,
+ "text_loss": 0.2113083451986313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 15.990901085999413,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0008111395534600603,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5493415.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003317659953609109,
+ "skip_count": 0.0,
+ "step": 3406,
+ "text_loss": 0.5869330167770386
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.0008108972057399114,
+ "loss": 0.0123,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5496032.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003833734430372715,
+ "skip_count": 2.0,
+ "step": 3408,
+ "text_loss": 0.2938928008079529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11328125,
+ "learning_rate": 0.0008106547388833669,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5498890.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002622978063300252,
+ "skip_count": 1.0,
+ "step": 3410,
+ "text_loss": 0.3130980432033539
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0008104121529833402,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5502010.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007447598036378622,
+ "skip_count": 0.0,
+ "step": 3412,
+ "text_loss": 0.4413072466850281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.000810169448132791,
+ "loss": 0.0093,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5505212.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031087708193808794,
+ "skip_count": 1.0,
+ "step": 3414,
+ "text_loss": 0.2910428047180176
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.037569709421778,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0008099266244247243,
+ "loss": 0.0082,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5508755.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02510393038392067,
+ "skip_count": 1.0,
+ "step": 3416,
+ "text_loss": 0.33022749423980713
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0008096836819521903,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5512034.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020537273958325386,
+ "skip_count": 1.0,
+ "step": 3418,
+ "text_loss": 0.4731218218803406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0008094406208082853,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5515707.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004218162503093481,
+ "skip_count": 2.0,
+ "step": 3420,
+ "text_loss": 0.23429590463638306
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 16.065746991488112,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0869140625,
+ "learning_rate": 0.0008091974410861507,
+ "loss": 0.0069,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 5518436.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013488355092704296,
+ "skip_count": 3.0,
+ "step": 3422,
+ "text_loss": 0.45768749713897705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.07513941884356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0008089541428789733,
+ "loss": 0.0097,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5522368.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010335417464375496,
+ "skip_count": 1.0,
+ "step": 3424,
+ "text_loss": 0.43423423171043396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0008087107262799855,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5526061.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002134323585778475,
+ "skip_count": 0.0,
+ "step": 3426,
+ "text_loss": 0.4031757414340973
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1318359375,
+ "learning_rate": 0.0008084671913824651,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5529284.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0097216060385108,
+ "skip_count": 2.0,
+ "step": 3428,
+ "text_loss": 0.2836039960384369
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.000808223538279735,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5532159.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001684269867837429,
+ "skip_count": 0.0,
+ "step": 3430,
+ "text_loss": 0.5804527401924133
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0008079797670651637,
+ "loss": 0.008,
+ "macro_f1": 1.0,
+ "num_tokens": 5536050.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013918434269726276,
+ "skip_count": 1.0,
+ "step": 3432,
+ "text_loss": 0.31325826048851013
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0008077358778321647,
+ "loss": 0.011,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5538885.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007751787197776139,
+ "skip_count": 0.0,
+ "step": 3434,
+ "text_loss": 0.783108115196228
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.131493982976224,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0008074918706741966,
+ "loss": 0.0063,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 5541909.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.021819550544023514,
+ "skip_count": 2.0,
+ "step": 3436,
+ "text_loss": 0.6558083295822144
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.14088641033167,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0008072477456847638,
+ "loss": 0.0057,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5545101.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03309348225593567,
+ "skip_count": 0.0,
+ "step": 3438,
+ "text_loss": 0.9877075552940369
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.0008070035029574151,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 5548971.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008696741424500942,
+ "skip_count": 1.0,
+ "step": 3440,
+ "text_loss": 0.24766330420970917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 16.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.000806759142585745,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5552174.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004240929149091244,
+ "skip_count": 3.0,
+ "step": 3442,
+ "text_loss": 0.37255001068115234
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05322265625,
+ "learning_rate": 0.0008065146646633927,
+ "loss": 0.0088,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5555005.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014345484785735607,
+ "skip_count": 1.0,
+ "step": 3444,
+ "text_loss": 0.26157206296920776
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06005859375,
+ "learning_rate": 0.0008062700692840428,
+ "loss": 0.0083,
+ "macro_f1": 1.0,
+ "num_tokens": 5559127.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008315163664519787,
+ "skip_count": 2.0,
+ "step": 3446,
+ "text_loss": 0.21971040964126587
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 16.187848547108892,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 0.0008060253565414246,
+ "loss": 0.009,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 5562254.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009582413360476494,
+ "skip_count": 3.0,
+ "step": 3448,
+ "text_loss": 0.6758295893669128
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.19724097446434,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0008057805265293124,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5565515.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002429503947496414,
+ "skip_count": 0.0,
+ "step": 3450,
+ "text_loss": 0.696592390537262
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0008055355793415257,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5568392.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007724192109890282,
+ "skip_count": 0.0,
+ "step": 3452,
+ "text_loss": 0.7092870473861694
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.216025829175226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0008052905150719285,
+ "loss": 0.0099,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5571090.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010859938338398933,
+ "skip_count": 0.0,
+ "step": 3454,
+ "text_loss": 0.6593860387802124
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0008050453338144301,
+ "loss": 0.0072,
+ "macro_f1": 1.0,
+ "num_tokens": 5574552.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030258705373853445,
+ "skip_count": 1.0,
+ "step": 3456,
+ "text_loss": 0.3479384481906891
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0008048000356629844,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5577484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005052885971963406,
+ "skip_count": 2.0,
+ "step": 3458,
+ "text_loss": 0.21858671307563782
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0008045546207115901,
+ "loss": 0.0068,
+ "macro_f1": 1.0,
+ "num_tokens": 5581605.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009976249188184738,
+ "skip_count": 3.0,
+ "step": 3460,
+ "text_loss": 0.16868001222610474
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0008043090890542904,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5584994.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00270817126147449,
+ "skip_count": 0.0,
+ "step": 3462,
+ "text_loss": 0.785690426826477
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0008040634407851739,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5588067.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018436965765431523,
+ "skip_count": 0.0,
+ "step": 3464,
+ "text_loss": 0.5006644129753113
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0008038176759983731,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5590789.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008516279980540276,
+ "skip_count": 2.0,
+ "step": 3466,
+ "text_loss": 0.20963478088378906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0008035717947880659,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5593472.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016293043736368418,
+ "skip_count": 0.0,
+ "step": 3468,
+ "text_loss": 0.7376078963279724
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0008033257972484742,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5596108.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002364142332226038,
+ "skip_count": 0.0,
+ "step": 3470,
+ "text_loss": 0.5156455039978027
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0008030796834738649,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5599103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008872323669493198,
+ "skip_count": 0.0,
+ "step": 3472,
+ "text_loss": 0.2996419668197632
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 16.309950102729672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043701171875,
+ "learning_rate": 0.0008028334535585491,
+ "loss": 0.0087,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5602410.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011508257128298283,
+ "skip_count": 3.0,
+ "step": 3474,
+ "text_loss": 0.25438693165779114
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0008025871075968827,
+ "loss": 0.0106,
+ "macro_f1": 1.0,
+ "num_tokens": 5605424.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.017225435003638268,
+ "skip_count": 2.0,
+ "step": 3476,
+ "text_loss": 0.2549574077129364
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.328734957440563,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0008023406456832657,
+ "loss": 0.0111,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 5608266.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.039165645837783813,
+ "skip_count": 2.0,
+ "step": 3478,
+ "text_loss": 0.1797947734594345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0008020940679121429,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5611471.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009718866203911602,
+ "skip_count": 0.0,
+ "step": 3480,
+ "text_loss": 0.8267702460289001
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0008018473743780036,
+ "loss": 0.0093,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5615046.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006087122485041618,
+ "skip_count": 2.0,
+ "step": 3482,
+ "text_loss": 0.7267677187919617
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.000801600565175381,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5618350.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007539413054473698,
+ "skip_count": 0.0,
+ "step": 3484,
+ "text_loss": 0.5910211801528931
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.0008013536403988529,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5621381.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008076327503658831,
+ "skip_count": 0.0,
+ "step": 3486,
+ "text_loss": 0.30616798996925354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 16.375697094217788,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0008011066001430412,
+ "loss": 0.0086,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 5624617.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023835813626646996,
+ "skip_count": 4.0,
+ "step": 3488,
+ "text_loss": 0.3376443088054657
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.38508952157323,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0008008594445026122,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5627989.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004226419143378735,
+ "skip_count": 2.0,
+ "step": 3490,
+ "text_loss": 0.8185343146324158
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.394481948928675,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0008006121735722767,
+ "loss": 0.0084,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 5632286.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0366671048104763,
+ "skip_count": 2.0,
+ "step": 3492,
+ "text_loss": 0.2209547609090805
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.403874376284122,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0008003647874467892,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5635368.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012956378981471062,
+ "skip_count": 0.0,
+ "step": 3494,
+ "text_loss": 0.20468664169311523
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.059814453125,
+ "learning_rate": 0.0008001172862209485,
+ "loss": 0.0103,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5638440.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017375422175973654,
+ "skip_count": 0.0,
+ "step": 3496,
+ "text_loss": 0.6647221446037292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 16.42265923099501,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.0007998696699895976,
+ "loss": 0.0091,
+ "macro_f1": 0.6592592597007751,
+ "num_tokens": 5641996.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.025240756571292877,
+ "skip_count": 5.0,
+ "step": 3498,
+ "text_loss": 0.23892143368721008
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.432051658350456,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.0007996219388476236,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5645071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007436830550432205,
+ "skip_count": 1.0,
+ "step": 3500,
+ "text_loss": 0.7580804228782654
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.0007993740928899571,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5648175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001126602990552783,
+ "skip_count": 0.0,
+ "step": 3502,
+ "text_loss": 0.5281378626823425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0007991261322115737,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5650973.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007907263352535665,
+ "skip_count": 0.0,
+ "step": 3504,
+ "text_loss": 0.25220927596092224
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.000798878056907492,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 5654252.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006263538729399443,
+ "skip_count": 2.0,
+ "step": 3506,
+ "text_loss": 0.46569153666496277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 16.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0703125,
+ "learning_rate": 0.0007986298670727752,
+ "loss": 0.0098,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5657229.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004049144219607115,
+ "skip_count": 3.0,
+ "step": 3508,
+ "text_loss": 0.15174436569213867
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 28.0,
+ "epoch": 16.479013795127678,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0791015625,
+ "learning_rate": 0.0007983815628025301,
+ "loss": 0.0074,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 5659974.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0471976138651371,
+ "skip_count": 3.0,
+ "step": 3510,
+ "text_loss": 0.39072203636169434
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.488406222483125,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.000798133144191907,
+ "loss": 0.0082,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5662893.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04030488431453705,
+ "skip_count": 1.0,
+ "step": 3512,
+ "text_loss": 0.3562147617340088
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0595703125,
+ "learning_rate": 0.0007978846113361009,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5666476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007475079502910376,
+ "skip_count": 1.0,
+ "step": 3514,
+ "text_loss": 0.26518192887306213
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.0007976359643303497,
+ "loss": 0.013,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5669647.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00558585487306118,
+ "skip_count": 2.0,
+ "step": 3516,
+ "text_loss": 0.29284560680389404
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0007973872032699354,
+ "loss": 0.0082,
+ "macro_f1": 1.0,
+ "num_tokens": 5673491.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0026981087867170572,
+ "skip_count": 1.0,
+ "step": 3518,
+ "text_loss": 0.35089045763015747
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.000797138328250184,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5676529.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0027328627184033394,
+ "skip_count": 0.0,
+ "step": 3520,
+ "text_loss": 0.41077399253845215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 16.535368359260346,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0007968893393664646,
+ "loss": 0.01,
+ "macro_f1": 0.6592592597007751,
+ "num_tokens": 5679987.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02695014327764511,
+ "skip_count": 5.0,
+ "step": 3522,
+ "text_loss": 0.44942837953567505
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007966402367141903,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5683185.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00817026849836111,
+ "skip_count": 2.0,
+ "step": 3524,
+ "text_loss": 0.14528048038482666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.0007963910203888176,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5686544.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021973433904349804,
+ "skip_count": 0.0,
+ "step": 3526,
+ "text_loss": 0.22358648478984833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.56354564132668,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0007961416904858469,
+ "loss": 0.0078,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5689579.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.033712416887283325,
+ "skip_count": 1.0,
+ "step": 3528,
+ "text_loss": 0.3083649277687073
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0007958922471008217,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5692869.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011182719841599464,
+ "skip_count": 2.0,
+ "step": 3530,
+ "text_loss": 0.21288011968135834
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0007956426903293292,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5696007.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015808293828740716,
+ "skip_count": 0.0,
+ "step": 3532,
+ "text_loss": 0.6068631410598755
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.591722923393014,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052734375,
+ "learning_rate": 0.0007953930202670001,
+ "loss": 0.0062,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 5699474.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03205178305506706,
+ "skip_count": 0.0,
+ "step": 3534,
+ "text_loss": 0.4317135512828827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.601115350748458,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0007951432370095084,
+ "loss": 0.0105,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5703483.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003518853336572647,
+ "skip_count": 0.0,
+ "step": 3536,
+ "text_loss": 0.5432273149490356
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 16.610507778103905,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11083984375,
+ "learning_rate": 0.0007948933406525715,
+ "loss": 0.01,
+ "macro_f1": 1.0,
+ "num_tokens": 5707301.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004982157610356808,
+ "skip_count": 1.0,
+ "step": 3538,
+ "text_loss": 0.40061065554618835
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.61990020545935,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0007946433312919502,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5710847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003067734418436885,
+ "skip_count": 0.0,
+ "step": 3540,
+ "text_loss": 0.5396234393119812
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 16.629292632814792,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0007943932090234486,
+ "loss": 0.0097,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 5713683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03728383034467697,
+ "skip_count": 2.0,
+ "step": 3542,
+ "text_loss": 0.18310914933681488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 16.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0007941429739429138,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5716397.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025092530995607376,
+ "skip_count": 3.0,
+ "step": 3544,
+ "text_loss": 0.5806207060813904
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0007938926261462366,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5719984.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002493767999112606,
+ "skip_count": 0.0,
+ "step": 3546,
+ "text_loss": 0.38606807589530945
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 16.657469914881126,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.0007936421657293507,
+ "loss": 0.0094,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 5723571.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.014810923486948013,
+ "skip_count": 2.0,
+ "step": 3548,
+ "text_loss": 0.49558472633361816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.666862342236573,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.0007933915927882327,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5726405.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00152928801253438,
+ "skip_count": 0.0,
+ "step": 3550,
+ "text_loss": 0.8674797415733337
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.000793140907418903,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5729955.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005522782914340496,
+ "skip_count": 2.0,
+ "step": 3552,
+ "text_loss": 0.3274473249912262
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0007928901097174248,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5733030.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009207013063132763,
+ "skip_count": 2.0,
+ "step": 3554,
+ "text_loss": 0.18237128853797913
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0007926391997799039,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5735978.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00695531303063035,
+ "skip_count": 0.0,
+ "step": 3556,
+ "text_loss": 0.3266434967517853
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0007923881777024898,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5738901.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002743212040513754,
+ "skip_count": 1.0,
+ "step": 3558,
+ "text_loss": 0.4971913695335388
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.0007921370435813741,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5741946.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007037297356873751,
+ "skip_count": 0.0,
+ "step": 3560,
+ "text_loss": 0.5645473599433899
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0007918857975127924,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5744987.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030746585689485073,
+ "skip_count": 0.0,
+ "step": 3562,
+ "text_loss": 0.17717665433883667
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0007916344395930224,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5747837.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004522138275206089,
+ "skip_count": 0.0,
+ "step": 3564,
+ "text_loss": 0.7676118612289429
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.000791382969918385,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5750716.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026240211445838213,
+ "skip_count": 0.0,
+ "step": 3566,
+ "text_loss": 0.4975173771381378
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.751394188435572,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.000791131388585244,
+ "loss": 0.011,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 5754368.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.021831991150975227,
+ "skip_count": 2.0,
+ "step": 3568,
+ "text_loss": 0.9670342206954956
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0007908796956900055,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5757076.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017586691537871957,
+ "skip_count": 0.0,
+ "step": 3570,
+ "text_loss": 0.3057977259159088
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.000790627891329119,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5760613.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005515786819159985,
+ "skip_count": 0.0,
+ "step": 3572,
+ "text_loss": 0.5860086679458618
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0007903759755990763,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5763557.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004096484277397394,
+ "skip_count": 0.0,
+ "step": 3574,
+ "text_loss": 0.17175781726837158
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.000790123948596412,
+ "loss": 0.0119,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5767430.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005216122139245272,
+ "skip_count": 0.0,
+ "step": 3576,
+ "text_loss": 0.7520374059677124
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07177734375,
+ "learning_rate": 0.0007898718104177031,
+ "loss": 0.0108,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5770175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037980107590556145,
+ "skip_count": 0.0,
+ "step": 3578,
+ "text_loss": 0.18117885291576385
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0007896195611595699,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5773032.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003672175807878375,
+ "skip_count": 2.0,
+ "step": 3580,
+ "text_loss": 0.7241058349609375
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.817141179923688,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0007893672009186744,
+ "loss": 0.0083,
+ "macro_f1": 1.0,
+ "num_tokens": 5776077.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01229850109666586,
+ "skip_count": 3.0,
+ "step": 3582,
+ "text_loss": 0.29140418767929077
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0007891147297917216,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5779088.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0035251814406365156,
+ "skip_count": 0.0,
+ "step": 3584,
+ "text_loss": 0.1727485954761505
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.055908203125,
+ "learning_rate": 0.000788862147875459,
+ "loss": 0.0094,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5782201.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004725661128759384,
+ "skip_count": 2.0,
+ "step": 3586,
+ "text_loss": 0.43512848019599915
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0007886094552666765,
+ "loss": 0.0106,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5785039.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005632172804325819,
+ "skip_count": 0.0,
+ "step": 3588,
+ "text_loss": 0.3534786105155945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0556640625,
+ "learning_rate": 0.0007883566520622062,
+ "loss": 0.0109,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5788017.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006249965168535709,
+ "skip_count": 1.0,
+ "step": 3590,
+ "text_loss": 0.2089710384607315
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0007881037383589229,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5791168.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013797614956274629,
+ "skip_count": 0.0,
+ "step": 3592,
+ "text_loss": 0.4349329471588135
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06982421875,
+ "learning_rate": 0.0007878507142537436,
+ "loss": 0.0091,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5793927.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019719740375876427,
+ "skip_count": 1.0,
+ "step": 3594,
+ "text_loss": 0.6087368726730347
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0007875975798436274,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5797214.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037070370744913816,
+ "skip_count": 0.0,
+ "step": 3596,
+ "text_loss": 0.4258122444152832
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048583984375,
+ "learning_rate": 0.0007873443352255764,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5800691.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008431311696767807,
+ "skip_count": 0.0,
+ "step": 3598,
+ "text_loss": 0.6006711721420288
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.0007870909804966337,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5804712.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017720256000757217,
+ "skip_count": 0.0,
+ "step": 3600,
+ "text_loss": 0.6055042743682861
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.911065453478134,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.0007868375157538861,
+ "loss": 0.0083,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5807670.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010697763413190842,
+ "skip_count": 0.0,
+ "step": 3602,
+ "text_loss": 0.8039056658744812
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0007865839410944611,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5810880.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030022128485143185,
+ "skip_count": 0.0,
+ "step": 3604,
+ "text_loss": 0.596110463142395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 16.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0007863302566155295,
+ "loss": 0.0093,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5814171.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006257854867726564,
+ "skip_count": 2.0,
+ "step": 3606,
+ "text_loss": 0.5700319409370422
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.0007860764624143031,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5817607.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004838473163545132,
+ "skip_count": 0.0,
+ "step": 3608,
+ "text_loss": 0.8319530487060547
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 16.94863516289991,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.08154296875,
+ "learning_rate": 0.0007858225585880369,
+ "loss": 0.0067,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 5821452.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02173662930727005,
+ "skip_count": 2.0,
+ "step": 3610,
+ "text_loss": 0.3738477826118469
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007855685452340269,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5824683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032719180453568697,
+ "skip_count": 0.0,
+ "step": 3612,
+ "text_loss": 0.4054839015007019
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.967420017610802,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0007853144224496118,
+ "loss": 0.0093,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5827860.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.032171256840229034,
+ "skip_count": 0.0,
+ "step": 3614,
+ "text_loss": 0.18112395703792572
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 16.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 0.0007850601903321716,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5831651.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013230946846306324,
+ "skip_count": 1.0,
+ "step": 3616,
+ "text_loss": 0.2698844075202942
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 16.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.000784805848979129,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5834369.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00162619655020535,
+ "skip_count": 0.0,
+ "step": 3618,
+ "text_loss": 0.2430931180715561
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 16.995597299677137,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.0007845513984879477,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5838102.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002781603019684553,
+ "skip_count": 0.0,
+ "step": 3620,
+ "text_loss": 0.4968300759792328
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0007842968389561337,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5841029.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023873315658420324,
+ "skip_count": 0.0,
+ "step": 3622,
+ "text_loss": 0.5842974781990051
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.014088641033165,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0007840421704812346,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5845158.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00400173757225275,
+ "skip_count": 1.0,
+ "step": 3624,
+ "text_loss": 0.8312450647354126
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.00078378739316084,
+ "loss": 0.0094,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5849175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004974664188921452,
+ "skip_count": 0.0,
+ "step": 3626,
+ "text_loss": 0.48637253046035767
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 25.0,
+ "epoch": 17.032873495744056,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.10693359375,
+ "learning_rate": 0.000783532507092581,
+ "loss": 0.0079,
+ "macro_f1": 0.9555556178092957,
+ "num_tokens": 5852020.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02555239573121071,
+ "skip_count": 5.0,
+ "step": 3628,
+ "text_loss": 0.5407033562660217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0007832775123741306,
+ "loss": 0.0106,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5854873.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025962977670133114,
+ "skip_count": 0.0,
+ "step": 3630,
+ "text_loss": 0.618230938911438
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.051658350454947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.000783022409103203,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5858086.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029271875973790884,
+ "skip_count": 0.0,
+ "step": 3632,
+ "text_loss": 0.21259798109531403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0007827671973775542,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5860886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004102068953216076,
+ "skip_count": 0.0,
+ "step": 3634,
+ "text_loss": 0.4991208016872406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.070443205165834,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0007825118772949819,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5864291.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023497689981013536,
+ "skip_count": 1.0,
+ "step": 3636,
+ "text_loss": 0.3878401517868042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.0007822564489533255,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5867155.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007680345326662064,
+ "skip_count": 2.0,
+ "step": 3638,
+ "text_loss": 0.6132124066352844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.053466796875,
+ "learning_rate": 0.0007820009124504653,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5870325.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008242831099778414,
+ "skip_count": 0.0,
+ "step": 3640,
+ "text_loss": 0.3552473187446594
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.098620487232168,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0007817452678843236,
+ "loss": 0.0073,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 5873301.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023831043392419815,
+ "skip_count": 2.0,
+ "step": 3642,
+ "text_loss": 0.18363867700099945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.108012914587615,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.0007814895153528635,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5876225.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001999989850446582,
+ "skip_count": 0.0,
+ "step": 3644,
+ "text_loss": 0.17581747472286224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.11740534194306,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0007812336549540903,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5879501.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001098626758903265,
+ "skip_count": 0.0,
+ "step": 3646,
+ "text_loss": 0.5040884613990784
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.126797769298502,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0007809776867860499,
+ "loss": 0.005,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5882608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012210183776915073,
+ "skip_count": 1.0,
+ "step": 3648,
+ "text_loss": 0.27114811539649963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00078072161094683,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5886106.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005191771313548088,
+ "skip_count": 2.0,
+ "step": 3650,
+ "text_loss": 0.5167917609214783
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0007804654275345591,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5889122.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016411367105320096,
+ "skip_count": 1.0,
+ "step": 3652,
+ "text_loss": 0.7691274285316467
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 17.154975051364836,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0007802091366474074,
+ "loss": 0.005,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 5892313.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.015627093613147736,
+ "skip_count": 1.0,
+ "step": 3654,
+ "text_loss": 0.4646325409412384
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0007799527383835858,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5895577.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009879748104140162,
+ "skip_count": 0.0,
+ "step": 3656,
+ "text_loss": 0.5587969422340393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0986328125,
+ "learning_rate": 0.0007796962328413469,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5898546.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004864919930696487,
+ "skip_count": 0.0,
+ "step": 3658,
+ "text_loss": 0.6981375813484192
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0007794396201189839,
+ "loss": 0.0078,
+ "macro_f1": 1.0,
+ "num_tokens": 5901618.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006617432460188866,
+ "skip_count": 2.0,
+ "step": 3660,
+ "text_loss": 0.22521957755088806
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.192544760786618,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0007791829003148312,
+ "loss": 0.0098,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 5904540.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0782252699136734,
+ "skip_count": 2.0,
+ "step": 3662,
+ "text_loss": 0.2649642825126648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 0.0007789260735272647,
+ "loss": 0.0114,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5907827.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012057392159476876,
+ "skip_count": 0.0,
+ "step": 3664,
+ "text_loss": 0.6943771243095398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 0.0007786691398547005,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5911163.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007476957980543375,
+ "skip_count": 2.0,
+ "step": 3666,
+ "text_loss": 0.1502683162689209
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 17.220722042852948,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0007784120993955962,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5913948.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004082011990249157,
+ "skip_count": 0.0,
+ "step": 3668,
+ "text_loss": 0.4127517640590668
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 17.230114470208395,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0007781549522484503,
+ "loss": 0.0066,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 5917360.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.027505695819854736,
+ "skip_count": 1.0,
+ "step": 3670,
+ "text_loss": 0.23892618715763092
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0007778976985118018,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5920524.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024977331049740314,
+ "skip_count": 2.0,
+ "step": 3672,
+ "text_loss": 0.5076471567153931
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.248899324919282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0007776403382842312,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5923632.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015700991498306394,
+ "skip_count": 0.0,
+ "step": 3674,
+ "text_loss": 0.6287924647331238
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.25829175227473,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05810546875,
+ "learning_rate": 0.0007773828716643591,
+ "loss": 0.0085,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5926438.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05108916014432907,
+ "skip_count": 0.0,
+ "step": 3676,
+ "text_loss": 0.26517006754875183
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0007771252987508474,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5930081.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003439917229115963,
+ "skip_count": 0.0,
+ "step": 3678,
+ "text_loss": 0.5189079642295837
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 17.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056884765625,
+ "learning_rate": 0.0007768676196423984,
+ "loss": 0.0064,
+ "macro_f1": 1.0,
+ "num_tokens": 5933463.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001935846172273159,
+ "skip_count": 1.0,
+ "step": 3680,
+ "text_loss": 0.6703575849533081
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 17.286469034341064,
+ "f1_execute": 0.9433962106704712,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007766098344377553,
+ "loss": 0.0082,
+ "macro_f1": 0.31446540355682373,
+ "num_tokens": 5937098.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0384826585650444,
+ "skip_count": 2.0,
+ "step": 3682,
+ "text_loss": 0.6424444913864136
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.0007763519432357018,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5940436.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008654671837575734,
+ "skip_count": 0.0,
+ "step": 3684,
+ "text_loss": 0.4189988672733307
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.0007760939461350623,
+ "loss": 0.0111,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5943731.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007468715775758028,
+ "skip_count": 2.0,
+ "step": 3686,
+ "text_loss": 0.2875453233718872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.314646316407398,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0007758358432347019,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5946707.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001252831774763763,
+ "skip_count": 0.0,
+ "step": 3688,
+ "text_loss": 0.5093055367469788
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0007755776346335259,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5949833.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001680848654359579,
+ "skip_count": 0.0,
+ "step": 3690,
+ "text_loss": 0.4031114876270294
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.0007753193204304807,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5953095.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0047258250415325165,
+ "skip_count": 2.0,
+ "step": 3692,
+ "text_loss": 0.17632785439491272
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0007750609007245524,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 5955971.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.001980359200388193,
+ "skip_count": 4.0,
+ "step": 3694,
+ "text_loss": 0.3423727750778198
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0007748023756147679,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5958948.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00511702848598361,
+ "skip_count": 0.0,
+ "step": 3696,
+ "text_loss": 0.28279972076416016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0007745437452001949,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5961819.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005220443126745522,
+ "skip_count": 0.0,
+ "step": 3698,
+ "text_loss": 0.4793325662612915
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.371000880540066,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0007742850095799408,
+ "loss": 0.0084,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 5964625.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06411020457744598,
+ "skip_count": 0.0,
+ "step": 3700,
+ "text_loss": 0.2825184464454651
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 17.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0751953125,
+ "learning_rate": 0.0007740261688531536,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5967134.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004408109001815319,
+ "skip_count": 3.0,
+ "step": 3702,
+ "text_loss": 0.690429151058197
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.0007737672231190215,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5969831.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006747521692886949,
+ "skip_count": 0.0,
+ "step": 3704,
+ "text_loss": 0.32556024193763733
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.399178162606397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0007735081724767732,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5973015.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020414739847183228,
+ "skip_count": 0.0,
+ "step": 3706,
+ "text_loss": 0.5876469612121582
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 17.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.072265625,
+ "learning_rate": 0.0007732490170256769,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5975778.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005610425490885973,
+ "skip_count": 0.0,
+ "step": 3708,
+ "text_loss": 0.2968577444553375
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0007729897568650422,
+ "loss": 0.0097,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5979115.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001248046406544745,
+ "skip_count": 0.0,
+ "step": 3710,
+ "text_loss": 0.626361608505249
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06787109375,
+ "learning_rate": 0.0007727303920942176,
+ "loss": 0.0102,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5982213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005791695322841406,
+ "skip_count": 2.0,
+ "step": 3712,
+ "text_loss": 0.4133484661579132
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 17.436747872028178,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.08740234375,
+ "learning_rate": 0.0007724709228125922,
+ "loss": 0.0105,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 5984930.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02114664763212204,
+ "skip_count": 2.0,
+ "step": 3714,
+ "text_loss": 0.4646461308002472
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 17.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0007722113491195952,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 5988017.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005913930479437113,
+ "skip_count": 5.0,
+ "step": 3716,
+ "text_loss": 0.15474505722522736
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0007719516711146957,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5991562.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0075925313867628574,
+ "skip_count": 2.0,
+ "step": 3718,
+ "text_loss": 0.5293686985969543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.464925154094512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.000771691888897403,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 5994675.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012335237115621567,
+ "skip_count": 0.0,
+ "step": 3720,
+ "text_loss": 0.5210637450218201
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0771484375,
+ "learning_rate": 0.0007714320025672657,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 5999070.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010582062415778637,
+ "skip_count": 2.0,
+ "step": 3722,
+ "text_loss": 0.2783571779727936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 17.4837100088054,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.000771172012223873,
+ "loss": 0.0078,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 6002702.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015008784830570221,
+ "skip_count": 3.0,
+ "step": 3724,
+ "text_loss": 0.358705073595047
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052734375,
+ "learning_rate": 0.0007709119179668538,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6005517.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00111615180503577,
+ "skip_count": 0.0,
+ "step": 3726,
+ "text_loss": 0.45202162861824036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 17.50249486351629,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0007706517198958764,
+ "loss": 0.0096,
+ "macro_f1": 0.6595745086669922,
+ "num_tokens": 6009111.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05215252563357353,
+ "skip_count": 4.0,
+ "step": 3728,
+ "text_loss": 0.20360413193702698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 17.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0007703914181106497,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6012989.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010039499960839748,
+ "skip_count": 3.0,
+ "step": 3730,
+ "text_loss": 0.20334361493587494
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.52127971822718,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.0007701310127109211,
+ "loss": 0.0062,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6016420.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01090205181390047,
+ "skip_count": 1.0,
+ "step": 3732,
+ "text_loss": 0.47959551215171814
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 24.0,
+ "epoch": 17.530672145582624,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0007698705037964791,
+ "loss": 0.0076,
+ "macro_f1": 0.6225374937057495,
+ "num_tokens": 6019551.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02677762135863304,
+ "skip_count": 5.0,
+ "step": 3734,
+ "text_loss": 0.2621438801288605
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 17.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056640625,
+ "learning_rate": 0.000769609891467151,
+ "loss": 0.0119,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6022262.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00460716662928462,
+ "skip_count": 0.0,
+ "step": 3736,
+ "text_loss": 0.3433022201061249
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0007693491758228037,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6025723.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036111194640398026,
+ "skip_count": 2.0,
+ "step": 3738,
+ "text_loss": 0.38703784346580505
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0007690883569633442,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6028652.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003299296135082841,
+ "skip_count": 0.0,
+ "step": 3740,
+ "text_loss": 0.24203069508075714
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0007688274349887188,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6032280.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003173880511894822,
+ "skip_count": 0.0,
+ "step": 3742,
+ "text_loss": 0.2827291488647461
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.57763428235985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0007685664099989131,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6035111.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008576177642680705,
+ "skip_count": 0.0,
+ "step": 3744,
+ "text_loss": 0.43613526225090027
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0007683052820939524,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6038428.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004335585981607437,
+ "skip_count": 2.0,
+ "step": 3746,
+ "text_loss": 1.0385624170303345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0007680440513739015,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6041185.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008210531086660922,
+ "skip_count": 0.0,
+ "step": 3748,
+ "text_loss": 0.7070431709289551
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 17.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056640625,
+ "learning_rate": 0.0007677827179388646,
+ "loss": 0.0089,
+ "macro_f1": 1.0,
+ "num_tokens": 6046333.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003778942162171006,
+ "skip_count": 1.0,
+ "step": 3750,
+ "text_loss": 0.3682238757610321
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 17.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08984375,
+ "learning_rate": 0.000767521281888985,
+ "loss": 0.009,
+ "macro_f1": 1.0,
+ "num_tokens": 6049528.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002767334459349513,
+ "skip_count": 1.0,
+ "step": 3752,
+ "text_loss": 0.7619418501853943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0007672597433244455,
+ "loss": 0.0108,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6053202.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004796457476913929,
+ "skip_count": 2.0,
+ "step": 3754,
+ "text_loss": 0.4157083034515381
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0007669981023454682,
+ "loss": 0.0126,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6056609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013067846884950995,
+ "skip_count": 0.0,
+ "step": 3756,
+ "text_loss": 0.4529118537902832
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0007667363590523142,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6060504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010285493917763233,
+ "skip_count": 0.0,
+ "step": 3758,
+ "text_loss": 0.8363246321678162
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 17.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.0007664745135452844,
+ "loss": 0.0092,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6063526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006289863493293524,
+ "skip_count": 3.0,
+ "step": 3760,
+ "text_loss": 0.5313657522201538
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.662166128558848,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05517578125,
+ "learning_rate": 0.0007662125659247183,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6067147.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028537956532090902,
+ "skip_count": 0.0,
+ "step": 3762,
+ "text_loss": 0.5668109059333801
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0007659505162909949,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6070350.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026814753655344248,
+ "skip_count": 0.0,
+ "step": 3764,
+ "text_loss": 0.4983512759208679
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056884765625,
+ "learning_rate": 0.0007656883647445318,
+ "loss": 0.0099,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6073091.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005981382913887501,
+ "skip_count": 1.0,
+ "step": 3766,
+ "text_loss": 0.30372318625450134
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0007654261113857863,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6076244.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000803640519734472,
+ "skip_count": 0.0,
+ "step": 3768,
+ "text_loss": 0.6100738048553467
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0007651637563152539,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6078936.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013324898900464177,
+ "skip_count": 0.0,
+ "step": 3770,
+ "text_loss": 0.4733821153640747
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 17.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0007649012996334701,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6081951.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0021543330512940884,
+ "skip_count": 0.0,
+ "step": 3772,
+ "text_loss": 0.6794875860214233
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0007646387414410085,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6085165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005426189745776355,
+ "skip_count": 0.0,
+ "step": 3774,
+ "text_loss": 0.5886107683181763
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0007643760818384819,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6088370.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002537576947361231,
+ "skip_count": 0.0,
+ "step": 3776,
+ "text_loss": 0.23591920733451843
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0007641133209265423,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6092319.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002613696036860347,
+ "skip_count": 0.0,
+ "step": 3778,
+ "text_loss": 0.3217754662036896
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 0.0007638504588058796,
+ "loss": 0.0105,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6095799.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007219464750960469,
+ "skip_count": 0.0,
+ "step": 3780,
+ "text_loss": 0.4276983141899109
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 17.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0007635874955772234,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6098789.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005965052172541618,
+ "skip_count": 3.0,
+ "step": 3782,
+ "text_loss": 0.30936646461486816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07177734375,
+ "learning_rate": 0.0007633244313413417,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6101631.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007469559786841273,
+ "skip_count": 0.0,
+ "step": 3784,
+ "text_loss": 0.44460123777389526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0007630612661990412,
+ "loss": 0.0097,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6105097.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004300760570913553,
+ "skip_count": 1.0,
+ "step": 3786,
+ "text_loss": 0.41950157284736633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.784267684179632,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0007627980002511672,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6107847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023050960153341293,
+ "skip_count": 1.0,
+ "step": 3788,
+ "text_loss": 0.48561373353004456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0007625346335986039,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6110546.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018124044872820377,
+ "skip_count": 0.0,
+ "step": 3790,
+ "text_loss": 0.20882295072078705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0007622711663422735,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6113600.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007613401976414025,
+ "skip_count": 0.0,
+ "step": 3792,
+ "text_loss": 0.31751760840415955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.812444966245963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0007620075985831375,
+ "loss": 0.0092,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6116916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005452962126582861,
+ "skip_count": 2.0,
+ "step": 3794,
+ "text_loss": 0.3246645927429199
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 17.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0007617439304221956,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6120056.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0043787881731987,
+ "skip_count": 0.0,
+ "step": 3796,
+ "text_loss": 0.4859195947647095
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0007614801619604856,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6122668.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033891722559928894,
+ "skip_count": 0.0,
+ "step": 3798,
+ "text_loss": 0.48194369673728943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.840622248312297,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.0007612162932990845,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6126792.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001883238204754889,
+ "skip_count": 0.0,
+ "step": 3800,
+ "text_loss": 0.3740062117576599
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0007609523245391068,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6129801.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00882677361369133,
+ "skip_count": 2.0,
+ "step": 3802,
+ "text_loss": 0.5759486556053162
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007606882557817062,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6133613.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009537030011415482,
+ "skip_count": 2.0,
+ "step": 3804,
+ "text_loss": 0.3217554986476898
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.0007604240871280742,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6137784.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023913346230983734,
+ "skip_count": 0.0,
+ "step": 3806,
+ "text_loss": 0.3718445599079132
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.878191957734078,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0007601598186794407,
+ "loss": 0.0081,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 6141356.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.033796411007642746,
+ "skip_count": 1.0,
+ "step": 3808,
+ "text_loss": 0.2717749774456024
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.000759895450537074,
+ "loss": 0.01,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6144448.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037919918540865183,
+ "skip_count": 2.0,
+ "step": 3810,
+ "text_loss": 0.5935076475143433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0007596309828022803,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6147526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008182782912626863,
+ "skip_count": 0.0,
+ "step": 3812,
+ "text_loss": 0.449336439371109
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 17.906369239800412,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0007593664155764044,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6150620.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001734903547912836,
+ "skip_count": 0.0,
+ "step": 3814,
+ "text_loss": 0.6647221446037292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.915761667155856,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0007591017489608286,
+ "loss": 0.0088,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6153714.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04721754416823387,
+ "skip_count": 0.0,
+ "step": 3816,
+ "text_loss": 0.25481200218200684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0007588369830569738,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6156974.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002484306460246444,
+ "skip_count": 0.0,
+ "step": 3818,
+ "text_loss": 0.7195295691490173
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.934546521866746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0007585721179662988,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6159660.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0051363613456487656,
+ "skip_count": 2.0,
+ "step": 3820,
+ "text_loss": 0.5073586702346802
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052734375,
+ "learning_rate": 0.0007583071537903005,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6163146.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006719176657497883,
+ "skip_count": 0.0,
+ "step": 3822,
+ "text_loss": 0.6950558423995972
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 17.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0007580420906305136,
+ "loss": 0.0073,
+ "macro_f1": 1.0,
+ "num_tokens": 6166257.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00871267355978489,
+ "skip_count": 3.0,
+ "step": 3824,
+ "text_loss": 0.2549148201942444
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.0007577769285885109,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6169624.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015642556827515364,
+ "skip_count": 0.0,
+ "step": 3826,
+ "text_loss": 0.3720305860042572
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0007575116677659029,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6172673.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011551049537956715,
+ "skip_count": 0.0,
+ "step": 3828,
+ "text_loss": 0.6819429397583008
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 17.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0007572463082643377,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6175414.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008922060951590538,
+ "skip_count": 0.0,
+ "step": 3830,
+ "text_loss": 0.5424665212631226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 17.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0007569808501855023,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6178701.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004167596809566021,
+ "skip_count": 1.0,
+ "step": 3832,
+ "text_loss": 0.4429764151573181
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.00075671529363112,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6183036.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008732969872653484,
+ "skip_count": 0.0,
+ "step": 3834,
+ "text_loss": 0.8015334010124207
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0007564496387029531,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6186325.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021374202333390713,
+ "skip_count": 1.0,
+ "step": 3836,
+ "text_loss": 0.4233771562576294
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.000756183885502801,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6189919.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004017227329313755,
+ "skip_count": 0.0,
+ "step": 3838,
+ "text_loss": 0.33691394329071045
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 0.0007559180341325005,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6193412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013120946241542697,
+ "skip_count": 0.0,
+ "step": 3840,
+ "text_loss": 0.14970099925994873
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 18.037569709421778,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0007556520846939265,
+ "loss": 0.0061,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 6196588.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011793316341936588,
+ "skip_count": 2.0,
+ "step": 3842,
+ "text_loss": 0.2714047133922577
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 18.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0007553860372889914,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 6200841.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.019968654960393906,
+ "skip_count": 4.0,
+ "step": 3844,
+ "text_loss": 0.23680976033210754
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 18.05635456413267,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 0.0007551198920196452,
+ "loss": 0.0079,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 6203797.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013615630567073822,
+ "skip_count": 2.0,
+ "step": 3846,
+ "text_loss": 0.25839608907699585
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0546875,
+ "learning_rate": 0.000754853648987875,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6206790.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002420815173536539,
+ "skip_count": 1.0,
+ "step": 3848,
+ "text_loss": 0.5358025431632996
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 18.07513941884356,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0007545873082957057,
+ "loss": 0.0072,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 6209791.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.018236197531223297,
+ "skip_count": 3.0,
+ "step": 3850,
+ "text_loss": 0.1463700383901596
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 18.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0007543208700451998,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6212792.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006242573726922274,
+ "skip_count": 3.0,
+ "step": 3852,
+ "text_loss": 0.9441591501235962
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.093924273554446,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0007540543343384565,
+ "loss": 0.0062,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6215747.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01451140083372593,
+ "skip_count": 1.0,
+ "step": 3854,
+ "text_loss": 0.41610902547836304
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0007537877012776132,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6218593.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00037674361374229193,
+ "skip_count": 0.0,
+ "step": 3856,
+ "text_loss": 0.6048852205276489
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.0007535209709648439,
+ "loss": 0.0045,
+ "macro_f1": 1.0,
+ "num_tokens": 6221315.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005776284262537956,
+ "skip_count": 3.0,
+ "step": 3858,
+ "text_loss": 0.35627537965774536
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.0007532541435023605,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6225012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009280376834794879,
+ "skip_count": 0.0,
+ "step": 3860,
+ "text_loss": 0.6440183520317078
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0007529872189924114,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6227650.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009876530384644866,
+ "skip_count": 0.0,
+ "step": 3862,
+ "text_loss": 0.35507893562316895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.14088641033167,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0007527201975372827,
+ "loss": 0.0045,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 6230557.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013780162669718266,
+ "skip_count": 1.0,
+ "step": 3864,
+ "text_loss": 0.38958442211151123
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 18.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0007524530792392977,
+ "loss": 0.011,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6233371.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004849869292229414,
+ "skip_count": 3.0,
+ "step": 3866,
+ "text_loss": 0.3826720714569092
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.0007521858642008163,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6236770.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008618295192718506,
+ "skip_count": 1.0,
+ "step": 3868,
+ "text_loss": 0.3596078157424927
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0007519185525242363,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6239661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013421972980722785,
+ "skip_count": 0.0,
+ "step": 3870,
+ "text_loss": 0.5585550665855408
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0007516511443119916,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6242459.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038009448908269405,
+ "skip_count": 1.0,
+ "step": 3872,
+ "text_loss": 0.4418395757675171
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.187848547108892,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0007513836396665534,
+ "loss": 0.0061,
+ "macro_f1": 1.0,
+ "num_tokens": 6245489.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002785376040264964,
+ "skip_count": 2.0,
+ "step": 3874,
+ "text_loss": 0.551510751247406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.19724097446434,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0007511160386904305,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6249014.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021424589212983847,
+ "skip_count": 1.0,
+ "step": 3876,
+ "text_loss": 1.0502676963806152
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0007508483414861679,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6252357.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0085759861394763,
+ "skip_count": 1.0,
+ "step": 3878,
+ "text_loss": 0.49212515354156494
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.216025829175226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0007505805481563477,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6254975.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010723904706537724,
+ "skip_count": 0.0,
+ "step": 3880,
+ "text_loss": 0.7022985816001892
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.0007503126588035887,
+ "loss": 0.0081,
+ "macro_f1": 1.0,
+ "num_tokens": 6258001.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012809890322387218,
+ "skip_count": 2.0,
+ "step": 3882,
+ "text_loss": 0.1829151213169098
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.0007500446735305466,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6261795.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026790346018970013,
+ "skip_count": 1.0,
+ "step": 3884,
+ "text_loss": 0.20436066389083862
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.000749776592439914,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 6265585.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005243788007646799,
+ "skip_count": 2.0,
+ "step": 3886,
+ "text_loss": 0.4479229748249054
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.00074950841563442,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6269039.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007998534478247166,
+ "skip_count": 1.0,
+ "step": 3888,
+ "text_loss": 0.2154676914215088
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0007492401432168303,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6272315.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004648822825402021,
+ "skip_count": 1.0,
+ "step": 3890,
+ "text_loss": 0.3375042676925659
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.272380393307895,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0007489717752899477,
+ "loss": 0.0094,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6275342.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012154200114309788,
+ "skip_count": 1.0,
+ "step": 3892,
+ "text_loss": 0.1964082419872284
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.000748703311956611,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 6278700.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004610476549714804,
+ "skip_count": 2.0,
+ "step": 3894,
+ "text_loss": 0.26545581221580505
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 0.0007484347533196961,
+ "loss": 0.0105,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6281864.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0075586591847240925,
+ "skip_count": 2.0,
+ "step": 3896,
+ "text_loss": 0.3106999397277832
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.0007481660994821151,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6284676.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007845268584787846,
+ "skip_count": 1.0,
+ "step": 3898,
+ "text_loss": 0.4094304144382477
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.309950102729672,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0007478973505468165,
+ "loss": 0.0081,
+ "macro_f1": 1.0,
+ "num_tokens": 6287470.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011116391979157925,
+ "skip_count": 2.0,
+ "step": 3900,
+ "text_loss": 0.1838909536600113
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0007476285066167857,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6290432.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004599364474415779,
+ "skip_count": 0.0,
+ "step": 3902,
+ "text_loss": 0.25872838497161865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.0007473595677950439,
+ "loss": 0.0109,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6293557.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016367282951250672,
+ "skip_count": 1.0,
+ "step": 3904,
+ "text_loss": 0.5272360444068909
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0007470905341846492,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6295979.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004760588926728815,
+ "skip_count": 0.0,
+ "step": 3906,
+ "text_loss": 0.666959822177887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007468214058886956,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6299215.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000524883100297302,
+ "skip_count": 0.0,
+ "step": 3908,
+ "text_loss": 0.5144801139831543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0007465521830103137,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6302320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016085522947832942,
+ "skip_count": 0.0,
+ "step": 3910,
+ "text_loss": 0.14342890679836273
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0007462828656526702,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6305212.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002720315707847476,
+ "skip_count": 2.0,
+ "step": 3912,
+ "text_loss": 0.31109121441841125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.375697094217788,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.0007460134539189681,
+ "loss": 0.0114,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6308964.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010418406454846263,
+ "skip_count": 1.0,
+ "step": 3914,
+ "text_loss": 0.5662030577659607
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.38508952157323,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.0007457439479124459,
+ "loss": 0.0134,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6313195.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020303844939917326,
+ "skip_count": 0.0,
+ "step": 3916,
+ "text_loss": 0.6358339190483093
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.394481948928675,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0007454743477363797,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6315949.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006592223653569818,
+ "skip_count": 0.0,
+ "step": 3918,
+ "text_loss": 0.35648423433303833
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.403874376284122,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0007452046534940803,
+ "loss": 0.0075,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 6319024.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.024555351585149765,
+ "skip_count": 1.0,
+ "step": 3920,
+ "text_loss": 0.21955153346061707
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0007449348652888952,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6321633.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003606822807341814,
+ "skip_count": 1.0,
+ "step": 3922,
+ "text_loss": 0.6079489588737488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0007446649832242075,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6325209.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035831446293741465,
+ "skip_count": 1.0,
+ "step": 3924,
+ "text_loss": 0.2774808406829834
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.432051658350456,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0007443950074034368,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6327822.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006809544749557972,
+ "skip_count": 2.0,
+ "step": 3926,
+ "text_loss": 0.48236769437789917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.4414440857059,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0007441249379300381,
+ "loss": 0.007,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 6331662.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023832591250538826,
+ "skip_count": 2.0,
+ "step": 3928,
+ "text_loss": 0.7287537455558777
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0007438547749075028,
+ "loss": 0.0061,
+ "macro_f1": 1.0,
+ "num_tokens": 6335801.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011755098588764668,
+ "skip_count": 3.0,
+ "step": 3930,
+ "text_loss": 0.17253030836582184
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0007435845184393577,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6338747.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005972472485154867,
+ "skip_count": 0.0,
+ "step": 3932,
+ "text_loss": 0.6400216817855835
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0007433141686291657,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6342772.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030393085908144712,
+ "skip_count": 1.0,
+ "step": 3934,
+ "text_loss": 0.6865074038505554
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.0007430437255805252,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6345957.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006984061910770833,
+ "skip_count": 0.0,
+ "step": 3936,
+ "text_loss": 0.40398702025413513
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.488406222483125,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.0007427731893970706,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6349162.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005219762213528156,
+ "skip_count": 0.0,
+ "step": 3938,
+ "text_loss": 0.5951031446456909
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 18.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0007425025601824717,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6352655.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015575960278511047,
+ "skip_count": 3.0,
+ "step": 3940,
+ "text_loss": 0.26689088344573975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0007422318380404346,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6355890.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012208883417770267,
+ "skip_count": 0.0,
+ "step": 3942,
+ "text_loss": 0.570725679397583
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0007419610230746999,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6358891.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0029412026051431894,
+ "skip_count": 0.0,
+ "step": 3944,
+ "text_loss": 0.5521301031112671
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0007416901153890448,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6361586.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010283910669386387,
+ "skip_count": 0.0,
+ "step": 3946,
+ "text_loss": 0.4046417772769928
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0007414191150872818,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6364954.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008222512900829315,
+ "skip_count": 2.0,
+ "step": 3948,
+ "text_loss": 0.2803446352481842
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0007411480222732583,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6367660.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001304348581470549,
+ "skip_count": 0.0,
+ "step": 3950,
+ "text_loss": 0.45553359389305115
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0007408768370508576,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6371585.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016345062758773565,
+ "skip_count": 0.0,
+ "step": 3952,
+ "text_loss": 0.25424402952194214
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0007406055595239986,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6374365.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005097290268167853,
+ "skip_count": 0.0,
+ "step": 3954,
+ "text_loss": 0.5856026411056519
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.060546875,
+ "learning_rate": 0.0007403341897966356,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6377335.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002482263371348381,
+ "skip_count": 1.0,
+ "step": 3956,
+ "text_loss": 0.5145615339279175
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.0007400627279727574,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6380799.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011743451468646526,
+ "skip_count": 0.0,
+ "step": 3958,
+ "text_loss": 0.31868961453437805
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.0007397911741563892,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6383963.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009861881844699383,
+ "skip_count": 0.0,
+ "step": 3960,
+ "text_loss": 0.21192194521427155
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.601115350748458,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0007395195284515905,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6387410.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004189098719507456,
+ "skip_count": 0.0,
+ "step": 3962,
+ "text_loss": 0.5809708833694458
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.610507778103905,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0007392477909624567,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6390670.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001853612600825727,
+ "skip_count": 0.0,
+ "step": 3964,
+ "text_loss": 0.48985618352890015
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.61990020545935,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.0007389759617931182,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6393609.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003303771372884512,
+ "skip_count": 0.0,
+ "step": 3966,
+ "text_loss": 0.28729453682899475
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 18.629292632814792,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.10595703125,
+ "learning_rate": 0.0007387040410477404,
+ "loss": 0.0058,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 6396608.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01791577786207199,
+ "skip_count": 4.0,
+ "step": 3968,
+ "text_loss": 0.30386820435523987
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0007384320288305235,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6399793.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005771282012574375,
+ "skip_count": 0.0,
+ "step": 3970,
+ "text_loss": 0.47285011410713196
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0007381599252457037,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6403365.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003010645741596818,
+ "skip_count": 0.0,
+ "step": 3972,
+ "text_loss": 0.5313063859939575
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.000737887730397551,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6406205.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006457438692450523,
+ "skip_count": 0.0,
+ "step": 3974,
+ "text_loss": 0.2323843240737915
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.666862342236573,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0007376154443903713,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6409552.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010693981312215328,
+ "skip_count": 0.0,
+ "step": 3976,
+ "text_loss": 0.6304101943969727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.676254769592017,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0007373430673285051,
+ "loss": 0.008,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6412386.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03116440214216709,
+ "skip_count": 0.0,
+ "step": 3978,
+ "text_loss": 0.23448467254638672
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.68564719694746,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10009765625,
+ "learning_rate": 0.0007370705993163278,
+ "loss": 0.0111,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6416054.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011973714455962181,
+ "skip_count": 0.0,
+ "step": 3980,
+ "text_loss": 0.6371755599975586
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0007367980404582497,
+ "loss": 0.0105,
+ "macro_f1": 1.0,
+ "num_tokens": 6419238.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005117347463965416,
+ "skip_count": 2.0,
+ "step": 3982,
+ "text_loss": 0.19822923839092255
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.0007365253908587158,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6422122.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010648667812347412,
+ "skip_count": 0.0,
+ "step": 3984,
+ "text_loss": 0.566700279712677
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.0007362526506222058,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6425313.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005726494826376438,
+ "skip_count": 0.0,
+ "step": 3986,
+ "text_loss": 0.6568437814712524
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 18.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0007359798198532343,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6428422.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004504100419580936,
+ "skip_count": 0.0,
+ "step": 3988,
+ "text_loss": 0.598754346370697
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0007357068986563509,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6431512.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019837068393826485,
+ "skip_count": 1.0,
+ "step": 3990,
+ "text_loss": 0.7152895927429199
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0007354338871361393,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6434358.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026031541638076305,
+ "skip_count": 1.0,
+ "step": 3992,
+ "text_loss": 0.4986513555049896
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.751394188435572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.000735160785397218,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6438175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024831905029714108,
+ "skip_count": 2.0,
+ "step": 3994,
+ "text_loss": 0.4406205713748932
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007348875935442401,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6441228.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008635876583866775,
+ "skip_count": 0.0,
+ "step": 3996,
+ "text_loss": 0.48884135484695435
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0007346143116818932,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6444318.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004007008858025074,
+ "skip_count": 0.0,
+ "step": 3998,
+ "text_loss": 0.6669428944587708
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08203125,
+ "learning_rate": 0.0007343409399148994,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6448317.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031380734872072935,
+ "skip_count": 0.0,
+ "step": 4000,
+ "text_loss": 0.6468493938446045
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0007340674783480154,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6451673.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004996029660105705,
+ "skip_count": 0.0,
+ "step": 4002,
+ "text_loss": 0.28135430812835693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.798356325212797,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0007337939270860323,
+ "loss": 0.009,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6456372.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03784399852156639,
+ "skip_count": 0.0,
+ "step": 4004,
+ "text_loss": 0.41668644547462463
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0007335202862337753,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6459047.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011750755365937948,
+ "skip_count": 0.0,
+ "step": 4006,
+ "text_loss": 0.6853910684585571
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 18.817141179923688,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.000733246555896104,
+ "loss": 0.0062,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 6462390.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01630394533276558,
+ "skip_count": 4.0,
+ "step": 4008,
+ "text_loss": 0.7110592126846313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0007329727361779124,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6466057.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0052404399029910564,
+ "skip_count": 2.0,
+ "step": 4010,
+ "text_loss": 0.13856995105743408
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.000732698827184129,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6468878.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002138581359758973,
+ "skip_count": 0.0,
+ "step": 4012,
+ "text_loss": 0.3999565839767456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.000732424829019716,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6472364.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037466560024768114,
+ "skip_count": 0.0,
+ "step": 4014,
+ "text_loss": 0.28161346912384033
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0007321507417896699,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6475379.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010469373082742095,
+ "skip_count": 0.0,
+ "step": 4016,
+ "text_loss": 1.0490952730178833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06591796875,
+ "learning_rate": 0.0007318765655990218,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6478585.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009968385100364685,
+ "skip_count": 2.0,
+ "step": 4018,
+ "text_loss": 0.31696680188179016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 18.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0007316023005528362,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6484153.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002349073765799403,
+ "skip_count": 1.0,
+ "step": 4020,
+ "text_loss": 0.30981555581092834
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 18.8828881714118,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0007313279467562124,
+ "loss": 0.0053,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 6487029.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011854278855025768,
+ "skip_count": 4.0,
+ "step": 4022,
+ "text_loss": 0.9689550399780273
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0007310535043142829,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 6490315.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00908346101641655,
+ "skip_count": 3.0,
+ "step": 4024,
+ "text_loss": 0.1705625057220459
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0007307789733322146,
+ "loss": 0.0094,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6493921.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007360641611739993,
+ "skip_count": 0.0,
+ "step": 4026,
+ "text_loss": 0.6252996325492859
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.087890625,
+ "learning_rate": 0.0007305043539152083,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6496689.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017757206223905087,
+ "skip_count": 0.0,
+ "step": 4028,
+ "text_loss": 0.40533265471458435
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.000730229646168499,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6500090.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022657213266938925,
+ "skip_count": 0.0,
+ "step": 4030,
+ "text_loss": 0.25954708456993103
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0007299548501973548,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6503023.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021747269202023745,
+ "skip_count": 0.0,
+ "step": 4032,
+ "text_loss": 0.6223418712615967
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 18.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0007296799661070782,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6506382.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006400502752512693,
+ "skip_count": 4.0,
+ "step": 4034,
+ "text_loss": 0.6873653531074524
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.94863516289991,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0007294049940030055,
+ "loss": 0.0065,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6509194.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0197185929864645,
+ "skip_count": 1.0,
+ "step": 4036,
+ "text_loss": 0.16156800091266632
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0007291299339905059,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6512271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009541353792883456,
+ "skip_count": 0.0,
+ "step": 4038,
+ "text_loss": 0.5038442015647888
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.967420017610802,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0007288547861749838,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6516403.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008226391859352589,
+ "skip_count": 2.0,
+ "step": 4040,
+ "text_loss": 0.3706657588481903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.976812444966246,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0007285795506618758,
+ "loss": 0.0063,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6519310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017001887783408165,
+ "skip_count": 1.0,
+ "step": 4042,
+ "text_loss": 0.24296723306179047
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 18.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0007283042275566528,
+ "loss": 0.0125,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6521979.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01666323095560074,
+ "skip_count": 2.0,
+ "step": 4044,
+ "text_loss": 0.36904850602149963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 18.995597299677137,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 0.0007280288169648192,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6524976.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007593175978399813,
+ "skip_count": 0.0,
+ "step": 4046,
+ "text_loss": 0.7312731146812439
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 19.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0007277533189919127,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 6528638.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005652119871228933,
+ "skip_count": 1.0,
+ "step": 4048,
+ "text_loss": 0.23326151072978973
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.014088641033165,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.0007274777337435046,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6532193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010509157553315163,
+ "skip_count": 2.0,
+ "step": 4050,
+ "text_loss": 0.23918013274669647
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0007272020613251999,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6534994.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002153293928131461,
+ "skip_count": 0.0,
+ "step": 4052,
+ "text_loss": 0.5890526175498962
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0007269263018426367,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 6537469.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0018494052346795797,
+ "skip_count": 2.0,
+ "step": 4054,
+ "text_loss": 0.36058738827705383
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0007266504554014866,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6541271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007579320226795971,
+ "skip_count": 0.0,
+ "step": 4056,
+ "text_loss": 0.4089007079601288
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.051658350454947,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0007263745221074545,
+ "loss": 0.0086,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 6544293.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06202420964837074,
+ "skip_count": 2.0,
+ "step": 4058,
+ "text_loss": 0.2226305454969406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 19.06105077781039,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.0007260985020662784,
+ "loss": 0.0049,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 6547640.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.044639844447374344,
+ "skip_count": 3.0,
+ "step": 4060,
+ "text_loss": 0.23004353046417236
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 19.070443205165834,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.0007258223953837298,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6550840.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004215611144900322,
+ "skip_count": 0.0,
+ "step": 4062,
+ "text_loss": 0.2891770601272583
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0007255462021656132,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6554122.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011056234361603856,
+ "skip_count": 0.0,
+ "step": 4064,
+ "text_loss": 0.7485370635986328
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007252699225177666,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6557138.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008258933201432228,
+ "skip_count": 2.0,
+ "step": 4066,
+ "text_loss": 0.25219282507896423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.098620487232168,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0007249935565460606,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6560654.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005102175287902355,
+ "skip_count": 0.0,
+ "step": 4068,
+ "text_loss": 0.5553314089775085
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.108012914587615,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0007247171043563994,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6563814.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01283820066601038,
+ "skip_count": 2.0,
+ "step": 4070,
+ "text_loss": 0.15729956328868866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.11740534194306,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.0007244405660547199,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6567060.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009684927063062787,
+ "skip_count": 0.0,
+ "step": 4072,
+ "text_loss": 0.3725031912326813
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.126797769298502,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 0.000724163941746992,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6571608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007890827837400138,
+ "skip_count": 0.0,
+ "step": 4074,
+ "text_loss": 0.8438301682472229
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 19.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0007238872315392189,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 6575214.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0040600355714559555,
+ "skip_count": 1.0,
+ "step": 4076,
+ "text_loss": 0.5923112034797668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0007236104355374363,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6578383.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024899677373468876,
+ "skip_count": 2.0,
+ "step": 4078,
+ "text_loss": 0.20302526652812958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05517578125,
+ "learning_rate": 0.000723333553847713,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6582175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006120906211435795,
+ "skip_count": 2.0,
+ "step": 4080,
+ "text_loss": 0.5400223731994629
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06787109375,
+ "learning_rate": 0.0007230565865761504,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6585516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029941233806312084,
+ "skip_count": 0.0,
+ "step": 4082,
+ "text_loss": 0.19460804760456085
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.0007227795338288831,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6588266.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009357884526252747,
+ "skip_count": 2.0,
+ "step": 4084,
+ "text_loss": 0.35237613320350647
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0007225023957120782,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6591009.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023083325941115618,
+ "skip_count": 2.0,
+ "step": 4086,
+ "text_loss": 0.4336731433868408
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.192544760786618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.0007222251723319356,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6594472.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008416616474278271,
+ "skip_count": 0.0,
+ "step": 4088,
+ "text_loss": 0.6390535831451416
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 0.0007219478637946877,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6597477.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004390760324895382,
+ "skip_count": 1.0,
+ "step": 4090,
+ "text_loss": 0.525839626789093
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0007216704702065997,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6600431.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010311100631952286,
+ "skip_count": 0.0,
+ "step": 4092,
+ "text_loss": 0.5310423374176025
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.220722042852948,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0007213929916739695,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6603899.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032497600186616182,
+ "skip_count": 1.0,
+ "step": 4094,
+ "text_loss": 0.2775326073169708
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.000721115428303127,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 6606544.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004692315589636564,
+ "skip_count": 3.0,
+ "step": 4096,
+ "text_loss": 0.6667124032974243
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0007208377802004353,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6610097.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007263485458679497,
+ "skip_count": 0.0,
+ "step": 4098,
+ "text_loss": 0.6916406750679016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.248899324919282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0007205600474722897,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6613836.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017989488551393151,
+ "skip_count": 0.0,
+ "step": 4100,
+ "text_loss": 0.5257929563522339
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.000720282230225118,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6616780.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011308686807751656,
+ "skip_count": 1.0,
+ "step": 4102,
+ "text_loss": 0.4410906732082367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0007200043285653799,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6620110.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002058265497907996,
+ "skip_count": 2.0,
+ "step": 4104,
+ "text_loss": 0.8581191897392273
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 19.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0007197263425995681,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6622585.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017528717871755362,
+ "skip_count": 0.0,
+ "step": 4106,
+ "text_loss": 0.5000449419021606
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.286469034341064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.0007194482724342075,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6626356.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021995846182107925,
+ "skip_count": 0.0,
+ "step": 4108,
+ "text_loss": 0.401346892118454
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0007191701181758547,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6629738.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014869922306388617,
+ "skip_count": 0.0,
+ "step": 4110,
+ "text_loss": 0.9598422050476074
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.0007188918799310993,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6632807.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012853415682911873,
+ "skip_count": 0.0,
+ "step": 4112,
+ "text_loss": 0.3996548354625702
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.314646316407398,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0007186135578065627,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6636227.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009887361666187644,
+ "skip_count": 0.0,
+ "step": 4114,
+ "text_loss": 0.4127283990383148
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0007183351519088982,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6639443.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006282114889472723,
+ "skip_count": 1.0,
+ "step": 4116,
+ "text_loss": 0.20028606057167053
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.333431171118285,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.061767578125,
+ "learning_rate": 0.0007180566623447917,
+ "loss": 0.0114,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 6642127.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008101986721158028,
+ "skip_count": 0.0,
+ "step": 4118,
+ "text_loss": 0.763931155204773
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.0007177780892209607,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6645376.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001953610684722662,
+ "skip_count": 0.0,
+ "step": 4120,
+ "text_loss": 0.42317715287208557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0007174994326441551,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6648150.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003279355587437749,
+ "skip_count": 0.0,
+ "step": 4122,
+ "text_loss": 0.19656142592430115
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0007172206927211567,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6650935.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032076311763375998,
+ "skip_count": 0.0,
+ "step": 4124,
+ "text_loss": 0.13608409464359283
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0007169418695587791,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6654464.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004065621178597212,
+ "skip_count": 2.0,
+ "step": 4126,
+ "text_loss": 0.4882086217403412
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0007166629632638678,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6657749.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009243001695722342,
+ "skip_count": 0.0,
+ "step": 4128,
+ "text_loss": 0.31632331013679504
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0007163839739433003,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6660997.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018459554994478822,
+ "skip_count": 0.0,
+ "step": 4130,
+ "text_loss": 0.6123947501182556
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.399178162606397,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0007161049017039857,
+ "loss": 0.0073,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 6663542.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.030032536014914513,
+ "skip_count": 2.0,
+ "step": 4132,
+ "text_loss": 0.6985659003257751
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 0.0007158257466528652,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6666178.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013813833938911557,
+ "skip_count": 0.0,
+ "step": 4134,
+ "text_loss": 0.38380664587020874
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 19.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.0007155465088969114,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6668852.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00513424864038825,
+ "skip_count": 3.0,
+ "step": 4136,
+ "text_loss": 0.49724283814430237
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0007152671885431288,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6671430.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005165594047866762,
+ "skip_count": 0.0,
+ "step": 4138,
+ "text_loss": 0.666959822177887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 0.0007149877856985535,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6675215.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001685218419879675,
+ "skip_count": 0.0,
+ "step": 4140,
+ "text_loss": 0.3127259612083435
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.000714708300470253,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6678505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004025314934551716,
+ "skip_count": 0.0,
+ "step": 4142,
+ "text_loss": 0.3179470896720886
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 19.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0007144287329653269,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6681127.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005965690594166517,
+ "skip_count": 0.0,
+ "step": 4144,
+ "text_loss": 0.3862907886505127
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.464925154094512,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0007141490832909058,
+ "loss": 0.0071,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6683968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012896374799311161,
+ "skip_count": 1.0,
+ "step": 4146,
+ "text_loss": 0.48156118392944336
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0007138693515541519,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6687196.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006367767928168178,
+ "skip_count": 1.0,
+ "step": 4148,
+ "text_loss": 0.676702082157135
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 19.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.0007135895378622592,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6689972.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004532640799880028,
+ "skip_count": 3.0,
+ "step": 4150,
+ "text_loss": 0.5865558981895447
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.493102436160846,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0007133096423224526,
+ "loss": 0.0081,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6693568.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0377078577876091,
+ "skip_count": 0.0,
+ "step": 4152,
+ "text_loss": 0.2790502607822418
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056640625,
+ "learning_rate": 0.0007130296650419885,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6696468.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004455826710909605,
+ "skip_count": 1.0,
+ "step": 4154,
+ "text_loss": 0.5869500041007996
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0007127496061281551,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6699307.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001998464809730649,
+ "skip_count": 0.0,
+ "step": 4156,
+ "text_loss": 0.6931945085525513
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 19.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0007124694656882713,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6702647.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.004117495380342007,
+ "skip_count": 0.0,
+ "step": 4158,
+ "text_loss": 0.4325876832008362
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.0007121892438296874,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6705964.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014713290147483349,
+ "skip_count": 0.0,
+ "step": 4160,
+ "text_loss": 0.3672060966491699
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0007119089406597849,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6710182.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037311650812625885,
+ "skip_count": 1.0,
+ "step": 4162,
+ "text_loss": 0.6643805503845215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0007116285562859767,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6713410.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006017287727445364,
+ "skip_count": 0.0,
+ "step": 4164,
+ "text_loss": 0.4606415927410126
+ },
+ {
+ "acc_repeat": 0.3333333432674408,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 19.55884942764896,
+ "f1_execute": 0.9545454382896423,
+ "f1_repeat": 0.5,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0007113480908157065,
+ "loss": 0.0108,
+ "macro_f1": 0.8181818723678589,
+ "num_tokens": 6716056.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.08640352636575699,
+ "skip_count": 4.0,
+ "step": 4166,
+ "text_loss": 0.3139408528804779
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0007110675443564491,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6719497.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012731150491163135,
+ "skip_count": 0.0,
+ "step": 4168,
+ "text_loss": 0.7283861637115479
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.57763428235985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0007107869170157108,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6722297.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021509863436222076,
+ "skip_count": 2.0,
+ "step": 4170,
+ "text_loss": 0.5767703056335449
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.000710506208901028,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6725762.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00257494836114347,
+ "skip_count": 1.0,
+ "step": 4172,
+ "text_loss": 0.33571913838386536
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.000710225420119969,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 6728436.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00943201594054699,
+ "skip_count": 3.0,
+ "step": 4174,
+ "text_loss": 0.6849368810653687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0007099445507801323,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6731427.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01046718005090952,
+ "skip_count": 2.0,
+ "step": 4176,
+ "text_loss": 0.3346157670021057
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0007096636009891477,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6734800.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007813365664333105,
+ "skip_count": 0.0,
+ "step": 4178,
+ "text_loss": 0.49989959597587585
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.000709382570854676,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6738244.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002825600327923894,
+ "skip_count": 0.0,
+ "step": 4180,
+ "text_loss": 0.15744923055171967
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0007091014604844078,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6741695.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017124463338404894,
+ "skip_count": 0.0,
+ "step": 4182,
+ "text_loss": 0.3752405643463135
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.0007088202699860655,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 6744882.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005134924780577421,
+ "skip_count": 3.0,
+ "step": 4184,
+ "text_loss": 0.18534569442272186
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.000708538999467402,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6747811.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002371585462242365,
+ "skip_count": 1.0,
+ "step": 4186,
+ "text_loss": 0.6251029968261719
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.662166128558848,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0007082576490362004,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6750765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002088436856865883,
+ "skip_count": 0.0,
+ "step": 4188,
+ "text_loss": 0.35471436381340027
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.000707976218800275,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6754021.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012272283202037215,
+ "skip_count": 0.0,
+ "step": 4190,
+ "text_loss": 0.5737302899360657
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07763671875,
+ "learning_rate": 0.0007076947088674701,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6756793.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026050808373838663,
+ "skip_count": 0.0,
+ "step": 4192,
+ "text_loss": 0.526336669921875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.054931640625,
+ "learning_rate": 0.000707413119345661,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6760221.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013151296880096197,
+ "skip_count": 0.0,
+ "step": 4194,
+ "text_loss": 0.5678895711898804
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0007071314503427532,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6763721.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001528652966953814,
+ "skip_count": 0.0,
+ "step": 4196,
+ "text_loss": 0.7640175223350525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0007068497019666829,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6768581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019202446565032005,
+ "skip_count": 0.0,
+ "step": 4198,
+ "text_loss": 0.41878414154052734
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0007065678743254167,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6772758.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004667408298701048,
+ "skip_count": 1.0,
+ "step": 4200,
+ "text_loss": 0.3550313413143158
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 19.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.0007062859675269513,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6776671.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.00568761583417654,
+ "skip_count": 0.0,
+ "step": 4202,
+ "text_loss": 0.1707649976015091
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0007060039816793141,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6780284.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030401297844946384,
+ "skip_count": 0.0,
+ "step": 4204,
+ "text_loss": 0.2686377167701721
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 19.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0007057219168905625,
+ "loss": 0.0068,
+ "macro_f1": 1.0,
+ "num_tokens": 6783525.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003353122156113386,
+ "skip_count": 5.0,
+ "step": 4206,
+ "text_loss": 0.5235374569892883
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.000705439773268784,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6787691.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016532237641513348,
+ "skip_count": 1.0,
+ "step": 4208,
+ "text_loss": 0.5002681612968445
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0007051575509220972,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6790833.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011808308772742748,
+ "skip_count": 0.0,
+ "step": 4210,
+ "text_loss": 0.7251001596450806
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0007048752499586497,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6794260.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006246297620236874,
+ "skip_count": 2.0,
+ "step": 4212,
+ "text_loss": 0.2430499643087387
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.784267684179632,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.00070459287048662,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6797413.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012964420020580292,
+ "skip_count": 0.0,
+ "step": 4214,
+ "text_loss": 0.48889362812042236
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0007043104126142163,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6800815.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018109704833477736,
+ "skip_count": 0.0,
+ "step": 4216,
+ "text_loss": 0.5617026686668396
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 19.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.0007040278764496771,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 6803937.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0028699536342173815,
+ "skip_count": 1.0,
+ "step": 4218,
+ "text_loss": 0.548405647277832
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.812444966245963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0007037452621012708,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6806946.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007951617590151727,
+ "skip_count": 0.0,
+ "step": 4220,
+ "text_loss": 0.5702725648880005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0007034625696772958,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6810083.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003436052706092596,
+ "skip_count": 2.0,
+ "step": 4222,
+ "text_loss": 0.3898725211620331
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.00070317979928608,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6812845.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005070401239208877,
+ "skip_count": 0.0,
+ "step": 4224,
+ "text_loss": 0.5244157910346985
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.840622248312297,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.000702896951035982,
+ "loss": 0.0101,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6815801.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01560303382575512,
+ "skip_count": 1.0,
+ "step": 4226,
+ "text_loss": 0.26503118872642517
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0007026140250353896,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6819464.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009310240857303143,
+ "skip_count": 2.0,
+ "step": 4228,
+ "text_loss": 0.15597499907016754
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.0007023310213927208,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6822657.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005309136584401131,
+ "skip_count": 0.0,
+ "step": 4230,
+ "text_loss": 0.5271651148796082
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046875,
+ "learning_rate": 0.0007020479402164226,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6825661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005936166271567345,
+ "skip_count": 2.0,
+ "step": 4232,
+ "text_loss": 0.6105108857154846
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.878191957734078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0007017647816149727,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6828688.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001653556595556438,
+ "skip_count": 0.0,
+ "step": 4234,
+ "text_loss": 0.6966437101364136
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.000701481545696878,
+ "loss": 0.009,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6831850.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013501866487786174,
+ "skip_count": 0.0,
+ "step": 4236,
+ "text_loss": 1.259678840637207
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.059814453125,
+ "learning_rate": 0.0007011982325706747,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6834862.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008970130234956741,
+ "skip_count": 1.0,
+ "step": 4238,
+ "text_loss": 0.24906545877456665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.906369239800412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0007009148423449292,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6838148.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026013399474322796,
+ "skip_count": 0.0,
+ "step": 4240,
+ "text_loss": 0.291467547416687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.915761667155856,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0007006313751282371,
+ "loss": 0.0094,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6841142.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021415632218122482,
+ "skip_count": 1.0,
+ "step": 4242,
+ "text_loss": 0.507606029510498
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0007003478310292236,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6844042.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023636550176888704,
+ "skip_count": 0.0,
+ "step": 4244,
+ "text_loss": 0.11626995354890823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.934546521866746,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0007000642101565433,
+ "loss": 0.008,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6847359.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.025154776871204376,
+ "skip_count": 0.0,
+ "step": 4246,
+ "text_loss": 0.42898693680763245
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0006997805126188803,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6850443.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00540317315608263,
+ "skip_count": 0.0,
+ "step": 4248,
+ "text_loss": 0.18085283041000366
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.000699496738524948,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6853495.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014433214673772454,
+ "skip_count": 0.0,
+ "step": 4250,
+ "text_loss": 0.5524004697799683
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 19.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0006992128879834891,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 6856774.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013381492346525192,
+ "skip_count": 3.0,
+ "step": 4252,
+ "text_loss": 0.19605717062950134
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.0006989289611032758,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6860313.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007140172645449638,
+ "skip_count": 1.0,
+ "step": 4254,
+ "text_loss": 0.3182447552680969
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 19.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0006986449579931091,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6863683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006486213766038418,
+ "skip_count": 1.0,
+ "step": 4256,
+ "text_loss": 0.19250160455703735
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 19.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.0006983608787618201,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6867609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001465818495489657,
+ "skip_count": 0.0,
+ "step": 4258,
+ "text_loss": 0.5912898182868958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.000698076723518268,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6870040.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031106441747397184,
+ "skip_count": 0.0,
+ "step": 4260,
+ "text_loss": 0.13542121648788452
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0006977924923713418,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6873441.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005377951893024147,
+ "skip_count": 0.0,
+ "step": 4262,
+ "text_loss": 0.352464497089386
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0006975081854299594,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6876637.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007052485831081867,
+ "skip_count": 0.0,
+ "step": 4264,
+ "text_loss": 0.5023844242095947
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0006972238028030678,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6879928.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013608322478830814,
+ "skip_count": 0.0,
+ "step": 4266,
+ "text_loss": 0.8664718270301819
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.037569709421778,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0006969393445996429,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6883425.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007607188890688121,
+ "skip_count": 0.0,
+ "step": 4268,
+ "text_loss": 0.5131992101669312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0006966548109286897,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6886790.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00035804163780994713,
+ "skip_count": 0.0,
+ "step": 4270,
+ "text_loss": 0.5352054834365845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.000696370201899242,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6889747.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004451376851648092,
+ "skip_count": 1.0,
+ "step": 4272,
+ "text_loss": 0.47865036129951477
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0006960855176203623,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6892604.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015342880506068468,
+ "skip_count": 0.0,
+ "step": 4274,
+ "text_loss": 0.36278650164604187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.07513941884356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0006958007582011425,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6895563.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022974940948188305,
+ "skip_count": 2.0,
+ "step": 4276,
+ "text_loss": 0.6695618629455566
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0006955159237507027,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6898591.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00859096460044384,
+ "skip_count": 1.0,
+ "step": 4278,
+ "text_loss": 0.44284722208976746
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0006952310143781921,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 6903119.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007919861935079098,
+ "skip_count": 3.0,
+ "step": 4280,
+ "text_loss": 0.5006136298179626
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0006949460301927886,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6906394.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008476210059598088,
+ "skip_count": 0.0,
+ "step": 4282,
+ "text_loss": 0.8153555989265442
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 0.0006946609713036985,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6909136.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006711610127240419,
+ "skip_count": 2.0,
+ "step": 4284,
+ "text_loss": 0.43136683106422424
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0185546875,
+ "learning_rate": 0.0006943758378201571,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6912734.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038677838165313005,
+ "skip_count": 0.0,
+ "step": 4286,
+ "text_loss": 0.2693749964237213
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0006940906298514278,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6915838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012188015971332788,
+ "skip_count": 0.0,
+ "step": 4288,
+ "text_loss": 0.5809219479560852
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0006938053475068031,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6919225.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001955829095095396,
+ "skip_count": 0.0,
+ "step": 4290,
+ "text_loss": 0.5116089582443237
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 20.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11279296875,
+ "learning_rate": 0.0006935199908956037,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6922495.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0035709093790501356,
+ "skip_count": 0.0,
+ "step": 4292,
+ "text_loss": 0.2745901644229889
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.0006932345601271786,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6925317.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005745319649577141,
+ "skip_count": 0.0,
+ "step": 4294,
+ "text_loss": 0.6039219498634338
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 20.169063692398005,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0693359375,
+ "learning_rate": 0.0006929490553109056,
+ "loss": 0.0107,
+ "macro_f1": 0.9247862696647644,
+ "num_tokens": 6928054.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.061689916998147964,
+ "skip_count": 6.0,
+ "step": 4296,
+ "text_loss": 0.3904837667942047
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0006926634765561907,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6931348.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002007248578593135,
+ "skip_count": 0.0,
+ "step": 4298,
+ "text_loss": 0.5170742273330688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.187848547108892,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.000692377823972468,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6934411.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005786226247437298,
+ "skip_count": 0.0,
+ "step": 4300,
+ "text_loss": 0.8032443523406982
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.19724097446434,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0006920920976692004,
+ "loss": 0.0071,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 6938153.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.024602646008133888,
+ "skip_count": 0.0,
+ "step": 4302,
+ "text_loss": 0.446534663438797
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0006918062977558784,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6940731.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005759815219789743,
+ "skip_count": 2.0,
+ "step": 4304,
+ "text_loss": 0.15479247272014618
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.216025829175226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0006915204243420214,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6943246.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005315347574651241,
+ "skip_count": 1.0,
+ "step": 4306,
+ "text_loss": 0.22127842903137207
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0006912344775371765,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6947197.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012061651796102524,
+ "skip_count": 0.0,
+ "step": 4308,
+ "text_loss": 0.7058854103088379
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0006909484574509191,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6951817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029203309677541256,
+ "skip_count": 0.0,
+ "step": 4310,
+ "text_loss": 0.6014000773429871
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.0006906623641928525,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6955094.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005703397560864687,
+ "skip_count": 2.0,
+ "step": 4312,
+ "text_loss": 0.5923848152160645
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08154296875,
+ "learning_rate": 0.0006903761978726084,
+ "loss": 0.0073,
+ "macro_f1": 1.0,
+ "num_tokens": 6958127.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004489895887672901,
+ "skip_count": 2.0,
+ "step": 4314,
+ "text_loss": 0.36911651492118835
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.000690089958599846,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6960871.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003871412482112646,
+ "skip_count": 2.0,
+ "step": 4316,
+ "text_loss": 0.442545086145401
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.000689803646484253,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 6963980.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008667866699397564,
+ "skip_count": 2.0,
+ "step": 4318,
+ "text_loss": 0.1987489014863968
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 20.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0006895172616355446,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6967132.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00843339879065752,
+ "skip_count": 0.0,
+ "step": 4320,
+ "text_loss": 0.48267918825149536
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0006892308041634639,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6969971.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004312851815484464,
+ "skip_count": 0.0,
+ "step": 4322,
+ "text_loss": 0.3662732243537903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 20.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0006889442741777822,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6973114.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004588035400956869,
+ "skip_count": 3.0,
+ "step": 4324,
+ "text_loss": 0.6707104444503784
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.309950102729672,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0006886576717882982,
+ "loss": 0.0057,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 6976013.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0687296912074089,
+ "skip_count": 3.0,
+ "step": 4326,
+ "text_loss": 0.1662217676639557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0006883709971048384,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6979200.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002950174268335104,
+ "skip_count": 0.0,
+ "step": 4328,
+ "text_loss": 0.21168152987957
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0006880842502372572,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6982640.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032158740796148777,
+ "skip_count": 0.0,
+ "step": 4330,
+ "text_loss": 0.26790961623191833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0006877974312954365,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6985917.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005083635332994163,
+ "skip_count": 0.0,
+ "step": 4332,
+ "text_loss": 0.9736502170562744
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.347519812151454,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.000687510540389286,
+ "loss": 0.0053,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 6988388.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03473830223083496,
+ "skip_count": 2.0,
+ "step": 4334,
+ "text_loss": 0.21662230789661407
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0006872235776287425,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 6991360.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002206524135544896,
+ "skip_count": 0.0,
+ "step": 4336,
+ "text_loss": 0.6026972532272339
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 20.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0006869365431237711,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 6995080.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.000969731598161161,
+ "skip_count": 0.0,
+ "step": 4338,
+ "text_loss": 0.5833017230033875
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.375697094217788,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0006866494369843635,
+ "loss": 0.0054,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 6998526.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.013962293043732643,
+ "skip_count": 2.0,
+ "step": 4340,
+ "text_loss": 0.41465985774993896
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 20.38508952157323,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.0006863622593205397,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7001494.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0064964210614562035,
+ "skip_count": 3.0,
+ "step": 4342,
+ "text_loss": 0.3774271011352539
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 20.394481948928675,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0006860750102423464,
+ "loss": 0.0062,
+ "macro_f1": 0.6589147448539734,
+ "num_tokens": 7005544.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023250726982951164,
+ "skip_count": 6.0,
+ "step": 4344,
+ "text_loss": 0.2732464373111725
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.403874376284122,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.0006857876898598582,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7008847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038170060142874718,
+ "skip_count": 2.0,
+ "step": 4346,
+ "text_loss": 0.29610875248908997
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0006855002982831769,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7012577.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012856025714427233,
+ "skip_count": 0.0,
+ "step": 4348,
+ "text_loss": 0.6098502278327942
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061767578125,
+ "learning_rate": 0.0006852128356224314,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7015650.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008162742480635643,
+ "skip_count": 1.0,
+ "step": 4350,
+ "text_loss": 0.20868146419525146
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.432051658350456,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.0006849253019877778,
+ "loss": 0.0074,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 7019925.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.023544032126665115,
+ "skip_count": 3.0,
+ "step": 4352,
+ "text_loss": 0.628226101398468
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0006846376974893996,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7023130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004982319660484791,
+ "skip_count": 2.0,
+ "step": 4354,
+ "text_loss": 0.7037544250488281
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 20.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0006843500222375074,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7026422.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004015266429632902,
+ "skip_count": 0.0,
+ "step": 4356,
+ "text_loss": 0.22352729737758636
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 27.0,
+ "epoch": 20.46022894041679,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0006840622763423391,
+ "loss": 0.0071,
+ "macro_f1": 0.9449735879898071,
+ "num_tokens": 7029077.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.021162014454603195,
+ "skip_count": 4.0,
+ "step": 4358,
+ "text_loss": 0.2431403249502182
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0006837744599141591,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7032582.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007044129306450486,
+ "skip_count": 0.0,
+ "step": 4360,
+ "text_loss": 0.26667487621307373
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0006834865730632594,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7035642.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0067853196524083614,
+ "skip_count": 1.0,
+ "step": 4362,
+ "text_loss": 0.20965275168418884
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.488406222483125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0006831986158999588,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7038601.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00899333506822586,
+ "skip_count": 2.0,
+ "step": 4364,
+ "text_loss": 0.26860126852989197
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.000682910588534603,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7042274.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019194348715245724,
+ "skip_count": 0.0,
+ "step": 4366,
+ "text_loss": 0.14046810567378998
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0006826224910775647,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 7045268.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006915684789419174,
+ "skip_count": 3.0,
+ "step": 4368,
+ "text_loss": 0.5900366306304932
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0006823343236392432,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7049407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001678116386756301,
+ "skip_count": 0.0,
+ "step": 4370,
+ "text_loss": 0.7868026494979858
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.000682046086330065,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7052783.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003459530707914382,
+ "skip_count": 0.0,
+ "step": 4372,
+ "text_loss": 0.6349637508392334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.0006817577792604831,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7055757.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011729507241398096,
+ "skip_count": 0.0,
+ "step": 4374,
+ "text_loss": 0.43258991837501526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0006814694025409773,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7058684.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006664610700681806,
+ "skip_count": 0.0,
+ "step": 4376,
+ "text_loss": 0.5307940244674683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.091796875,
+ "learning_rate": 0.0006811809562820542,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7061902.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004595907870680094,
+ "skip_count": 2.0,
+ "step": 4378,
+ "text_loss": 0.5830042362213135
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0006808924405942467,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7065100.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032026609405875206,
+ "skip_count": 0.0,
+ "step": 4380,
+ "text_loss": 0.20797798037528992
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 20.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 0.0006806038555881148,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7068556.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024626904632896185,
+ "skip_count": 0.0,
+ "step": 4382,
+ "text_loss": 0.5791074633598328
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0006803152013742448,
+ "loss": 0.0075,
+ "macro_f1": 1.0,
+ "num_tokens": 7071284.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010723610408604145,
+ "skip_count": 2.0,
+ "step": 4384,
+ "text_loss": 0.13227243721485138
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 20.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0006800264780632495,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7074428.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0011231007520109415,
+ "skip_count": 0.0,
+ "step": 4386,
+ "text_loss": 0.4360627233982086
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 20.601115350748458,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.0006797376857657681,
+ "loss": 0.0081,
+ "macro_f1": 1.0,
+ "num_tokens": 7078313.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.008419238030910492,
+ "skip_count": 1.0,
+ "step": 4388,
+ "text_loss": 0.5183924436569214
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.610507778103905,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.0006794488245924664,
+ "loss": 0.0084,
+ "macro_f1": 1.0,
+ "num_tokens": 7081258.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006582668516784906,
+ "skip_count": 3.0,
+ "step": 4390,
+ "text_loss": 0.2797473669052124
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.61990020545935,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 0.0006791598946540368,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7084527.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00557357631623745,
+ "skip_count": 2.0,
+ "step": 4392,
+ "text_loss": 0.39495575428009033
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.629292632814792,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06005859375,
+ "learning_rate": 0.0006788708960611975,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7087675.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007155992556363344,
+ "skip_count": 0.0,
+ "step": 4394,
+ "text_loss": 0.3785299062728882
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01806640625,
+ "learning_rate": 0.0006785818289246934,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7090171.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009265039698220789,
+ "skip_count": 0.0,
+ "step": 4396,
+ "text_loss": 0.42634522914886475
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 20.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.0006782926933552955,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 7092529.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008679097518324852,
+ "skip_count": 7.0,
+ "step": 4398,
+ "text_loss": 0.4283660054206848
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0006780034894638014,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7095141.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002363949315622449,
+ "skip_count": 0.0,
+ "step": 4400,
+ "text_loss": 0.481539249420166
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 20.666862342236573,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.000677714217361034,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7098208.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004005146212875843,
+ "skip_count": 3.0,
+ "step": 4402,
+ "text_loss": 0.6443291902542114
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0006774248771578435,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7101681.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026864963583648205,
+ "skip_count": 0.0,
+ "step": 4404,
+ "text_loss": 0.16315312683582306
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 27.0,
+ "epoch": 20.68564719694746,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0006771354689651054,
+ "loss": 0.005,
+ "macro_f1": 0.9449735879898071,
+ "num_tokens": 7104719.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.02719845622777939,
+ "skip_count": 4.0,
+ "step": 4406,
+ "text_loss": 0.37855592370033264
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.0006768459928937213,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7108697.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010488593950867653,
+ "skip_count": 0.0,
+ "step": 4408,
+ "text_loss": 0.23133711516857147
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 20.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0006765564490546193,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7111426.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013637891970574856,
+ "skip_count": 0.0,
+ "step": 4410,
+ "text_loss": 0.41399383544921875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0732421875,
+ "learning_rate": 0.0006762668375587528,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7114241.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000900395680218935,
+ "skip_count": 0.0,
+ "step": 4412,
+ "text_loss": 0.6460412740707397
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.0006759771585171016,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7117031.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024001260753721,
+ "skip_count": 0.0,
+ "step": 4414,
+ "text_loss": 0.7645824551582336
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0006756874120406714,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 7120766.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.005034091416746378,
+ "skip_count": 4.0,
+ "step": 4416,
+ "text_loss": 0.31753066182136536
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0006753975982404934,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7125243.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002483269665390253,
+ "skip_count": 0.0,
+ "step": 4418,
+ "text_loss": 0.5304268002510071
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.751394188435572,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0006751077172276249,
+ "loss": 0.0052,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7127795.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02676006779074669,
+ "skip_count": 1.0,
+ "step": 4420,
+ "text_loss": 0.22011354565620422
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 0.000674817769113149,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7130837.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003267093561589718,
+ "skip_count": 2.0,
+ "step": 4422,
+ "text_loss": 0.2906076908111572
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 20.770179043146463,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.000674527754008174,
+ "loss": 0.0045,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 7135090.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.022510390728712082,
+ "skip_count": 3.0,
+ "step": 4424,
+ "text_loss": 0.2544902563095093
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0006742376720238345,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7138751.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011178571730852127,
+ "skip_count": 0.0,
+ "step": 4426,
+ "text_loss": 0.6811438798904419
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 20.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0006739475232712904,
+ "loss": 0.0036,
+ "macro_f1": 1.0,
+ "num_tokens": 7141762.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005595206283032894,
+ "skip_count": 1.0,
+ "step": 4428,
+ "text_loss": 0.38743990659713745
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0006736573078617272,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7145235.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002793942578136921,
+ "skip_count": 2.0,
+ "step": 4430,
+ "text_loss": 0.21894219517707825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 20.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0006733670259063561,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7149042.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006146818865090609,
+ "skip_count": 3.0,
+ "step": 4432,
+ "text_loss": 0.17822015285491943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 20.817141179923688,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0006730766775164136,
+ "loss": 0.0061,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 7152166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026045087724924088,
+ "skip_count": 2.0,
+ "step": 4434,
+ "text_loss": 0.2910420000553131
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 20.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0006727862628031618,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7155506.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0022973387967795134,
+ "skip_count": 0.0,
+ "step": 4436,
+ "text_loss": 0.3502544164657593
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.0006724957818778882,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7158739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002357073128223419,
+ "skip_count": 1.0,
+ "step": 4438,
+ "text_loss": 0.26200664043426514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0006722052348519054,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7161776.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005521026905626059,
+ "skip_count": 0.0,
+ "step": 4440,
+ "text_loss": 0.3922915458679199
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 20.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.000671914621836552,
+ "loss": 0.0106,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7164763.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007691344246268272,
+ "skip_count": 2.0,
+ "step": 4442,
+ "text_loss": 0.6021351218223572
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.000671623942943191,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7167924.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032181134447455406,
+ "skip_count": 0.0,
+ "step": 4444,
+ "text_loss": 0.23639555275440216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.873495744056356,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.0006713331982832113,
+ "loss": 0.0071,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7170743.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.024979131296277046,
+ "skip_count": 0.0,
+ "step": 4446,
+ "text_loss": 0.4957772493362427
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0006710423879680271,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7174660.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002571308286860585,
+ "skip_count": 0.0,
+ "step": 4448,
+ "text_loss": 0.47968071699142456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.000670751512109077,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7177965.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00212799571454525,
+ "skip_count": 0.0,
+ "step": 4450,
+ "text_loss": 0.6550716161727905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0006704605708178252,
+ "loss": 0.0107,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7181512.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004176430404186249,
+ "skip_count": 1.0,
+ "step": 4452,
+ "text_loss": 0.36959558725357056
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0006701695642057613,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7184555.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010968588758260012,
+ "skip_count": 0.0,
+ "step": 4454,
+ "text_loss": 0.6686749458312988
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0006698784923843993,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7187474.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014241471653804183,
+ "skip_count": 0.0,
+ "step": 4456,
+ "text_loss": 0.6147221922874451
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0006695873554652784,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7190649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008801907300949097,
+ "skip_count": 0.0,
+ "step": 4458,
+ "text_loss": 0.26381927728652954
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0006692961535599634,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7193961.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009027508087456226,
+ "skip_count": 1.0,
+ "step": 4460,
+ "text_loss": 0.1926470547914505
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0006690048867800427,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7197456.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022697453387081623,
+ "skip_count": 0.0,
+ "step": 4462,
+ "text_loss": 0.6736721992492676
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0006687135552371305,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7200290.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006747903767973185,
+ "skip_count": 1.0,
+ "step": 4464,
+ "text_loss": 0.2026437371969223
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.967420017610802,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0006684221590428657,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7203320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011565096210688353,
+ "skip_count": 0.0,
+ "step": 4466,
+ "text_loss": 0.7587730288505554
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 20.976812444966246,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0006681306983089121,
+ "loss": 0.0083,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 7206411.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.023645581677556038,
+ "skip_count": 2.0,
+ "step": 4468,
+ "text_loss": 0.8981561660766602
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 20.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0006678391731469575,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7209421.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035848666448146105,
+ "skip_count": 0.0,
+ "step": 4470,
+ "text_loss": 0.1522839516401291
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 20.995597299677137,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0006675475836687152,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 7212267.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005046425387263298,
+ "skip_count": 1.0,
+ "step": 4472,
+ "text_loss": 0.46007999777793884
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0006672559299859228,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7215195.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019333874806761742,
+ "skip_count": 0.0,
+ "step": 4474,
+ "text_loss": 1.0859547853469849
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.014088641033165,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0006669642122103423,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7217941.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005401032394729555,
+ "skip_count": 0.0,
+ "step": 4476,
+ "text_loss": 0.9754356145858765
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.023481068388612,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.0006666724304537611,
+ "loss": 0.0053,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7222494.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015569722279906273,
+ "skip_count": 0.0,
+ "step": 4478,
+ "text_loss": 0.2896423637866974
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0006663805848279898,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7225292.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020135147497057915,
+ "skip_count": 0.0,
+ "step": 4480,
+ "text_loss": 0.8492724299430847
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 21.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.0006660886754448648,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7229184.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002355351345613599,
+ "skip_count": 0.0,
+ "step": 4482,
+ "text_loss": 0.189764603972435
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.051658350454947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.0006657967024162459,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7232906.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003044391982257366,
+ "skip_count": 0.0,
+ "step": 4484,
+ "text_loss": 0.4239847660064697
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0006655046658540179,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7235996.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00602696230635047,
+ "skip_count": 2.0,
+ "step": 4486,
+ "text_loss": 0.217103973031044
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.070443205165834,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0169677734375,
+ "learning_rate": 0.0006652125658700896,
+ "loss": 0.0031,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7238882.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001470155781134963,
+ "skip_count": 1.0,
+ "step": 4488,
+ "text_loss": 0.6090770363807678
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0006649204025763945,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 7241815.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008737480267882347,
+ "skip_count": 2.0,
+ "step": 4490,
+ "text_loss": 0.48314425349235535
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0177001953125,
+ "learning_rate": 0.0006646281760848902,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7244848.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008257135050371289,
+ "skip_count": 0.0,
+ "step": 4492,
+ "text_loss": 0.5884748101234436
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.098620487232168,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0006643358865075581,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7247930.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016262239078059793,
+ "skip_count": 0.0,
+ "step": 4494,
+ "text_loss": 0.21444730460643768
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.108012914587615,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0006640435339564042,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7251776.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001315156347118318,
+ "skip_count": 0.0,
+ "step": 4496,
+ "text_loss": 0.6890370845794678
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.11740534194306,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0006637511185434588,
+ "loss": 0.0091,
+ "macro_f1": 1.0,
+ "num_tokens": 7255070.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007614497095346451,
+ "skip_count": 3.0,
+ "step": 4498,
+ "text_loss": 0.516417920589447
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 21.126797769298502,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0006634586403807758,
+ "loss": 0.0041,
+ "macro_f1": 1.0,
+ "num_tokens": 7258115.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.004906686954200268,
+ "skip_count": 2.0,
+ "step": 4500,
+ "text_loss": 0.577463686466217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.13619019665395,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0927734375,
+ "learning_rate": 0.0006631660995804334,
+ "loss": 0.0067,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 7260769.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013337121345102787,
+ "skip_count": 2.0,
+ "step": 4502,
+ "text_loss": 0.37124839425086975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0006628734962545339,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7263908.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023418180644512177,
+ "skip_count": 0.0,
+ "step": 4504,
+ "text_loss": 0.17937727272510529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0006625808305152033,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7267391.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006556165171787143,
+ "skip_count": 0.0,
+ "step": 4506,
+ "text_loss": 0.45344987511634827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0006622881024745919,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7271402.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021988123189657927,
+ "skip_count": 0.0,
+ "step": 4508,
+ "text_loss": 0.5842905640602112
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0006619953122448734,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7274354.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00774174090474844,
+ "skip_count": 2.0,
+ "step": 4510,
+ "text_loss": 0.27159228920936584
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0006617024599382456,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7277378.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006942499312572181,
+ "skip_count": 0.0,
+ "step": 4512,
+ "text_loss": 0.4464176297187805
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.192544760786618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0006614095456669302,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7280526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003003394464030862,
+ "skip_count": 0.0,
+ "step": 4514,
+ "text_loss": 0.31188079714775085
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0006611165695431725,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7283916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006948060472495854,
+ "skip_count": 0.0,
+ "step": 4516,
+ "text_loss": 0.5266574025154114
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0006608235316792413,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7286843.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014080886030569673,
+ "skip_count": 0.0,
+ "step": 4518,
+ "text_loss": 0.5880120396614075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.220722042852948,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0006605304321874295,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7289940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016894340515136719,
+ "skip_count": 0.0,
+ "step": 4520,
+ "text_loss": 0.6623797416687012
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0006602372711800531,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7292869.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003522444050759077,
+ "skip_count": 0.0,
+ "step": 4522,
+ "text_loss": 0.5488807559013367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0006599440487694521,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7296618.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011981099378317595,
+ "skip_count": 0.0,
+ "step": 4524,
+ "text_loss": 0.4128517210483551
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 21.248899324919282,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00065965076506799,
+ "loss": 0.0047,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 7300481.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.010548194870352745,
+ "skip_count": 2.0,
+ "step": 4526,
+ "text_loss": 0.26450902223587036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0006593574201880536,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7303272.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005642973352223635,
+ "skip_count": 1.0,
+ "step": 4528,
+ "text_loss": 0.35269856452941895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.000659064014242053,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7306615.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004171932581812143,
+ "skip_count": 1.0,
+ "step": 4530,
+ "text_loss": 0.18814080953598022
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0006587705473424223,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7310368.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002289367141202092,
+ "skip_count": 2.0,
+ "step": 4532,
+ "text_loss": 0.7363705635070801
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.286469034341064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.000658477019601618,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7313788.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004440625663846731,
+ "skip_count": 1.0,
+ "step": 4534,
+ "text_loss": 0.8126176595687866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0006581834311321211,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7317864.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013160990783944726,
+ "skip_count": 2.0,
+ "step": 4536,
+ "text_loss": 0.7015916109085083
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 0.000657889782046435,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7320693.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032275544945150614,
+ "skip_count": 2.0,
+ "step": 4538,
+ "text_loss": 0.6481677293777466
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.314646316407398,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.0006575960724570865,
+ "loss": 0.0054,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7324335.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009769129566848278,
+ "skip_count": 1.0,
+ "step": 4540,
+ "text_loss": 0.22194676101207733
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0006573023024766258,
+ "loss": 0.0061,
+ "macro_f1": 1.0,
+ "num_tokens": 7327431.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0036973082460463047,
+ "skip_count": 4.0,
+ "step": 4542,
+ "text_loss": 0.475127637386322
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.000657008472217626,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7330262.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007046440150588751,
+ "skip_count": 0.0,
+ "step": 4544,
+ "text_loss": 0.2649917006492615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0006567145817926836,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7333110.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026714997366070747,
+ "skip_count": 0.0,
+ "step": 4546,
+ "text_loss": 0.5490524768829346
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0006564206313144175,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7336101.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006552211008965969,
+ "skip_count": 0.0,
+ "step": 4548,
+ "text_loss": 0.14098678529262543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0006561266208954707,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7339435.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035560601390898228,
+ "skip_count": 2.0,
+ "step": 4550,
+ "text_loss": 0.20412275195121765
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0006558325506485081,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7342609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020106974989175797,
+ "skip_count": 1.0,
+ "step": 4552,
+ "text_loss": 0.6184256076812744
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 0.0006555384206862183,
+ "loss": 0.009,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7345614.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014235252747312188,
+ "skip_count": 0.0,
+ "step": 4554,
+ "text_loss": 1.0108838081359863
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.389785735250953,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0006552442311213121,
+ "loss": 0.0041,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7348957.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01703745685517788,
+ "skip_count": 0.0,
+ "step": 4556,
+ "text_loss": 0.21315747499465942
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 21.399178162606397,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0006549499820665237,
+ "loss": 0.0077,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 7352724.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013315381482243538,
+ "skip_count": 3.0,
+ "step": 4558,
+ "text_loss": 0.34369465708732605
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.00065465567363461,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7356592.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017354936571791768,
+ "skip_count": 0.0,
+ "step": 4560,
+ "text_loss": 0.6267461180686951
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0006543613059383503,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7359774.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011646085418760777,
+ "skip_count": 2.0,
+ "step": 4562,
+ "text_loss": 0.4400193989276886
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0006540668790905471,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7362765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019345436012372375,
+ "skip_count": 0.0,
+ "step": 4564,
+ "text_loss": 0.49204275012016296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0006537723932040251,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7366337.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00562885170802474,
+ "skip_count": 1.0,
+ "step": 4566,
+ "text_loss": 0.22566382586956024
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 21.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0006534778483916319,
+ "loss": 0.0084,
+ "macro_f1": 1.0,
+ "num_tokens": 7369851.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005508176051080227,
+ "skip_count": 2.0,
+ "step": 4568,
+ "text_loss": 0.8057850003242493
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0006531832447662377,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7373918.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006460923235863447,
+ "skip_count": 2.0,
+ "step": 4570,
+ "text_loss": 0.5141497254371643
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.464925154094512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0006528885824407351,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7376674.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032120654359459877,
+ "skip_count": 0.0,
+ "step": 4572,
+ "text_loss": 0.1281338930130005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 0.0006525938615280394,
+ "loss": 0.0116,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7379791.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00443810923025012,
+ "skip_count": 0.0,
+ "step": 4574,
+ "text_loss": 0.268352210521698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.000652299082141088,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7382886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008284369483590126,
+ "skip_count": 2.0,
+ "step": 4576,
+ "text_loss": 0.30193832516670227
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 21.493102436160846,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0006520042443928411,
+ "loss": 0.0068,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 7386036.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03383317217230797,
+ "skip_count": 1.0,
+ "step": 4578,
+ "text_loss": 0.23106542229652405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.000651709348396281,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7388908.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017075951909646392,
+ "skip_count": 1.0,
+ "step": 4580,
+ "text_loss": 0.386099249124527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0006514143942644124,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7392004.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009516917169094086,
+ "skip_count": 1.0,
+ "step": 4582,
+ "text_loss": 0.3162059485912323
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0006511193821102623,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7395538.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031392278615385294,
+ "skip_count": 0.0,
+ "step": 4584,
+ "text_loss": 0.5536221861839294
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0006508243120468799,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7398461.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014138511614874005,
+ "skip_count": 0.0,
+ "step": 4586,
+ "text_loss": 0.7934318780899048
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0006505291841873367,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7401611.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005265916115604341,
+ "skip_count": 0.0,
+ "step": 4588,
+ "text_loss": 0.4569905698299408
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.000650233998644726,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7404641.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024988956283777952,
+ "skip_count": 0.0,
+ "step": 4590,
+ "text_loss": 0.49998772144317627
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0006499387555321636,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7407574.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004110113717615604,
+ "skip_count": 1.0,
+ "step": 4592,
+ "text_loss": 0.5679413676261902
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0006496434549627874,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7410806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032845588866621256,
+ "skip_count": 0.0,
+ "step": 4594,
+ "text_loss": 0.35515281558036804
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.57763428235985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0006493480970497568,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7413402.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010577172972261906,
+ "skip_count": 1.0,
+ "step": 4596,
+ "text_loss": 0.26111698150634766
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0006490526819062537,
+ "loss": 0.0091,
+ "macro_f1": 1.0,
+ "num_tokens": 7417236.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002054794691503048,
+ "skip_count": 2.0,
+ "step": 4598,
+ "text_loss": 0.6480993628501892
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07958984375,
+ "learning_rate": 0.0006487572096454818,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7420278.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017989084590226412,
+ "skip_count": 0.0,
+ "step": 4600,
+ "text_loss": 0.4935401678085327
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0006484616803806665,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7423866.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006671485956758261,
+ "skip_count": 1.0,
+ "step": 4602,
+ "text_loss": 0.15030258893966675
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 21.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0006481660942250552,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7426884.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008334980346262455,
+ "skip_count": 3.0,
+ "step": 4604,
+ "text_loss": 0.29933279752731323
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 21.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0006478704512919173,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7431017.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011923984624445438,
+ "skip_count": 3.0,
+ "step": 4606,
+ "text_loss": 0.35141825675964355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 21.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.0006475747516945432,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7434406.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031092462595552206,
+ "skip_count": 3.0,
+ "step": 4608,
+ "text_loss": 0.21021464467048645
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 21.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.000647278995546246,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7437204.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0006713552866131067,
+ "skip_count": 0.0,
+ "step": 4610,
+ "text_loss": 0.4052635431289673
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0006469831829603598,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7439741.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022583482787013054,
+ "skip_count": 2.0,
+ "step": 4612,
+ "text_loss": 0.5443860292434692
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.662166128558848,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0006466873140502407,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7443619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004187075886875391,
+ "skip_count": 2.0,
+ "step": 4614,
+ "text_loss": 0.30709847807884216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.0006463913889292661,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7446696.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008314833045005798,
+ "skip_count": 0.0,
+ "step": 4616,
+ "text_loss": 0.22949637472629547
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0006460954077108353,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7450377.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001277514616958797,
+ "skip_count": 0.0,
+ "step": 4618,
+ "text_loss": 0.37715134024620056
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0006457993705083684,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7453271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022756033577024937,
+ "skip_count": 2.0,
+ "step": 4620,
+ "text_loss": 0.7373883128166199
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.0006455032774353078,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7456492.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0039057908579707146,
+ "skip_count": 2.0,
+ "step": 4622,
+ "text_loss": 0.5058769583702087
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 0.0006452071286051169,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7459619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019458672031760216,
+ "skip_count": 0.0,
+ "step": 4624,
+ "text_loss": 0.5110082030296326
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0006449109241312802,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7462552.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002716891176532954,
+ "skip_count": 1.0,
+ "step": 4626,
+ "text_loss": 0.6197522878646851
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0006446146641273042,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7466769.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037578947376459837,
+ "skip_count": 2.0,
+ "step": 4628,
+ "text_loss": 0.1653924286365509
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.000644318348706716,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7470216.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012791058979928493,
+ "skip_count": 0.0,
+ "step": 4630,
+ "text_loss": 0.7114694118499756
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0006440219779830643,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7472975.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00736592011526227,
+ "skip_count": 2.0,
+ "step": 4632,
+ "text_loss": 0.26601463556289673
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.000643725552069919,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7475672.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00045455715735442936,
+ "skip_count": 0.0,
+ "step": 4634,
+ "text_loss": 0.5028402805328369
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.0006434290710808711,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7478850.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004247233271598816,
+ "skip_count": 2.0,
+ "step": 4636,
+ "text_loss": 0.12746070325374603
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 21.774875256824185,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0006431325351295324,
+ "loss": 0.0083,
+ "macro_f1": 0.5427350401878357,
+ "num_tokens": 7481747.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.047564394772052765,
+ "skip_count": 2.0,
+ "step": 4638,
+ "text_loss": 0.24056802690029144
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.784267684179632,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0006428359443295362,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7484885.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011175100225955248,
+ "skip_count": 0.0,
+ "step": 4640,
+ "text_loss": 0.6265338063240051
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 21.793660111535075,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0006425392987945369,
+ "loss": 0.0086,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 7487973.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016879938542842865,
+ "skip_count": 2.0,
+ "step": 4642,
+ "text_loss": 0.2523447275161743
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 21.80305253889052,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0006422425986382093,
+ "loss": 0.0055,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 7491024.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018616504967212677,
+ "skip_count": 3.0,
+ "step": 4644,
+ "text_loss": 0.38890624046325684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.812444966245963,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0006419458439742496,
+ "loss": 0.0056,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7494199.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023129139095544815,
+ "skip_count": 1.0,
+ "step": 4646,
+ "text_loss": 0.4060848355293274
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0006416490349163747,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7497287.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018601802876219153,
+ "skip_count": 0.0,
+ "step": 4648,
+ "text_loss": 0.3387545943260193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0006413521715783225,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7500598.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017482215771451592,
+ "skip_count": 0.0,
+ "step": 4650,
+ "text_loss": 0.4290996193885803
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.840622248312297,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0006410552540738514,
+ "loss": 0.007,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7503252.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0420118011534214,
+ "skip_count": 0.0,
+ "step": 4652,
+ "text_loss": 0.439496248960495
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 21.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.000640758282516741,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 7506382.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017782216891646385,
+ "skip_count": 1.0,
+ "step": 4654,
+ "text_loss": 0.8513308167457581
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 21.859407103023187,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.0006404612570207911,
+ "loss": 0.0102,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7510423.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010385853238403797,
+ "skip_count": 0.0,
+ "step": 4656,
+ "text_loss": 0.7159742712974548
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0006401641776998223,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7513394.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011917101219296455,
+ "skip_count": 0.0,
+ "step": 4658,
+ "text_loss": 0.6165401339530945
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.878191957734078,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0006398670446676766,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 7516828.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.008860073052346706,
+ "skip_count": 4.0,
+ "step": 4660,
+ "text_loss": 0.923275887966156
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0006395698580382153,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7519764.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000505418807733804,
+ "skip_count": 0.0,
+ "step": 4662,
+ "text_loss": 0.6143050789833069
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.0006392726179253212,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7522390.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004020806401968002,
+ "skip_count": 1.0,
+ "step": 4664,
+ "text_loss": 0.6935067176818848
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.906369239800412,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.0006389753244428972,
+ "loss": 0.0079,
+ "macro_f1": 1.0,
+ "num_tokens": 7525821.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00957963801920414,
+ "skip_count": 2.0,
+ "step": 4666,
+ "text_loss": 0.3350338637828827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.915761667155856,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0006386779777048666,
+ "loss": 0.0063,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 7529513.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.020673364400863647,
+ "skip_count": 2.0,
+ "step": 4668,
+ "text_loss": 0.47800472378730774
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.0006383805778251735,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7533450.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007217096630483866,
+ "skip_count": 1.0,
+ "step": 4670,
+ "text_loss": 0.4506106972694397
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 21.934546521866746,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.0006380831249177817,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7536287.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007001714315265417,
+ "skip_count": 0.0,
+ "step": 4672,
+ "text_loss": 0.4081715941429138
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0006377856190966762,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7539442.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015112817054614425,
+ "skip_count": 0.0,
+ "step": 4674,
+ "text_loss": 0.21451139450073242
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 21.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0006374880604758615,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7542594.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007311929017305374,
+ "skip_count": 2.0,
+ "step": 4676,
+ "text_loss": 0.14785248041152954
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 21.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0006371904491693626,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7545780.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007489737123250961,
+ "skip_count": 1.0,
+ "step": 4678,
+ "text_loss": 0.2248108983039856
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 21.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0006368927852912247,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 7548287.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009772555902600288,
+ "skip_count": 1.0,
+ "step": 4680,
+ "text_loss": 0.1566995233297348
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 21.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0006365950689555133,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7551424.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002134992741048336,
+ "skip_count": 0.0,
+ "step": 4682,
+ "text_loss": 0.7322417497634888
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 21.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0006362973002763139,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 7554182.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008511497639119625,
+ "skip_count": 4.0,
+ "step": 4684,
+ "text_loss": 0.24387991428375244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.0006359994793677319,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7557044.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004151526838541031,
+ "skip_count": 2.0,
+ "step": 4686,
+ "text_loss": 0.6139411330223083
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0006357016063438928,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7560231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009724601986818016,
+ "skip_count": 0.0,
+ "step": 4688,
+ "text_loss": 0.7875718474388123
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.0006354036813189421,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7562953.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008926765876822174,
+ "skip_count": 0.0,
+ "step": 4690,
+ "text_loss": 0.5195512771606445
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0006351057044070455,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7566137.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031294538639485836,
+ "skip_count": 0.0,
+ "step": 4692,
+ "text_loss": 0.7288873195648193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.037569709421778,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0006348076757223877,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7569073.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015065820189192891,
+ "skip_count": 2.0,
+ "step": 4694,
+ "text_loss": 0.7242236137390137
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0006345095953791746,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7573025.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005603441968560219,
+ "skip_count": 0.0,
+ "step": 4696,
+ "text_loss": 0.34443899989128113
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.0006342114634916307,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7576546.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011047758162021637,
+ "skip_count": 0.0,
+ "step": 4698,
+ "text_loss": 0.4892682731151581
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.0006339132801740008,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7580711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019803126342594624,
+ "skip_count": 2.0,
+ "step": 4700,
+ "text_loss": 0.4479489028453827
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 22.07513941884356,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 0.0006336150455405494,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7583385.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0005326359532773495,
+ "skip_count": 0.0,
+ "step": 4702,
+ "text_loss": 0.627504825592041
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.0006333167597055604,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7586584.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005587987834587693,
+ "skip_count": 0.0,
+ "step": 4704,
+ "text_loss": 0.43891432881355286
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0006330184227833376,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7590408.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007053783163428307,
+ "skip_count": 2.0,
+ "step": 4706,
+ "text_loss": 0.19946859776973724
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 22.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0006327200348882043,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7593857.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0009479080326855183,
+ "skip_count": 0.0,
+ "step": 4708,
+ "text_loss": 0.7973214387893677
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1259765625,
+ "learning_rate": 0.0006324215961345032,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7596429.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012403312139213085,
+ "skip_count": 0.0,
+ "step": 4710,
+ "text_loss": 0.48477989435195923
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0006321231066365966,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7599618.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005520360427908599,
+ "skip_count": 0.0,
+ "step": 4712,
+ "text_loss": 0.44222453236579895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0006318245665088665,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7603180.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015553623670712113,
+ "skip_count": 0.0,
+ "step": 4714,
+ "text_loss": 0.5132410526275635
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0006315259758657138,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7606457.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004210884217172861,
+ "skip_count": 1.0,
+ "step": 4716,
+ "text_loss": 0.39850690960884094
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 22.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0006312273348215589,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7609317.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001220117206685245,
+ "skip_count": 0.0,
+ "step": 4718,
+ "text_loss": 0.3509018123149872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0006309286434908419,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7613076.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007768960203975439,
+ "skip_count": 2.0,
+ "step": 4720,
+ "text_loss": 0.33361560106277466
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0006306299019880217,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7616242.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006226699333637953,
+ "skip_count": 0.0,
+ "step": 4722,
+ "text_loss": 0.23661087453365326
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 22.17845611975345,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0006303311104275766,
+ "loss": 0.0073,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 7619069.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015590761788189411,
+ "skip_count": 1.0,
+ "step": 4724,
+ "text_loss": 0.23373056948184967
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.187848547108892,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0006300322689240041,
+ "loss": 0.0076,
+ "macro_f1": 1.0,
+ "num_tokens": 7622581.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006862971931695938,
+ "skip_count": 2.0,
+ "step": 4726,
+ "text_loss": 0.8301828503608704
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 22.19724097446434,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0006297333775918209,
+ "loss": 0.0086,
+ "macro_f1": 1.0,
+ "num_tokens": 7625566.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006256614346057177,
+ "skip_count": 1.0,
+ "step": 4728,
+ "text_loss": 0.3756707012653351
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.0006294344365455626,
+ "loss": 0.0079,
+ "macro_f1": 1.0,
+ "num_tokens": 7629047.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009151885285973549,
+ "skip_count": 2.0,
+ "step": 4730,
+ "text_loss": 0.33362850546836853
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.216025829175226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0006291354458997841,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7631847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009307434665970504,
+ "skip_count": 0.0,
+ "step": 4732,
+ "text_loss": 0.4572524130344391
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0006288364057690591,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7635181.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00041220212006010115,
+ "skip_count": 0.0,
+ "step": 4734,
+ "text_loss": 0.40211325883865356
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0006285373162679804,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7637752.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006696670898236334,
+ "skip_count": 2.0,
+ "step": 4736,
+ "text_loss": 0.7588053345680237
+ },
+ {
+ "acc_repeat": 0.75,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 22.24420311124156,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.8571428656578064,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0006282381775111597,
+ "loss": 0.0081,
+ "macro_f1": 0.9449735879898071,
+ "num_tokens": 7640719.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.016283133998513222,
+ "skip_count": 2.0,
+ "step": 4738,
+ "text_loss": 0.5697863101959229
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 22.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0006279389896132274,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7643524.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00763951288536191,
+ "skip_count": 3.0,
+ "step": 4740,
+ "text_loss": 0.548592209815979
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 22.26298796595245,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0006276397526888329,
+ "loss": 0.0094,
+ "macro_f1": 0.925203263759613,
+ "num_tokens": 7646919.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.038590483367443085,
+ "skip_count": 5.0,
+ "step": 4742,
+ "text_loss": 0.27226054668426514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0006273404668526443,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7650404.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012555639259517193,
+ "skip_count": 0.0,
+ "step": 4744,
+ "text_loss": 0.47892290353775024
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 22.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0006270411322193488,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7652942.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015356402145698667,
+ "skip_count": 0.0,
+ "step": 4746,
+ "text_loss": 0.5515767931938171
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0006267417489036517,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7656269.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005182140972465277,
+ "skip_count": 0.0,
+ "step": 4748,
+ "text_loss": 0.3496028184890747
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.0006264423170202773,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7658664.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004144361708313227,
+ "skip_count": 0.0,
+ "step": 4750,
+ "text_loss": 0.2786032557487488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.309950102729672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0006261428366839685,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7661471.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00035335420398041606,
+ "skip_count": 0.0,
+ "step": 4752,
+ "text_loss": 0.4838487505912781
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0006258433080094868,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7664593.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0103341368958354,
+ "skip_count": 2.0,
+ "step": 4754,
+ "text_loss": 0.24325360357761383
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0006255437311116119,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7667573.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014633853919804096,
+ "skip_count": 2.0,
+ "step": 4756,
+ "text_loss": 0.21569855511188507
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.0006252441061051426,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7671171.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004900569561868906,
+ "skip_count": 0.0,
+ "step": 4758,
+ "text_loss": 0.12832018733024597
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0006249444331048955,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7673932.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020371589343994856,
+ "skip_count": 0.0,
+ "step": 4760,
+ "text_loss": 0.38652482628822327
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.000624644712225706,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7677396.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028059002943336964,
+ "skip_count": 2.0,
+ "step": 4762,
+ "text_loss": 0.7937633395195007
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.0006243449435824276,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7680392.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007225095760077238,
+ "skip_count": 0.0,
+ "step": 4764,
+ "text_loss": 0.5690395832061768
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.375697094217788,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0006240451272899321,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7684121.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002052050782367587,
+ "skip_count": 1.0,
+ "step": 4766,
+ "text_loss": 0.5321336984634399
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 22.38508952157323,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0006237452634631099,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7687236.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0039039517287164927,
+ "skip_count": 0.0,
+ "step": 4768,
+ "text_loss": 0.30823320150375366
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 22.394481948928675,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0006234453522168694,
+ "loss": 0.0084,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 7690355.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014570238068699837,
+ "skip_count": 2.0,
+ "step": 4770,
+ "text_loss": 0.21501587331295013
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 25.0,
+ "epoch": 22.403874376284122,
+ "f1_execute": 0.949999988079071,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.000623145393666137,
+ "loss": 0.0069,
+ "macro_f1": 0.886363685131073,
+ "num_tokens": 7693559.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.061707716435194016,
+ "skip_count": 6.0,
+ "step": 4772,
+ "text_loss": 0.24371100962162018
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0006228453879258576,
+ "loss": 0.0037,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7696422.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005053870379924774,
+ "skip_count": 2.0,
+ "step": 4774,
+ "text_loss": 0.237778440117836
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060302734375,
+ "learning_rate": 0.0006225453351109934,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7700460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017990898340940475,
+ "skip_count": 0.0,
+ "step": 4776,
+ "text_loss": 0.612456738948822
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.432051658350456,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.000622245235336526,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7703330.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004507021512836218,
+ "skip_count": 2.0,
+ "step": 4778,
+ "text_loss": 0.36898812651634216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0006219450887174537,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7707243.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006295828148722649,
+ "skip_count": 1.0,
+ "step": 4780,
+ "text_loss": 0.14474599063396454
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0006216448953687932,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7711121.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005049831233918667,
+ "skip_count": 0.0,
+ "step": 4782,
+ "text_loss": 0.4696790277957916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0006213446554055795,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7714889.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006010758224874735,
+ "skip_count": 0.0,
+ "step": 4784,
+ "text_loss": 0.46253830194473267
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 22.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0006210443689428649,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 7718420.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.006691234186291695,
+ "skip_count": 1.0,
+ "step": 4786,
+ "text_loss": 0.579987645149231
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.00062074403609572,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7721720.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001864895923063159,
+ "skip_count": 0.0,
+ "step": 4788,
+ "text_loss": 0.325242817401886
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.488406222483125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0006204436569792324,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7724916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00202955212444067,
+ "skip_count": 0.0,
+ "step": 4790,
+ "text_loss": 0.49637556076049805
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 22.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0006201432317085083,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7728081.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037843603640794754,
+ "skip_count": 0.0,
+ "step": 4792,
+ "text_loss": 0.38812628388404846
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 22.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.0006198427603986711,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7731457.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012036679312586784,
+ "skip_count": 3.0,
+ "step": 4794,
+ "text_loss": 0.2996312379837036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0006195422431648623,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7734595.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008874868508428335,
+ "skip_count": 1.0,
+ "step": 4796,
+ "text_loss": 0.3203189969062805
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 22.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.0006192416801222403,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 7737565.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0032894534524530172,
+ "skip_count": 1.0,
+ "step": 4798,
+ "text_loss": 0.3283322751522064
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.0006189410713859815,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7740439.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009667043574154377,
+ "skip_count": 2.0,
+ "step": 4800,
+ "text_loss": 0.25219282507896423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 22.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0006186404170712797,
+ "loss": 0.0093,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7743813.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012643060646951199,
+ "skip_count": 4.0,
+ "step": 4802,
+ "text_loss": 0.22567439079284668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0006183397172933462,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7747182.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002678517485037446,
+ "skip_count": 0.0,
+ "step": 4804,
+ "text_loss": 0.19188879430294037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0006180389721674101,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7750735.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013385121710598469,
+ "skip_count": 0.0,
+ "step": 4806,
+ "text_loss": 0.5860441327095032
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.000617738181808717,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7753843.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034869094379246235,
+ "skip_count": 1.0,
+ "step": 4808,
+ "text_loss": 0.4366260766983032
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 0.0006174373463325306,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7757039.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013648992171511054,
+ "skip_count": 0.0,
+ "step": 4810,
+ "text_loss": 0.5217258334159851
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0006171364658541314,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 7760016.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038017008919268847,
+ "skip_count": 2.0,
+ "step": 4812,
+ "text_loss": 0.8130963444709778
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.601115350748458,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0006168355404888177,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7762961.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006867518648505211,
+ "skip_count": 2.0,
+ "step": 4814,
+ "text_loss": 0.17822521924972534
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.610507778103905,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0006165345703519043,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7766399.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004653502255678177,
+ "skip_count": 0.0,
+ "step": 4816,
+ "text_loss": 0.5316070914268494
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 22.61990020545935,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0006162335555587238,
+ "loss": 0.008,
+ "macro_f1": 1.0,
+ "num_tokens": 7769039.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016906452365219593,
+ "skip_count": 1.0,
+ "step": 4818,
+ "text_loss": 0.5680997967720032
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.629292632814792,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 0.0006159324962246257,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7772768.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002541248919442296,
+ "skip_count": 0.0,
+ "step": 4820,
+ "text_loss": 0.6169226169586182
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0006156313924649762,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7775545.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008644679561257362,
+ "skip_count": 2.0,
+ "step": 4822,
+ "text_loss": 0.2211475968360901
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0006153302443951589,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7778837.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041346061043441296,
+ "skip_count": 2.0,
+ "step": 4824,
+ "text_loss": 0.5369775891304016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.0006150290521305746,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7782309.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012756052892655134,
+ "skip_count": 0.0,
+ "step": 4826,
+ "text_loss": 0.5294989943504333
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.666862342236573,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0006147278157866403,
+ "loss": 0.0046,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7785565.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.029718991369009018,
+ "skip_count": 1.0,
+ "step": 4828,
+ "text_loss": 0.6920449733734131
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0006144265354787906,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7788218.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004829924553632736,
+ "skip_count": 0.0,
+ "step": 4830,
+ "text_loss": 0.17072243988513947
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0006141252113224767,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7790788.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00254037044942379,
+ "skip_count": 0.0,
+ "step": 4832,
+ "text_loss": 0.20075996220111847
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01519775390625,
+ "learning_rate": 0.0006138238434331666,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7793913.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004426188243087381,
+ "skip_count": 0.0,
+ "step": 4834,
+ "text_loss": 0.695742130279541
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 0.000613522431926345,
+ "loss": 0.0036,
+ "macro_f1": 1.0,
+ "num_tokens": 7796932.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005176798906177282,
+ "skip_count": 3.0,
+ "step": 4836,
+ "text_loss": 0.4910822808742523
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0006132209769175132,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7800686.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004120545461773872,
+ "skip_count": 0.0,
+ "step": 4838,
+ "text_loss": 0.3701378405094147
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0006129194785221894,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7804765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0043835826218128204,
+ "skip_count": 0.0,
+ "step": 4840,
+ "text_loss": 0.343635618686676
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0006126179368559086,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7807498.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001394893741235137,
+ "skip_count": 1.0,
+ "step": 4842,
+ "text_loss": 0.47756674885749817
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.000612316352034222,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7810784.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031262130942195654,
+ "skip_count": 2.0,
+ "step": 4844,
+ "text_loss": 0.13077901303768158
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 22.751394188435572,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 0.0006120147241726972,
+ "loss": 0.0081,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 7814754.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.016139274463057518,
+ "skip_count": 1.0,
+ "step": 4846,
+ "text_loss": 0.18850074708461761
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0006117130533869189,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7818245.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009124451316893101,
+ "skip_count": 0.0,
+ "step": 4848,
+ "text_loss": 0.42503559589385986
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0006114113397924878,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7822214.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015132242115214467,
+ "skip_count": 0.0,
+ "step": 4850,
+ "text_loss": 0.16767354309558868
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 22.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0006111095835050212,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 7825019.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006253300234675407,
+ "skip_count": 2.0,
+ "step": 4852,
+ "text_loss": 0.44826745986938477
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0006108077846401524,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7828113.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024391328915953636,
+ "skip_count": 0.0,
+ "step": 4854,
+ "text_loss": 0.2009880244731903
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 22.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0006105059433135317,
+ "loss": 0.0078,
+ "macro_f1": 1.0,
+ "num_tokens": 7831177.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0020866121631115675,
+ "skip_count": 1.0,
+ "step": 4856,
+ "text_loss": 0.7082528471946716
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.0006102040596408251,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7834485.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004373365081846714,
+ "skip_count": 1.0,
+ "step": 4858,
+ "text_loss": 0.2541539669036865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.817141179923688,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0006099021337377148,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7837749.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004309024661779404,
+ "skip_count": 0.0,
+ "step": 4860,
+ "text_loss": 0.3163885176181793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 22.82653360727913,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0006096001657198995,
+ "loss": 0.0065,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 7840979.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023044804111123085,
+ "skip_count": 4.0,
+ "step": 4862,
+ "text_loss": 0.49609798192977905
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 22.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.0006092981557030941,
+ "loss": 0.0056,
+ "macro_f1": 1.0,
+ "num_tokens": 7844905.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010683654807507992,
+ "skip_count": 3.0,
+ "step": 4864,
+ "text_loss": 0.16866883635520935
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0006089961038030291,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7847800.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011224723421037197,
+ "skip_count": 0.0,
+ "step": 4866,
+ "text_loss": 0.5093055367469788
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0006086940101354515,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7850983.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003944621421396732,
+ "skip_count": 1.0,
+ "step": 4868,
+ "text_loss": 0.5753747224807739
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 22.86410331670091,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0006083918748161244,
+ "loss": 0.0069,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 7855041.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02532145567238331,
+ "skip_count": 2.0,
+ "step": 4870,
+ "text_loss": 0.8082366585731506
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0006080896979608262,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7858058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007558314246125519,
+ "skip_count": 0.0,
+ "step": 4872,
+ "text_loss": 0.6476574540138245
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.000607787479685352,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7861223.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009224560926668346,
+ "skip_count": 0.0,
+ "step": 4874,
+ "text_loss": 0.5012133717536926
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0006074852201055121,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7864180.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028308273758739233,
+ "skip_count": 0.0,
+ "step": 4876,
+ "text_loss": 0.7447214722633362
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052734375,
+ "learning_rate": 0.0006071829193371331,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7866726.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021505290642380714,
+ "skip_count": 0.0,
+ "step": 4878,
+ "text_loss": 0.5444929599761963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11376953125,
+ "learning_rate": 0.0006068805774960573,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7870166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021109723020344973,
+ "skip_count": 0.0,
+ "step": 4880,
+ "text_loss": 0.3577263355255127
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.0006065781946981425,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7873028.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027144821360707283,
+ "skip_count": 0.0,
+ "step": 4882,
+ "text_loss": 0.28464797139167786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 0.0006062757710592624,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7876747.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004638207610696554,
+ "skip_count": 0.0,
+ "step": 4884,
+ "text_loss": 0.381534606218338
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0006059733066953066,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 7879524.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002225410658866167,
+ "skip_count": 2.0,
+ "step": 4886,
+ "text_loss": 0.5167883634567261
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0006056708017221796,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7882809.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00419368501752615,
+ "skip_count": 1.0,
+ "step": 4888,
+ "text_loss": 0.22688335180282593
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.000605368256255802,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7886310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017340193735435605,
+ "skip_count": 1.0,
+ "step": 4890,
+ "text_loss": 1.0128135681152344
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 22.967420017610802,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0712890625,
+ "learning_rate": 0.0006050656704121098,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7889483.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016647159354761243,
+ "skip_count": 0.0,
+ "step": 4892,
+ "text_loss": 0.2213262915611267
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 22.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0006047630443070547,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7892615.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038971947506070137,
+ "skip_count": 3.0,
+ "step": 4894,
+ "text_loss": 0.45751357078552246
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 22.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0006044603780566032,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 7895747.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0036852145567536354,
+ "skip_count": 1.0,
+ "step": 4896,
+ "text_loss": 0.13489919900894165
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 22.995597299677137,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0006041576717767379,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7899155.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007661987561732531,
+ "skip_count": 1.0,
+ "step": 4898,
+ "text_loss": 0.281853586435318
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 23.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0006038549255834563,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 7901667.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01836695335805416,
+ "skip_count": 5.0,
+ "step": 4900,
+ "text_loss": 0.24879895150661469
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.014088641033165,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.000603552139592771,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7904506.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011829182039946318,
+ "skip_count": 0.0,
+ "step": 4902,
+ "text_loss": 0.7550268769264221
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 23.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0006032493139207106,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7907316.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0022891140542924404,
+ "skip_count": 0.0,
+ "step": 4904,
+ "text_loss": 0.37596020102500916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0006029464486833186,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7911283.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001990227960050106,
+ "skip_count": 0.0,
+ "step": 4906,
+ "text_loss": 0.5879577994346619
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.0006026435439966531,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7913907.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026039890944957733,
+ "skip_count": 1.0,
+ "step": 4908,
+ "text_loss": 0.41484713554382324
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.051658350454947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0006023405999767879,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7916772.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009183229878544807,
+ "skip_count": 1.0,
+ "step": 4910,
+ "text_loss": 0.20732562243938446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0006020376167398116,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7919346.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005508727394044399,
+ "skip_count": 1.0,
+ "step": 4912,
+ "text_loss": 0.41416165232658386
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 27.0,
+ "epoch": 23.070443205165834,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0006017345944018284,
+ "loss": 0.0051,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7922404.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008651934564113617,
+ "skip_count": 0.0,
+ "step": 4914,
+ "text_loss": 0.4290519952774048
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0006014315330789563,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7925165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003601635340601206,
+ "skip_count": 1.0,
+ "step": 4916,
+ "text_loss": 0.8447931408882141
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0006011284328873296,
+ "loss": 0.0041,
+ "macro_f1": 1.0,
+ "num_tokens": 7928146.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0049415635876357555,
+ "skip_count": 2.0,
+ "step": 4918,
+ "text_loss": 0.32237401604652405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.098620487232168,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.0006008252939430967,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7931163.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024150956887751818,
+ "skip_count": 0.0,
+ "step": 4920,
+ "text_loss": 0.2251713126897812
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.108012914587615,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0006005221163624209,
+ "loss": 0.0057,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7934084.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03181030973792076,
+ "skip_count": 0.0,
+ "step": 4922,
+ "text_loss": 0.4962928593158722
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.11740534194306,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.054931640625,
+ "learning_rate": 0.0006002189002614806,
+ "loss": 0.0089,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7937021.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00227518193423748,
+ "skip_count": 2.0,
+ "step": 4924,
+ "text_loss": 0.34440335631370544
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.126797769298502,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0005999156457564685,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7940205.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004331593867391348,
+ "skip_count": 1.0,
+ "step": 4926,
+ "text_loss": 0.14114083349704742
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0005996123529635925,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7945174.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000612895586527884,
+ "skip_count": 0.0,
+ "step": 4928,
+ "text_loss": 0.3895469009876251
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.145582624009393,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.000599309021999075,
+ "loss": 0.006,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 7948716.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02319233864545822,
+ "skip_count": 1.0,
+ "step": 4930,
+ "text_loss": 0.38103172183036804
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0005990056529791528,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7952497.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003423231653869152,
+ "skip_count": 0.0,
+ "step": 4932,
+ "text_loss": 0.30447322130203247
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017822265625,
+ "learning_rate": 0.0005987022460200778,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7955578.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007005351362749934,
+ "skip_count": 0.0,
+ "step": 4934,
+ "text_loss": 0.49621838331222534
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 23.173759906075727,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0005983988012381159,
+ "loss": 0.0061,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 7958741.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03962617367506027,
+ "skip_count": 1.0,
+ "step": 4936,
+ "text_loss": 0.1920493096113205
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 23.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 0.0005980953187495476,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7962236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026006060652434826,
+ "skip_count": 3.0,
+ "step": 4938,
+ "text_loss": 0.5286803841590881
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.192544760786618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0005977917986706681,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7965631.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005010952707380056,
+ "skip_count": 0.0,
+ "step": 4940,
+ "text_loss": 0.3507745563983917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.0005974882411177871,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7968516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023964287247508764,
+ "skip_count": 0.0,
+ "step": 4942,
+ "text_loss": 0.9110504388809204
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.000597184646207228,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 7971310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026230409275740385,
+ "skip_count": 1.0,
+ "step": 4944,
+ "text_loss": 0.4131232798099518
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.220722042852948,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0005968810140553292,
+ "loss": 0.0102,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7974809.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007397596491500735,
+ "skip_count": 0.0,
+ "step": 4946,
+ "text_loss": 0.5130466222763062
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0005965773447784431,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7977800.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009955473942682147,
+ "skip_count": 0.0,
+ "step": 4948,
+ "text_loss": 0.5366153717041016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01373291015625,
+ "learning_rate": 0.0005962736384929362,
+ "loss": 0.0026,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7981027.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0049227322451770306,
+ "skip_count": 0.0,
+ "step": 4950,
+ "text_loss": 0.17266370356082916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.248899324919282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 0.0005959698953151895,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7983580.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009975163266062737,
+ "skip_count": 0.0,
+ "step": 4952,
+ "text_loss": 0.2474549114704132
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.0005956661153615979,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7986711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006475782720372081,
+ "skip_count": 0.0,
+ "step": 4954,
+ "text_loss": 0.5748327970504761
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0005953622987485703,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7990194.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001449751085601747,
+ "skip_count": 0.0,
+ "step": 4956,
+ "text_loss": 0.5163559317588806
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0005950584455925301,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7993050.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017087773885577917,
+ "skip_count": 0.0,
+ "step": 4958,
+ "text_loss": 0.15892620384693146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.286469034341064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0005947545560099142,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 7996383.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0044417232275009155,
+ "skip_count": 0.0,
+ "step": 4960,
+ "text_loss": 0.48022928833961487
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 23.295861461696507,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0005944506301171734,
+ "loss": 0.0066,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 7999843.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010093312710523605,
+ "skip_count": 2.0,
+ "step": 4962,
+ "text_loss": 0.5050316452980042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0005941466680307732,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8003504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009699694812297821,
+ "skip_count": 0.0,
+ "step": 4964,
+ "text_loss": 0.30474427342414856
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 23.314646316407398,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0005938426698671922,
+ "loss": 0.0097,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8007427.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016759657301008701,
+ "skip_count": 0.0,
+ "step": 4966,
+ "text_loss": 0.25060293078422546
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.0005935386357429232,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 8010265.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006916914135217667,
+ "skip_count": 3.0,
+ "step": 4968,
+ "text_loss": 0.49084481596946716
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 23.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0005932345657744723,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 8013733.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017182426527142525,
+ "skip_count": 5.0,
+ "step": 4970,
+ "text_loss": 0.2705717980861664
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00059293046007836,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8017068.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008485594764351845,
+ "skip_count": 2.0,
+ "step": 4972,
+ "text_loss": 0.18570218980312347
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0005926263187711201,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8020185.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021750847809016705,
+ "skip_count": 2.0,
+ "step": 4974,
+ "text_loss": 0.4457069933414459
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0005923221419693001,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8023038.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020193420350551605,
+ "skip_count": 0.0,
+ "step": 4976,
+ "text_loss": 0.7394505143165588
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.054931640625,
+ "learning_rate": 0.0005920179297894613,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8026236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001450369250960648,
+ "skip_count": 1.0,
+ "step": 4978,
+ "text_loss": 0.5914503335952759
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.000591713682348178,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8028765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017808573320508003,
+ "skip_count": 0.0,
+ "step": 4980,
+ "text_loss": 0.19231407344341278
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0005914093997620388,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8032043.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018225493840873241,
+ "skip_count": 0.0,
+ "step": 4982,
+ "text_loss": 0.3567875325679779
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.399178162606397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0005911050821476449,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8035086.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016285666497424245,
+ "skip_count": 0.0,
+ "step": 4984,
+ "text_loss": 0.34609633684158325
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.0005908007296216119,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8038193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014699801104143262,
+ "skip_count": 0.0,
+ "step": 4986,
+ "text_loss": 0.4492359757423401
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.000590496342300568,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8041099.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002442725468426943,
+ "skip_count": 0.0,
+ "step": 4988,
+ "text_loss": 0.5162975788116455
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0005901919203011548,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8044350.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008624207228422165,
+ "skip_count": 2.0,
+ "step": 4990,
+ "text_loss": 0.2533033490180969
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.0005898874637400279,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8047467.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015421364223584533,
+ "skip_count": 0.0,
+ "step": 4992,
+ "text_loss": 0.4890289306640625
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.0005895829727338552,
+ "loss": 0.0065,
+ "macro_f1": 1.0,
+ "num_tokens": 8050626.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024516626726835966,
+ "skip_count": 2.0,
+ "step": 4994,
+ "text_loss": 0.50797039270401
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0005892784473993184,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8053386.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018553845584392548,
+ "skip_count": 2.0,
+ "step": 4996,
+ "text_loss": 0.628828763961792
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.464925154094512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.000588973887853112,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8055941.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004258487373590469,
+ "skip_count": 0.0,
+ "step": 4998,
+ "text_loss": 0.2643229067325592
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.474317581449956,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0005886692942119441,
+ "loss": 0.0062,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 8058638.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.019064312800765038,
+ "skip_count": 2.0,
+ "step": 5000,
+ "text_loss": 0.4925006031990051
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0005883646665925353,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8062097.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007969749276526272,
+ "skip_count": 0.0,
+ "step": 5002,
+ "text_loss": 0.49412909150123596
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0005880600051116196,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8065202.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005813780706375837,
+ "skip_count": 2.0,
+ "step": 5004,
+ "text_loss": 0.5681346654891968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0005877553098859439,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8068574.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005012941546738148,
+ "skip_count": 0.0,
+ "step": 5006,
+ "text_loss": 0.2682424485683441
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 23.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0005874505810322678,
+ "loss": 0.0102,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8071834.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005859757773578167,
+ "skip_count": 3.0,
+ "step": 5008,
+ "text_loss": 0.6460036039352417
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.000587145818667364,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8074687.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002868571551516652,
+ "skip_count": 2.0,
+ "step": 5010,
+ "text_loss": 0.2405751347541809
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0005868410229080181,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8077617.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021759893279522657,
+ "skip_count": 1.0,
+ "step": 5012,
+ "text_loss": 0.7455595135688782
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0005865361938710286,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8080734.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008311949786730111,
+ "skip_count": 0.0,
+ "step": 5014,
+ "text_loss": 0.44876906275749207
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 25.0,
+ "epoch": 23.549457000293515,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0005862313316732063,
+ "loss": 0.0054,
+ "macro_f1": 0.9615669250488281,
+ "num_tokens": 8085092.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.012511664070189,
+ "skip_count": 6.0,
+ "step": 5016,
+ "text_loss": 0.26010942459106445
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.000585926436431375,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8088333.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035441694781184196,
+ "skip_count": 0.0,
+ "step": 5018,
+ "text_loss": 0.28225192427635193
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 23.568241855004402,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0005856215082623711,
+ "loss": 0.0093,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 8091298.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023543989285826683,
+ "skip_count": 2.0,
+ "step": 5020,
+ "text_loss": 0.5757577419281006
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.57763428235985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.0005853165472830439,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8094361.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003124240320175886,
+ "skip_count": 0.0,
+ "step": 5022,
+ "text_loss": 0.4021305739879608
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0005850115536102546,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8097514.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008170558139681816,
+ "skip_count": 1.0,
+ "step": 5024,
+ "text_loss": 0.18926584720611572
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 23.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0005847065273608777,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 8100525.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02127663604915142,
+ "skip_count": 5.0,
+ "step": 5026,
+ "text_loss": 0.18827557563781738
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.0005844014686517998,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8104016.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00272122910246253,
+ "skip_count": 0.0,
+ "step": 5028,
+ "text_loss": 0.15534701943397522
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 23.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0005840963775999199,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 8106697.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.008979840204119682,
+ "skip_count": 4.0,
+ "step": 5030,
+ "text_loss": 0.8123718500137329
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0005837912543221493,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8110986.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005006929859519005,
+ "skip_count": 0.0,
+ "step": 5032,
+ "text_loss": 0.26128846406936646
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.0005834860989354121,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8114010.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005531277856789529,
+ "skip_count": 0.0,
+ "step": 5034,
+ "text_loss": 0.5100266933441162
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.64338127384796,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0556640625,
+ "learning_rate": 0.0005831809115566442,
+ "loss": 0.0073,
+ "macro_f1": 0.6538461446762085,
+ "num_tokens": 8117168.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04978533461689949,
+ "skip_count": 1.0,
+ "step": 5036,
+ "text_loss": 0.41049885749816895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0005828756923027941,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8119900.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006322385743260384,
+ "skip_count": 0.0,
+ "step": 5038,
+ "text_loss": 0.5584380626678467
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.662166128558848,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0005825704412908225,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8123928.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001000594231300056,
+ "skip_count": 0.0,
+ "step": 5040,
+ "text_loss": 0.6460791230201721
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0005822651586377019,
+ "loss": 0.0108,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8127926.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011595834977924824,
+ "skip_count": 2.0,
+ "step": 5042,
+ "text_loss": 0.3131820261478424
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 23.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.0005819598444604173,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8131092.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004449303261935711,
+ "skip_count": 3.0,
+ "step": 5044,
+ "text_loss": 0.2774372696876526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0005816544988759658,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8134051.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007877505850046873,
+ "skip_count": 0.0,
+ "step": 5046,
+ "text_loss": 0.39496293663978577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.0005813491220013563,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8138725.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002868623472750187,
+ "skip_count": 0.0,
+ "step": 5048,
+ "text_loss": 0.3779948651790619
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 0.0005810437139536098,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 8141913.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006244937423616648,
+ "skip_count": 4.0,
+ "step": 5050,
+ "text_loss": 0.4512978494167328
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 0.0005807382748497592,
+ "loss": 0.0112,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8146193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011013929033651948,
+ "skip_count": 0.0,
+ "step": 5052,
+ "text_loss": 0.6194499731063843
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0005804328048068493,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8149701.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005505079869180918,
+ "skip_count": 1.0,
+ "step": 5054,
+ "text_loss": 0.2932305335998535
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 23.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0005801273039419368,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 8152861.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0057641929015517235,
+ "skip_count": 1.0,
+ "step": 5056,
+ "text_loss": 0.2631317973136902
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 23.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0005798217723720904,
+ "loss": 0.005,
+ "macro_f1": 1.0,
+ "num_tokens": 8155843.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0021671492140740156,
+ "skip_count": 5.0,
+ "step": 5058,
+ "text_loss": 0.2889988422393799
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0005795162102143902,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8158812.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004476628266274929,
+ "skip_count": 1.0,
+ "step": 5060,
+ "text_loss": 0.48028868436813354
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0005792106175859283,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 8162719.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038497636560350657,
+ "skip_count": 3.0,
+ "step": 5062,
+ "text_loss": 0.4559471607208252
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.0005789049946038083,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8165692.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004451582673937082,
+ "skip_count": 0.0,
+ "step": 5064,
+ "text_loss": 0.3782602548599243
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.784267684179632,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.0005785993413851456,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8168900.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002951978938654065,
+ "skip_count": 0.0,
+ "step": 5066,
+ "text_loss": 0.32392629981040955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.000578293658047067,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8171661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011171254329383373,
+ "skip_count": 2.0,
+ "step": 5068,
+ "text_loss": 0.24492619931697845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0005779879447067109,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8175075.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016067599644884467,
+ "skip_count": 0.0,
+ "step": 5070,
+ "text_loss": 0.7738823294639587
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.812444966245963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.000577682201481227,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8178515.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009113503620028496,
+ "skip_count": 1.0,
+ "step": 5072,
+ "text_loss": 0.2082248032093048
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 23.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0005773764284877774,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 8181790.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007332196459174156,
+ "skip_count": 1.0,
+ "step": 5074,
+ "text_loss": 0.4557662904262543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0537109375,
+ "learning_rate": 0.0005770706258435342,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8184854.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016252279747277498,
+ "skip_count": 0.0,
+ "step": 5076,
+ "text_loss": 0.2888098657131195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.840622248312297,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0005767647936656818,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8187860.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003406575648114085,
+ "skip_count": 0.0,
+ "step": 5078,
+ "text_loss": 0.6533790230751038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0005764589320714158,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8191683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006520140450447798,
+ "skip_count": 0.0,
+ "step": 5080,
+ "text_loss": 0.6903796195983887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0005761530411779426,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8195109.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01188349537551403,
+ "skip_count": 1.0,
+ "step": 5082,
+ "text_loss": 0.20460398495197296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 23.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.083984375,
+ "learning_rate": 0.0005758471211024804,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8198340.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004826809279620647,
+ "skip_count": 3.0,
+ "step": 5084,
+ "text_loss": 0.2203969657421112
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.878191957734078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 0.0005755411719622584,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8200882.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019170823507010937,
+ "skip_count": 0.0,
+ "step": 5086,
+ "text_loss": 0.6744595170021057
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.0005752351938745167,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8203777.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002110893838107586,
+ "skip_count": 1.0,
+ "step": 5088,
+ "text_loss": 0.4137859046459198
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.000574929186956507,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8207627.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018580821342766285,
+ "skip_count": 1.0,
+ "step": 5090,
+ "text_loss": 0.4830456078052521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.906369239800412,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0005746231513254912,
+ "loss": 0.0066,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 8210263.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0194723978638649,
+ "skip_count": 0.0,
+ "step": 5092,
+ "text_loss": 0.17383277416229248
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 23.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0005743170870987433,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8214166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006944256369024515,
+ "skip_count": 2.0,
+ "step": 5094,
+ "text_loss": 0.20003484189510345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0005740109943935472,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8217545.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002044794149696827,
+ "skip_count": 1.0,
+ "step": 5096,
+ "text_loss": 0.5117167830467224
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 23.934546521866746,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 0.0005737048733271986,
+ "loss": 0.0076,
+ "macro_f1": 1.0,
+ "num_tokens": 8220673.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009966124780476093,
+ "skip_count": 2.0,
+ "step": 5098,
+ "text_loss": 0.2705996036529541
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0005733987240170035,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8223796.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009675708715803921,
+ "skip_count": 0.0,
+ "step": 5100,
+ "text_loss": 0.7016357183456421
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.0005730925465802788,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8227048.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009548200177960098,
+ "skip_count": 0.0,
+ "step": 5102,
+ "text_loss": 0.30823078751564026
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0005727863411343526,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8229971.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005767418188042939,
+ "skip_count": 0.0,
+ "step": 5104,
+ "text_loss": 0.6897505521774292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 23.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0005724801077965629,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8232758.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009297889657318592,
+ "skip_count": 3.0,
+ "step": 5106,
+ "text_loss": 0.21293514966964722
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 23.981508658643968,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0005721738466842592,
+ "loss": 0.0079,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 8238154.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013964693062007427,
+ "skip_count": 0.0,
+ "step": 5108,
+ "text_loss": 0.7273620367050171
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 23.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10888671875,
+ "learning_rate": 0.0005718675579148014,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 8240818.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.007218098267912865,
+ "skip_count": 1.0,
+ "step": 5110,
+ "text_loss": 0.5607150793075562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.0005715612416055598,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8244048.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007558444049209356,
+ "skip_count": 2.0,
+ "step": 5112,
+ "text_loss": 0.23694385588169098
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 24.009392427355444,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0005712548978739154,
+ "loss": 0.0072,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 8247240.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015726923942565918,
+ "skip_count": 1.0,
+ "step": 5114,
+ "text_loss": 0.6032099723815918
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 24.01878485471089,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 0.0005709485268372598,
+ "loss": 0.0046,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 8250585.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.011148860678076744,
+ "skip_count": 2.0,
+ "step": 5116,
+ "text_loss": 0.6825997233390808
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0005706421286129948,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8254240.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006977916229516268,
+ "skip_count": 0.0,
+ "step": 5118,
+ "text_loss": 0.2532844543457031
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.037569709421778,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0005703357033185328,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8257133.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006415650714188814,
+ "skip_count": 2.0,
+ "step": 5120,
+ "text_loss": 0.6132124066352844
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 24.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.0005700292510712967,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 8261076.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0044475216418504715,
+ "skip_count": 1.0,
+ "step": 5122,
+ "text_loss": 0.4277699887752533
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.0005697227719887194,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8264607.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005743155721575022,
+ "skip_count": 2.0,
+ "step": 5124,
+ "text_loss": 0.2570968270301819
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0005694162661882444,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8267992.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007581565878354013,
+ "skip_count": 0.0,
+ "step": 5126,
+ "text_loss": 0.5850184559822083
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.07513941884356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0005691097337873252,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8271010.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036611228715628386,
+ "skip_count": 0.0,
+ "step": 5128,
+ "text_loss": 0.660999059677124
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0005688031749034258,
+ "loss": 0.0032,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8273638.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0039906189776957035,
+ "skip_count": 0.0,
+ "step": 5130,
+ "text_loss": 0.5839648246765137
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0005684965896540198,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 8276504.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007539632264524698,
+ "skip_count": 3.0,
+ "step": 5132,
+ "text_loss": 0.27675092220306396
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 24.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0005681899781565915,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8279977.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0026953567285090685,
+ "skip_count": 0.0,
+ "step": 5134,
+ "text_loss": 0.532974123954773
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.000567883340528635,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8282781.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005754240322858095,
+ "skip_count": 1.0,
+ "step": 5136,
+ "text_loss": 0.31100207567214966
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0005675766768876542,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8286533.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0051517849788069725,
+ "skip_count": 0.0,
+ "step": 5138,
+ "text_loss": 0.5734741687774658
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0005672699873511635,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8289858.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025852699764072895,
+ "skip_count": 2.0,
+ "step": 5140,
+ "text_loss": 0.37045374512672424
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0005669632720366868,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8293038.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038520018570125103,
+ "skip_count": 0.0,
+ "step": 5142,
+ "text_loss": 0.25952374935150146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0005666565310617577,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8295717.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00026914477348327637,
+ "skip_count": 0.0,
+ "step": 5144,
+ "text_loss": 0.32531213760375977
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.0005663497645439203,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8299750.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0055860537104308605,
+ "skip_count": 2.0,
+ "step": 5146,
+ "text_loss": 0.2520618438720703
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0005660429726007279,
+ "loss": 0.0092,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8303075.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004446739796549082,
+ "skip_count": 1.0,
+ "step": 5148,
+ "text_loss": 0.43672287464141846
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.17845611975345,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07080078125,
+ "learning_rate": 0.000565736155349744,
+ "loss": 0.0076,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 8306268.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.046915046870708466,
+ "skip_count": 4.0,
+ "step": 5150,
+ "text_loss": 0.35405927896499634
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 24.187848547108892,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0005654293129085412,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8310480.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010549088008701801,
+ "skip_count": 4.0,
+ "step": 5152,
+ "text_loss": 0.3523249626159668
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 24.19724097446434,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0005651224453947023,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8313367.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002893900265917182,
+ "skip_count": 0.0,
+ "step": 5154,
+ "text_loss": 0.4503810703754425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0005648155529258195,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8318006.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018450213829055429,
+ "skip_count": 0.0,
+ "step": 5156,
+ "text_loss": 0.5687127113342285
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.216025829175226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 0.0005645086356194943,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8320646.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026727779768407345,
+ "skip_count": 0.0,
+ "step": 5158,
+ "text_loss": 0.38920050859451294
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0005642016935933385,
+ "loss": 0.0035,
+ "macro_f1": 1.0,
+ "num_tokens": 8323915.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00611621281132102,
+ "skip_count": 2.0,
+ "step": 5160,
+ "text_loss": 0.3003547787666321
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 24.0,
+ "epoch": 24.234810683886117,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.0005638947269649726,
+ "loss": 0.0063,
+ "macro_f1": 0.9619450569152832,
+ "num_tokens": 8327073.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.028447439894080162,
+ "skip_count": 6.0,
+ "step": 5162,
+ "text_loss": 0.24053414165973663
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0005635877358520268,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8330388.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013072624569758773,
+ "skip_count": 0.0,
+ "step": 5164,
+ "text_loss": 0.43772217631340027
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0005632807203721406,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8333241.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009456822881475091,
+ "skip_count": 0.0,
+ "step": 5166,
+ "text_loss": 0.5217573046684265
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 0.000562973680642963,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8337257.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023840824142098427,
+ "skip_count": 0.0,
+ "step": 5168,
+ "text_loss": 0.31814974546432495
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 0.0005626666167821521,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8340143.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020231492817401886,
+ "skip_count": 3.0,
+ "step": 5170,
+ "text_loss": 0.5478505492210388
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0162353515625,
+ "learning_rate": 0.0005623595289073755,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 8343566.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01070715207606554,
+ "skip_count": 2.0,
+ "step": 5172,
+ "text_loss": 0.23213914036750793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.0005620524171363099,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8346836.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003720001084730029,
+ "skip_count": 3.0,
+ "step": 5174,
+ "text_loss": 0.5114789009094238
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 24.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0005617452815866409,
+ "loss": 0.0061,
+ "macro_f1": 1.0,
+ "num_tokens": 8349726.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003322509117424488,
+ "skip_count": 1.0,
+ "step": 5176,
+ "text_loss": 0.4894506335258484
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.309950102729672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0005614381223760635,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8352478.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00028752797516062856,
+ "skip_count": 0.0,
+ "step": 5178,
+ "text_loss": 0.6418307423591614
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.0005611309396222817,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8355766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028724796138703823,
+ "skip_count": 0.0,
+ "step": 5180,
+ "text_loss": 0.23635952174663544
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.328734957440563,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0005608237334430085,
+ "loss": 0.0068,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 8358888.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.058520980179309845,
+ "skip_count": 2.0,
+ "step": 5182,
+ "text_loss": 0.23434793949127197
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1015625,
+ "learning_rate": 0.000560516503955966,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8361761.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021356395445764065,
+ "skip_count": 1.0,
+ "step": 5184,
+ "text_loss": 0.40855672955513
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.000560209251278885,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8364376.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016185789136216044,
+ "skip_count": 0.0,
+ "step": 5186,
+ "text_loss": 0.6265131831169128
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0005599019755295053,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8367769.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031490204855799675,
+ "skip_count": 2.0,
+ "step": 5188,
+ "text_loss": 0.4716353118419647
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 24.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0005595946768255756,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8370705.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003500689286738634,
+ "skip_count": 0.0,
+ "step": 5190,
+ "text_loss": 0.5467679500579834
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.375697094217788,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0005592873552848532,
+ "loss": 0.0045,
+ "macro_f1": 1.0,
+ "num_tokens": 8374217.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010764475911855698,
+ "skip_count": 3.0,
+ "step": 5192,
+ "text_loss": 0.4345340132713318
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 24.38508952157323,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0005589800110251045,
+ "loss": 0.0087,
+ "macro_f1": 1.0,
+ "num_tokens": 8378182.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0010365343187004328,
+ "skip_count": 1.0,
+ "step": 5194,
+ "text_loss": 0.46722909808158875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.394481948928675,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0005586726441641044,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8381227.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006349093746393919,
+ "skip_count": 2.0,
+ "step": 5196,
+ "text_loss": 0.35410359501838684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.403874376284122,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.0005583652548196362,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8384886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00038166221929714084,
+ "skip_count": 0.0,
+ "step": 5198,
+ "text_loss": 0.5950250625610352
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0005580578431094924,
+ "loss": 0.0092,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8388939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023578559048473835,
+ "skip_count": 2.0,
+ "step": 5200,
+ "text_loss": 0.6553771495819092
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0005577504091514735,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8391629.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010771085508167744,
+ "skip_count": 0.0,
+ "step": 5202,
+ "text_loss": 0.4441985785961151
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.432051658350456,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.000557442953063389,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8394440.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005844325292855501,
+ "skip_count": 3.0,
+ "step": 5204,
+ "text_loss": 0.5807011723518372
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.0005571354749630564,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8397731.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006837233901023865,
+ "skip_count": 1.0,
+ "step": 5206,
+ "text_loss": 0.27780941128730774
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.000556827974968302,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8400859.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007656649220734835,
+ "skip_count": 3.0,
+ "step": 5208,
+ "text_loss": 0.4746324121952057
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0005565204531969606,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8404164.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028129038400948048,
+ "skip_count": 1.0,
+ "step": 5210,
+ "text_loss": 0.8513513803482056
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0005562129097668746,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8407196.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00492360582575202,
+ "skip_count": 1.0,
+ "step": 5212,
+ "text_loss": 0.12255420535802841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0005559053447958958,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8410633.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020713545382022858,
+ "skip_count": 0.0,
+ "step": 5214,
+ "text_loss": 0.6878522634506226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.488406222483125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0005555977584018833,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8413414.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007216963567771018,
+ "skip_count": 0.0,
+ "step": 5216,
+ "text_loss": 0.845878541469574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.057861328125,
+ "learning_rate": 0.0005552901507027048,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8416817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002400130731984973,
+ "skip_count": 1.0,
+ "step": 5218,
+ "text_loss": 0.16753672063350677
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 0.0005549825218162365,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8419617.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004563181661069393,
+ "skip_count": 0.0,
+ "step": 5220,
+ "text_loss": 0.26107168197631836
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 24.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.000554674871860362,
+ "loss": 0.0086,
+ "macro_f1": 1.0,
+ "num_tokens": 8422686.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006413881666958332,
+ "skip_count": 1.0,
+ "step": 5222,
+ "text_loss": 0.6333847045898438
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0005543672009529734,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8425571.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0057656955905258656,
+ "skip_count": 3.0,
+ "step": 5224,
+ "text_loss": 0.4552212357521057
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 24.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0005540595092119709,
+ "loss": 0.0082,
+ "macro_f1": 1.0,
+ "num_tokens": 8429038.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.011755156330764294,
+ "skip_count": 2.0,
+ "step": 5226,
+ "text_loss": 0.16597330570220947
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0005537517967552626,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8432117.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007519085193052888,
+ "skip_count": 0.0,
+ "step": 5228,
+ "text_loss": 0.6283590197563171
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.000553444063700764,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8435176.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003066456411033869,
+ "skip_count": 0.0,
+ "step": 5230,
+ "text_loss": 0.2360922247171402
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 0.0005531363101663998,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8438515.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002865589689463377,
+ "skip_count": 0.0,
+ "step": 5232,
+ "text_loss": 0.8075396418571472
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0005528285362701011,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8441731.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012521179160103202,
+ "skip_count": 0.0,
+ "step": 5234,
+ "text_loss": 0.584335446357727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0005525207421298077,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8444535.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005398475099354982,
+ "skip_count": 3.0,
+ "step": 5236,
+ "text_loss": 0.22711622714996338
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0966796875,
+ "learning_rate": 0.0005522129278634669,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8448337.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002957914723083377,
+ "skip_count": 1.0,
+ "step": 5238,
+ "text_loss": 0.3157515823841095
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 24.601115350748458,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.0005519050935890335,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8451530.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007757039275020361,
+ "skip_count": 3.0,
+ "step": 5240,
+ "text_loss": 0.2815830111503601
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 24.610507778103905,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0005515972394244704,
+ "loss": 0.0063,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 8454171.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.021602008491754532,
+ "skip_count": 1.0,
+ "step": 5242,
+ "text_loss": 0.6024490594863892
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.61990020545935,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0005512893654877478,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8457544.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006062488537281752,
+ "skip_count": 0.0,
+ "step": 5244,
+ "text_loss": 0.550110936164856
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.629292632814792,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0005509814718968435,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8460135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002793943975120783,
+ "skip_count": 0.0,
+ "step": 5246,
+ "text_loss": 0.4361286163330078
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.0005506735587697433,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8463516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016669550677761436,
+ "skip_count": 0.0,
+ "step": 5248,
+ "text_loss": 0.4642958641052246
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.0005503656262244395,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8466406.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006051387754268944,
+ "skip_count": 0.0,
+ "step": 5250,
+ "text_loss": 0.3445641100406647
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 24.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0005500576743789329,
+ "loss": 0.0037,
+ "macro_f1": 1.0,
+ "num_tokens": 8468838.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00654293829575181,
+ "skip_count": 1.0,
+ "step": 5252,
+ "text_loss": 0.2842808663845062
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.666862342236573,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0005497497033512309,
+ "loss": 0.0077,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 8471815.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03845973685383797,
+ "skip_count": 3.0,
+ "step": 5254,
+ "text_loss": 0.2597215175628662
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 24.676254769592017,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0005494417132593487,
+ "loss": 0.0047,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 8475202.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02252381667494774,
+ "skip_count": 4.0,
+ "step": 5256,
+ "text_loss": 0.32269927859306335
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.0005491337042213088,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8478650.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01232751365751028,
+ "skip_count": 2.0,
+ "step": 5258,
+ "text_loss": 0.6523372530937195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0005488256763551408,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8481724.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028322834987193346,
+ "skip_count": 0.0,
+ "step": 5260,
+ "text_loss": 0.4212580621242523
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0177001953125,
+ "learning_rate": 0.0005485176297788814,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8485833.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002623105887323618,
+ "skip_count": 2.0,
+ "step": 5262,
+ "text_loss": 0.16906329989433289
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 24.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0005482095646105748,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8489089.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0007179114618338645,
+ "skip_count": 0.0,
+ "step": 5264,
+ "text_loss": 0.4523872137069702
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0005479014809682721,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8492905.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005234059412032366,
+ "skip_count": 0.0,
+ "step": 5266,
+ "text_loss": 0.207139790058136
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.0005475933789700314,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8495480.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023258263245224953,
+ "skip_count": 0.0,
+ "step": 5268,
+ "text_loss": 0.18060965836048126
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.0005472852587339183,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8499070.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013497259933501482,
+ "skip_count": 0.0,
+ "step": 5270,
+ "text_loss": 0.7460769414901733
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.751394188435572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056640625,
+ "learning_rate": 0.0005469771203780048,
+ "loss": 0.0099,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8502886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003589815751183778,
+ "skip_count": 0.0,
+ "step": 5272,
+ "text_loss": 0.48119160532951355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0005466689640203701,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8506646.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006619705818593502,
+ "skip_count": 1.0,
+ "step": 5274,
+ "text_loss": 0.15656520426273346
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0005463607897791005,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8509450.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002992175053805113,
+ "skip_count": 1.0,
+ "step": 5276,
+ "text_loss": 0.486930251121521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0005460525977722886,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8512851.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027784097474068403,
+ "skip_count": 0.0,
+ "step": 5278,
+ "text_loss": 0.19654682278633118
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0005457443881180345,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8516858.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017648129723966122,
+ "skip_count": 0.0,
+ "step": 5280,
+ "text_loss": 0.580982506275177
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0005454361609344444,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 8519912.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010817649774253368,
+ "skip_count": 3.0,
+ "step": 5282,
+ "text_loss": 0.2644204795360565
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.000545127916339632,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8522396.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001453282660804689,
+ "skip_count": 0.0,
+ "step": 5284,
+ "text_loss": 0.5014839172363281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.817141179923688,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0005448196544517168,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8525326.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006645771209150553,
+ "skip_count": 2.0,
+ "step": 5286,
+ "text_loss": 0.2983154058456421
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.0005445113753888254,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8528611.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005447337171062827,
+ "skip_count": 0.0,
+ "step": 5288,
+ "text_loss": 0.43598243594169617
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.000544203079269091,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8531571.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026976624503731728,
+ "skip_count": 0.0,
+ "step": 5290,
+ "text_loss": 0.6454944610595703
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0005438947662106533,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8534565.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002217630622908473,
+ "skip_count": 0.0,
+ "step": 5292,
+ "text_loss": 0.742935836315155
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 29.0,
+ "epoch": 24.854710889345466,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0005435864363316584,
+ "loss": 0.0073,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 8537581.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.030740609392523766,
+ "skip_count": 2.0,
+ "step": 5294,
+ "text_loss": 0.48913639783859253
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0005432780897502588,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8541271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005306888837367296,
+ "skip_count": 1.0,
+ "step": 5296,
+ "text_loss": 0.5820846557617188
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 24.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.0005429697265846137,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8545052.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002255369909107685,
+ "skip_count": 0.0,
+ "step": 5298,
+ "text_loss": 0.565483808517456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0005426613469528881,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8548605.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010787079809233546,
+ "skip_count": 0.0,
+ "step": 5300,
+ "text_loss": 0.40154510736465454
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.000542352950973254,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8552581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017972089117392898,
+ "skip_count": 0.0,
+ "step": 5302,
+ "text_loss": 0.5430748462677002
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 0.0005420445387638891,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8556360.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016180560924112797,
+ "skip_count": 2.0,
+ "step": 5304,
+ "text_loss": 0.544040322303772
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.0005417361104429777,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 8559264.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012688961811363697,
+ "skip_count": 2.0,
+ "step": 5306,
+ "text_loss": 0.2018517404794693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0005414276661287101,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8562169.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012141643092036247,
+ "skip_count": 0.0,
+ "step": 5308,
+ "text_loss": 0.5685747265815735
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.059326171875,
+ "learning_rate": 0.0005411192059392826,
+ "loss": 0.0098,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8565231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015626107342541218,
+ "skip_count": 0.0,
+ "step": 5310,
+ "text_loss": 0.8073471784591675
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.0005408107299928979,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8568122.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004773529712110758,
+ "skip_count": 0.0,
+ "step": 5312,
+ "text_loss": 0.22583355009555817
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 24.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.0005405022384077644,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8571056.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025621228851377964,
+ "skip_count": 1.0,
+ "step": 5314,
+ "text_loss": 0.25274428725242615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 24.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0005401937313020967,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8574300.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009726752527058125,
+ "skip_count": 2.0,
+ "step": 5316,
+ "text_loss": 0.3283393979072571
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 24.967420017610802,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0005398852087941155,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8577424.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012483839876949787,
+ "skip_count": 4.0,
+ "step": 5318,
+ "text_loss": 0.1876130849123001
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.000539576671002047,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8580309.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009830677881836891,
+ "skip_count": 0.0,
+ "step": 5320,
+ "text_loss": 0.6955490708351135
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046875,
+ "learning_rate": 0.0005392681180441235,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8583399.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010819481685757637,
+ "skip_count": 0.0,
+ "step": 5322,
+ "text_loss": 0.4708341956138611
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 24.995597299677137,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.000538959550038583,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8586259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005763369146734476,
+ "skip_count": 0.0,
+ "step": 5324,
+ "text_loss": 0.20463642477989197
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0005386509671036695,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8589067.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006229027640074492,
+ "skip_count": 0.0,
+ "step": 5326,
+ "text_loss": 0.6819888353347778
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 24.0,
+ "epoch": 25.014088641033165,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0005383423693576325,
+ "loss": 0.0087,
+ "macro_f1": 0.9619450569152832,
+ "num_tokens": 8592837.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.030066559091210365,
+ "skip_count": 6.0,
+ "step": 5328,
+ "text_loss": 0.24606549739837646
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0005380337569187272,
+ "loss": 0.0092,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8596293.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007445990107953548,
+ "skip_count": 0.0,
+ "step": 5330,
+ "text_loss": 0.16730253398418427
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 25.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.0005377251299052145,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 8599360.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004563331138342619,
+ "skip_count": 1.0,
+ "step": 5332,
+ "text_loss": 0.6856988668441772
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0005374164884353608,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8602376.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015491938684135675,
+ "skip_count": 0.0,
+ "step": 5334,
+ "text_loss": 1.3248854875564575
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.051658350454947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.0005371078326274382,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8605400.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016098044579848647,
+ "skip_count": 0.0,
+ "step": 5336,
+ "text_loss": 0.747150182723999
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 25.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.0005367991625997243,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8608100.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034471298567950726,
+ "skip_count": 3.0,
+ "step": 5338,
+ "text_loss": 0.6443291902542114
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.070443205165834,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0005364904784705015,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8611768.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007947597652673721,
+ "skip_count": 1.0,
+ "step": 5340,
+ "text_loss": 0.7768037915229797
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 25.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0005361817803580588,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 8614424.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009964234195649624,
+ "skip_count": 2.0,
+ "step": 5342,
+ "text_loss": 0.22826914489269257
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0005358730683806896,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8617826.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014116480015218258,
+ "skip_count": 0.0,
+ "step": 5344,
+ "text_loss": 0.49022090435028076
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 25.098620487232168,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0005355643426566929,
+ "loss": 0.0061,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 8621220.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013940622098743916,
+ "skip_count": 2.0,
+ "step": 5346,
+ "text_loss": 0.26819515228271484
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.108012914587615,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.000535255603304373,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8623957.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032230091746896505,
+ "skip_count": 2.0,
+ "step": 5348,
+ "text_loss": 0.46905452013015747
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.11740534194306,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.0005349468504420395,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8626760.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002631337149068713,
+ "skip_count": 1.0,
+ "step": 5350,
+ "text_loss": 0.5312309861183167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.126797769298502,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.0005346380841880068,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8630207.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004526057746261358,
+ "skip_count": 2.0,
+ "step": 5352,
+ "text_loss": 0.5810666084289551
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.0005343293046605949,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8633241.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023941127583384514,
+ "skip_count": 0.0,
+ "step": 5354,
+ "text_loss": 0.18468725681304932
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0005340205119781288,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8636215.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017020340310409665,
+ "skip_count": 0.0,
+ "step": 5356,
+ "text_loss": 0.6665788888931274
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0005337117062589383,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8639326.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004964717663824558,
+ "skip_count": 2.0,
+ "step": 5358,
+ "text_loss": 0.19770404696464539
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.0005334028876213585,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8642157.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006587155628949404,
+ "skip_count": 0.0,
+ "step": 5360,
+ "text_loss": 0.2295130044221878
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0005330940561837291,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8645355.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006586945964954793,
+ "skip_count": 0.0,
+ "step": 5362,
+ "text_loss": 0.2701159417629242
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0005327852120643947,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8648911.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0014281768817454576,
+ "skip_count": 0.0,
+ "step": 5364,
+ "text_loss": 0.8957229852676392
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.192544760786618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0005324763553817053,
+ "loss": 0.0027,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8652037.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005899337120354176,
+ "skip_count": 0.0,
+ "step": 5366,
+ "text_loss": 0.38642236590385437
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 25.20193718814206,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0005321674862540154,
+ "loss": 0.0058,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 8655381.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.024511313065886497,
+ "skip_count": 1.0,
+ "step": 5368,
+ "text_loss": 0.6439879536628723
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.000531858604799684,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8658476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012558114249259233,
+ "skip_count": 0.0,
+ "step": 5370,
+ "text_loss": 0.3227672874927521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.220722042852948,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06005859375,
+ "learning_rate": 0.0005315497111370752,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8661982.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013541636290028691,
+ "skip_count": 0.0,
+ "step": 5372,
+ "text_loss": 0.6375321745872498
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 25.230114470208395,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 0.0005312408053845575,
+ "loss": 0.0052,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 8665071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010432626120746136,
+ "skip_count": 2.0,
+ "step": 5374,
+ "text_loss": 0.536924421787262
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.0005309318876605042,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8668411.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004450209904462099,
+ "skip_count": 1.0,
+ "step": 5376,
+ "text_loss": 0.2643466889858246
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.248899324919282,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0005306229580832933,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 8672088.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011189920827746391,
+ "skip_count": 3.0,
+ "step": 5378,
+ "text_loss": 0.8259533047676086
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.000530314016771307,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8675206.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020095291547477245,
+ "skip_count": 0.0,
+ "step": 5380,
+ "text_loss": 0.31364113092422485
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.267684179630173,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0005300050638429324,
+ "loss": 0.0078,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 8678289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010738557204604149,
+ "skip_count": 1.0,
+ "step": 5382,
+ "text_loss": 0.19013966619968414
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.0005296960994165607,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8681555.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018534278497099876,
+ "skip_count": 1.0,
+ "step": 5384,
+ "text_loss": 0.762248694896698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.286469034341064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0005293871236105877,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8684413.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009143726900219917,
+ "skip_count": 2.0,
+ "step": 5386,
+ "text_loss": 0.19994212687015533
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 25.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0005290781365434134,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8687450.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.002034468576312065,
+ "skip_count": 0.0,
+ "step": 5388,
+ "text_loss": 0.5519160628318787
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0005287691383334425,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8690651.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006834167055785656,
+ "skip_count": 0.0,
+ "step": 5390,
+ "text_loss": 0.5439304709434509
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.314646316407398,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060791015625,
+ "learning_rate": 0.0005284601290990832,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8693929.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0022327799815684557,
+ "skip_count": 0.0,
+ "step": 5392,
+ "text_loss": 0.24108269810676575
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0005281511089587491,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8696727.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002669565612450242,
+ "skip_count": 0.0,
+ "step": 5394,
+ "text_loss": 0.8659077286720276
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0005278420780308568,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8700934.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007252473384141922,
+ "skip_count": 0.0,
+ "step": 5396,
+ "text_loss": 0.5592793226242065
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 0.0005275330364338276,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8704449.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001793015981093049,
+ "skip_count": 0.0,
+ "step": 5398,
+ "text_loss": 0.5211784243583679
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 25.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 0.0005272239842860868,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 8707384.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.00963665172457695,
+ "skip_count": 4.0,
+ "step": 5400,
+ "text_loss": 0.6092788577079773
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 25.36160845318462,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0005269149217060642,
+ "loss": 0.0059,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 8710453.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01758105307817459,
+ "skip_count": 2.0,
+ "step": 5402,
+ "text_loss": 0.3423936069011688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0005266058488121926,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8713514.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025636721402406693,
+ "skip_count": 1.0,
+ "step": 5404,
+ "text_loss": 0.484171986579895
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.38039330789551,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0005262967657229095,
+ "loss": 0.0064,
+ "macro_f1": 0.9255813956260681,
+ "num_tokens": 8717051.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.022406045347452164,
+ "skip_count": 4.0,
+ "step": 5406,
+ "text_loss": 0.23368191719055176
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0005259876725566563,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8719987.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004114408977329731,
+ "skip_count": 2.0,
+ "step": 5408,
+ "text_loss": 0.20237496495246887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.399178162606397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.000525678569431878,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8723258.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006741158664226532,
+ "skip_count": 2.0,
+ "step": 5410,
+ "text_loss": 0.7969435453414917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.0005253694564670233,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8726294.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034468702506273985,
+ "skip_count": 0.0,
+ "step": 5412,
+ "text_loss": 0.5533816814422607
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.000525060333780545,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8729603.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01086533535271883,
+ "skip_count": 2.0,
+ "step": 5414,
+ "text_loss": 0.31856611371040344
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 25.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0005247512014908998,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8733423.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00512756546959281,
+ "skip_count": 6.0,
+ "step": 5416,
+ "text_loss": 0.6710903644561768
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06103515625,
+ "learning_rate": 0.0005244420597165472,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8736457.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026201079599559307,
+ "skip_count": 0.0,
+ "step": 5418,
+ "text_loss": 0.6469964981079102
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.0005241329085759514,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8739617.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004130818881094456,
+ "skip_count": 0.0,
+ "step": 5420,
+ "text_loss": 0.4868837296962738
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0005238237481875795,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8742653.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003171122632920742,
+ "skip_count": 0.0,
+ "step": 5422,
+ "text_loss": 0.12026242166757584
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.464925154094512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0634765625,
+ "learning_rate": 0.0005235145786699021,
+ "loss": 0.0091,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8745835.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008553664083592594,
+ "skip_count": 0.0,
+ "step": 5424,
+ "text_loss": 0.601640522480011
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0005232054001413941,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8749006.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006958908052183688,
+ "skip_count": 0.0,
+ "step": 5426,
+ "text_loss": 0.7083519101142883
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 0.0005228962127205329,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8752493.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012221037177368999,
+ "skip_count": 1.0,
+ "step": 5428,
+ "text_loss": 0.3949109613895416
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0005225870165257997,
+ "loss": 0.0079,
+ "macro_f1": 1.0,
+ "num_tokens": 8755294.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003924673888832331,
+ "skip_count": 2.0,
+ "step": 5430,
+ "text_loss": 0.7487186789512634
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0005222778116756793,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8758043.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002388258930295706,
+ "skip_count": 0.0,
+ "step": 5432,
+ "text_loss": 0.4092858135700226
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0005219685982886594,
+ "loss": 0.0037,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8760618.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0045886957086622715,
+ "skip_count": 0.0,
+ "step": 5434,
+ "text_loss": 0.5889580249786377
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 0.0005216593764832311,
+ "loss": 0.0074,
+ "macro_f1": 1.0,
+ "num_tokens": 8764269.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00704155582934618,
+ "skip_count": 2.0,
+ "step": 5436,
+ "text_loss": 0.2634117007255554
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0005213501463778889,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8767142.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00368728069588542,
+ "skip_count": 2.0,
+ "step": 5438,
+ "text_loss": 0.3512301445007324
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05322265625,
+ "learning_rate": 0.0005210409080911304,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8770239.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012925115879625082,
+ "skip_count": 0.0,
+ "step": 5440,
+ "text_loss": 0.9330073595046997
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0005207316617414561,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8772927.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005604506935924292,
+ "skip_count": 0.0,
+ "step": 5442,
+ "text_loss": 0.23477613925933838
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.55884942764896,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 0.0005204224074473701,
+ "loss": 0.0049,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 8776451.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010945434682071209,
+ "skip_count": 2.0,
+ "step": 5444,
+ "text_loss": 0.6184295415878296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.0005201131453273789,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8779481.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024414353538304567,
+ "skip_count": 0.0,
+ "step": 5446,
+ "text_loss": 0.16186967492103577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.57763428235985,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0005198038754999926,
+ "loss": 0.0052,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 8782425.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013872416689991951,
+ "skip_count": 0.0,
+ "step": 5448,
+ "text_loss": 0.42294546961784363
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0005194945980837237,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8785466.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006147907115519047,
+ "skip_count": 0.0,
+ "step": 5450,
+ "text_loss": 0.6285432577133179
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0005191853131970881,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8788461.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010585964191704988,
+ "skip_count": 0.0,
+ "step": 5452,
+ "text_loss": 0.6032317876815796
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.0005188760209586044,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8791572.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005267909727990627,
+ "skip_count": 1.0,
+ "step": 5454,
+ "text_loss": 0.3015609681606293
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0005185667214867937,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8794697.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000532392121385783,
+ "skip_count": 0.0,
+ "step": 5456,
+ "text_loss": 0.9596265554428101
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0005182574149001805,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8797880.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007176774088293314,
+ "skip_count": 0.0,
+ "step": 5458,
+ "text_loss": 0.5599364638328552
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0005179481013172912,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8801995.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022756673861294985,
+ "skip_count": 0.0,
+ "step": 5460,
+ "text_loss": 0.47327280044555664
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0005176387808566558,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8805138.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025084633380174637,
+ "skip_count": 0.0,
+ "step": 5462,
+ "text_loss": 0.26674970984458923
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.0005173294536368061,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8808102.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008814680040813982,
+ "skip_count": 0.0,
+ "step": 5464,
+ "text_loss": 0.5981299877166748
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.662166128558848,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0005170201197762773,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8811431.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005443177651613951,
+ "skip_count": 0.0,
+ "step": 5466,
+ "text_loss": 1.037438988685608
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0005167107793936065,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8814256.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000494555220939219,
+ "skip_count": 0.0,
+ "step": 5468,
+ "text_loss": 0.5005733966827393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0005164014326073333,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8817024.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004793747793883085,
+ "skip_count": 2.0,
+ "step": 5470,
+ "text_loss": 0.6999614834785461
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.0005160920795360002,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8819892.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020966180600225925,
+ "skip_count": 0.0,
+ "step": 5472,
+ "text_loss": 0.5536707043647766
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.0005157827202981521,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8822928.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020367507822811604,
+ "skip_count": 0.0,
+ "step": 5474,
+ "text_loss": 0.43655988574028015
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0005154733550123356,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8825842.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020070383325219154,
+ "skip_count": 0.0,
+ "step": 5476,
+ "text_loss": 0.48149657249450684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0005151639837971004,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8829534.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016327418852597475,
+ "skip_count": 0.0,
+ "step": 5478,
+ "text_loss": 0.6693689227104187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.000514854606770998,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8833177.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012691980227828026,
+ "skip_count": 0.0,
+ "step": 5480,
+ "text_loss": 0.44926801323890686
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0005145452240525822,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8836933.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0007724820752628148,
+ "skip_count": 0.0,
+ "step": 5482,
+ "text_loss": 0.5759884119033813
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 25.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0005142358357604092,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 8840093.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008331702090799809,
+ "skip_count": 7.0,
+ "step": 5484,
+ "text_loss": 0.47393685579299927
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0005139264420130368,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8843918.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003124477108940482,
+ "skip_count": 2.0,
+ "step": 5486,
+ "text_loss": 0.5298711061477661
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.08447265625,
+ "learning_rate": 0.0005136170429290259,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8846558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034127775579690933,
+ "skip_count": 2.0,
+ "step": 5488,
+ "text_loss": 0.43582668900489807
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.0005133076386269383,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8849724.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0018056259723380208,
+ "skip_count": 0.0,
+ "step": 5490,
+ "text_loss": 0.8116800785064697
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 25.784267684179632,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0005129982292253384,
+ "loss": 0.0063,
+ "macro_f1": 0.6589147448539734,
+ "num_tokens": 8852447.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.021452350541949272,
+ "skip_count": 6.0,
+ "step": 5492,
+ "text_loss": 0.31878748536109924
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0005126888148427927,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8855886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026911941822618246,
+ "skip_count": 0.0,
+ "step": 5494,
+ "text_loss": 0.4021807909011841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 25.80305253889052,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0005123793955978693,
+ "loss": 0.007,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 8859378.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019764510914683342,
+ "skip_count": 2.0,
+ "step": 5496,
+ "text_loss": 0.21608132123947144
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.812444966245963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.0005120699716091379,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8862310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008988190093077719,
+ "skip_count": 0.0,
+ "step": 5498,
+ "text_loss": 0.34666743874549866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0005117605429951707,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8865166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011137975379824638,
+ "skip_count": 2.0,
+ "step": 5500,
+ "text_loss": 0.25385144352912903
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 25.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0005114511098745412,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 8869923.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006476947572082281,
+ "skip_count": 4.0,
+ "step": 5502,
+ "text_loss": 0.4503856301307678
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.840622248312297,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.000511141672365825,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8872451.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022727579344063997,
+ "skip_count": 0.0,
+ "step": 5504,
+ "text_loss": 0.7522464990615845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.0005108322305875987,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8875968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020014268811792135,
+ "skip_count": 0.0,
+ "step": 5506,
+ "text_loss": 0.30184176564216614
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04833984375,
+ "learning_rate": 0.0005105227846584414,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8879705.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001179999322630465,
+ "skip_count": 0.0,
+ "step": 5508,
+ "text_loss": 0.6187804937362671
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0005102133346969329,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8883535.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002946492750197649,
+ "skip_count": 0.0,
+ "step": 5510,
+ "text_loss": 0.5961501002311707
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.878191957734078,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.0005099038808216555,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 8886683.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004532935563474894,
+ "skip_count": 3.0,
+ "step": 5512,
+ "text_loss": 0.38462957739830017
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0005095944231511922,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8891049.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00917842984199524,
+ "skip_count": 2.0,
+ "step": 5514,
+ "text_loss": 0.27541956305503845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0005092849618041279,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8893604.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008756510796956718,
+ "skip_count": 0.0,
+ "step": 5516,
+ "text_loss": 0.681315541267395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.906369239800412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0005089754968990487,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8898072.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008704439387656748,
+ "skip_count": 1.0,
+ "step": 5518,
+ "text_loss": 0.5060005187988281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0005086660285545422,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8901539.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004750201944261789,
+ "skip_count": 1.0,
+ "step": 5520,
+ "text_loss": 0.6008047461509705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 25.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.000508356556889197,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8904525.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026552649214863777,
+ "skip_count": 0.0,
+ "step": 5522,
+ "text_loss": 0.4539012908935547
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 25.934546521866746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0005080470820216037,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8907624.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002621029270812869,
+ "skip_count": 1.0,
+ "step": 5524,
+ "text_loss": 0.20088370144367218
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 25.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0005077376040703533,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8910515.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.0028921898920089006,
+ "skip_count": 0.0,
+ "step": 5526,
+ "text_loss": 0.6575983166694641
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8888888955116272,
+ "avg_layers": 21.0,
+ "epoch": 25.953331376577633,
+ "f1_execute": 0.9729729890823364,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9411765336990356,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0005074281231540384,
+ "loss": 0.0076,
+ "macro_f1": 0.9713832139968872,
+ "num_tokens": 8914419.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.024232301861047745,
+ "skip_count": 9.0,
+ "step": 5528,
+ "text_loss": 0.5435594916343689
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 25.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.0005071186393912527,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8917543.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003731841454282403,
+ "skip_count": 2.0,
+ "step": 5530,
+ "text_loss": 0.5152071118354797
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0005068091529005909,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8920728.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005905418191105127,
+ "skip_count": 0.0,
+ "step": 5532,
+ "text_loss": 0.29741042852401733
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 25.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.000506499663800649,
+ "loss": 0.0096,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8924112.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0021933517418801785,
+ "skip_count": 0.0,
+ "step": 5534,
+ "text_loss": 0.45704230666160583
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 25.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0005061901722100235,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8927323.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009227502159774303,
+ "skip_count": 4.0,
+ "step": 5536,
+ "text_loss": 0.1968434453010559
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.0,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0005058806782473125,
+ "loss": 0.0053,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 8931052.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02054760232567787,
+ "skip_count": 2.0,
+ "step": 5538,
+ "text_loss": 0.23851273953914642
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.0005055711820311144,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8934215.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008434011251665652,
+ "skip_count": 0.0,
+ "step": 5540,
+ "text_loss": 0.85942542552948
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 26.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0005052616836800288,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8937173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011105241253972054,
+ "skip_count": 4.0,
+ "step": 5542,
+ "text_loss": 0.2614556849002838
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0005049521833126561,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8940553.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006273435428738594,
+ "skip_count": 0.0,
+ "step": 5544,
+ "text_loss": 0.6430498957633972
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.037569709421778,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0005046426810475976,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8943753.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023464353289455175,
+ "skip_count": 1.0,
+ "step": 5546,
+ "text_loss": 0.7015808820724487
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06689453125,
+ "learning_rate": 0.0005043331770034547,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8947149.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016024730866774917,
+ "skip_count": 1.0,
+ "step": 5548,
+ "text_loss": 0.5875257253646851
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.0005040236712988304,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8950374.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004096277989447117,
+ "skip_count": 0.0,
+ "step": 5550,
+ "text_loss": 0.1712338626384735
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 26.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0005037141640523275,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8953256.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00441550649702549,
+ "skip_count": 0.0,
+ "step": 5552,
+ "text_loss": 0.16560404002666473
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.07513941884356,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0005034046553825501,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 8956845.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.011712636798620224,
+ "skip_count": 6.0,
+ "step": 5554,
+ "text_loss": 0.24278216063976288
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0005030951454081023,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8961165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00235542468726635,
+ "skip_count": 1.0,
+ "step": 5556,
+ "text_loss": 0.17214511334896088
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.093924273554446,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0005027856342475888,
+ "loss": 0.0037,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 8965262.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0160827673971653,
+ "skip_count": 1.0,
+ "step": 5558,
+ "text_loss": 0.40229740738868713
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 26.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.0005024761220196151,
+ "loss": 0.0091,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8968278.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004786997567862272,
+ "skip_count": 0.0,
+ "step": 5560,
+ "text_loss": 0.24828575551509857
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 26.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.0005021666088427868,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8971443.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015378865646198392,
+ "skip_count": 0.0,
+ "step": 5562,
+ "text_loss": 0.7269657254219055
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.0005018570948357099,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8975312.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015218508196994662,
+ "skip_count": 0.0,
+ "step": 5564,
+ "text_loss": 0.5198811292648315
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0005015475801169908,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8977951.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008865317329764366,
+ "skip_count": 1.0,
+ "step": 5566,
+ "text_loss": 0.1541406810283661
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 26.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0005012380648052359,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8981325.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0055318837985396385,
+ "skip_count": 0.0,
+ "step": 5568,
+ "text_loss": 0.510314404964447
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0005009285490190523,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8984661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035060355439782143,
+ "skip_count": 0.0,
+ "step": 5570,
+ "text_loss": 0.29421761631965637
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.000500619032877047,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8987573.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0050126477144658566,
+ "skip_count": 2.0,
+ "step": 5572,
+ "text_loss": 0.1984361708164215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.0005003095164978271,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 8991136.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019407360814511776,
+ "skip_count": 0.0,
+ "step": 5574,
+ "text_loss": 0.42751404643058777
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0005,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8994198.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029819176997989416,
+ "skip_count": 2.0,
+ "step": 5576,
+ "text_loss": 0.20589640736579895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.187848547108892,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0004996904835021729,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 8997907.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000878945691511035,
+ "skip_count": 1.0,
+ "step": 5578,
+ "text_loss": 0.2801406979560852
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.19724097446434,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.000499380967122953,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9001141.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005223734769970179,
+ "skip_count": 1.0,
+ "step": 5580,
+ "text_loss": 0.20542480051517487
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0004990714509809478,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9004794.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015868612099438906,
+ "skip_count": 0.0,
+ "step": 5582,
+ "text_loss": 0.32094934582710266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 26.216025829175226,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0004987619351947643,
+ "loss": 0.0064,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 9009250.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.031923454254865646,
+ "skip_count": 4.0,
+ "step": 5584,
+ "text_loss": 0.609201967716217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.0004984524198830095,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9013254.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033124545589089394,
+ "skip_count": 0.0,
+ "step": 5586,
+ "text_loss": 0.3698650300502777
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0004981429051642903,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9016598.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017190382350236177,
+ "skip_count": 1.0,
+ "step": 5588,
+ "text_loss": 0.5306026935577393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.24420311124156,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0004978333911572132,
+ "loss": 0.0059,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9019558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02051064372062683,
+ "skip_count": 1.0,
+ "step": 5590,
+ "text_loss": 0.23494470119476318
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.0004975238779803849,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9023024.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010489600244909525,
+ "skip_count": 0.0,
+ "step": 5592,
+ "text_loss": 0.579275906085968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0004972143657524112,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9026161.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012039231369271874,
+ "skip_count": 0.0,
+ "step": 5594,
+ "text_loss": 0.5776295065879822
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0004969048545918978,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9028814.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010212450288236141,
+ "skip_count": 1.0,
+ "step": 5596,
+ "text_loss": 0.6816855669021606
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 26.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00049659534461745,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9032243.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0024297661148011684,
+ "skip_count": 0.0,
+ "step": 5598,
+ "text_loss": 0.743188202381134
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0004962858359476726,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9035493.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002151754219084978,
+ "skip_count": 0.0,
+ "step": 5600,
+ "text_loss": 0.5213983654975891
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0004959763287011698,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9038213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028108188416808844,
+ "skip_count": 2.0,
+ "step": 5602,
+ "text_loss": 0.5128397345542908
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.309950102729672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0004956668229965454,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9041152.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004022551700472832,
+ "skip_count": 2.0,
+ "step": 5604,
+ "text_loss": 0.15361636877059937
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0004953573189524026,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9044503.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010689410846680403,
+ "skip_count": 1.0,
+ "step": 5606,
+ "text_loss": 0.6454885005950928
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0004950478166873439,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9047742.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025760293938219547,
+ "skip_count": 0.0,
+ "step": 5608,
+ "text_loss": 0.7654000520706177
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0004947383163199713,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9050349.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009846165776252747,
+ "skip_count": 0.0,
+ "step": 5610,
+ "text_loss": 0.41533342003822327
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 0.0004944288179688858,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9053667.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017193946987390518,
+ "skip_count": 1.0,
+ "step": 5612,
+ "text_loss": 1.0172475576400757
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0004941193217526875,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9056777.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026750199031084776,
+ "skip_count": 0.0,
+ "step": 5614,
+ "text_loss": 0.17584927380084991
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 26.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0004938098277899765,
+ "loss": 0.0068,
+ "macro_f1": 1.0,
+ "num_tokens": 9060609.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005259076599031687,
+ "skip_count": 1.0,
+ "step": 5616,
+ "text_loss": 0.5522297024726868
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.375697094217788,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0004935003361993511,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9063633.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006837095716036856,
+ "skip_count": 0.0,
+ "step": 5618,
+ "text_loss": 0.5212588310241699
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 26.38508952157323,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0004931908470994091,
+ "loss": 0.0059,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 9067777.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01067375484853983,
+ "skip_count": 1.0,
+ "step": 5620,
+ "text_loss": 0.5515062808990479
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 26.394481948928675,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 0.0004928813606087474,
+ "loss": 0.0043,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 9070938.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016635602340102196,
+ "skip_count": 3.0,
+ "step": 5622,
+ "text_loss": 0.3225076198577881
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.403874376284122,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0004925718768459617,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9074050.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002216119086369872,
+ "skip_count": 0.0,
+ "step": 5624,
+ "text_loss": 0.32438889145851135
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 26.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0004922623959296469,
+ "loss": 0.0082,
+ "macro_f1": 1.0,
+ "num_tokens": 9076785.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012125075794756413,
+ "skip_count": 5.0,
+ "step": 5626,
+ "text_loss": 0.39563658833503723
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.0004919529179783965,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9080239.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026486809365451336,
+ "skip_count": 0.0,
+ "step": 5628,
+ "text_loss": 0.5401569604873657
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.432051658350456,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0004916434431108031,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9083935.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011849761940538883,
+ "skip_count": 0.0,
+ "step": 5630,
+ "text_loss": 0.4798774719238281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.000491333971445458,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9087174.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002799210138618946,
+ "skip_count": 0.0,
+ "step": 5632,
+ "text_loss": 0.22488386929035187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0004910245031009515,
+ "loss": 0.0096,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9089803.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00139117450453341,
+ "skip_count": 0.0,
+ "step": 5634,
+ "text_loss": 0.6237335205078125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0004907150381958723,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9093075.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006503603886812925,
+ "skip_count": 1.0,
+ "step": 5636,
+ "text_loss": 0.18781614303588867
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.0004904055768488077,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9096355.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009764843271113932,
+ "skip_count": 0.0,
+ "step": 5638,
+ "text_loss": 0.6821450591087341
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0004900961191783445,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 9098994.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00693159457296133,
+ "skip_count": 3.0,
+ "step": 5640,
+ "text_loss": 0.214790940284729
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.488406222483125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0004897866653030671,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9102048.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002469591563567519,
+ "skip_count": 0.0,
+ "step": 5642,
+ "text_loss": 0.1556607335805893
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0004894772153415588,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9105379.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004824921488761902,
+ "skip_count": 0.0,
+ "step": 5644,
+ "text_loss": 0.499972403049469
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0004891677694124013,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9108240.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029356612358242273,
+ "skip_count": 1.0,
+ "step": 5646,
+ "text_loss": 0.5169754028320312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0174560546875,
+ "learning_rate": 0.0004888583276341751,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9111381.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009489183314144611,
+ "skip_count": 1.0,
+ "step": 5648,
+ "text_loss": 0.23630797863006592
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.017822265625,
+ "learning_rate": 0.0004885488901254588,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9114015.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004154495894908905,
+ "skip_count": 1.0,
+ "step": 5650,
+ "text_loss": 0.3345947563648224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0546875,
+ "learning_rate": 0.0004882394570048294,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9117044.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018865863094106317,
+ "skip_count": 0.0,
+ "step": 5652,
+ "text_loss": 0.32814112305641174
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.0004879300283908623,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9120035.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035278978757560253,
+ "skip_count": 1.0,
+ "step": 5654,
+ "text_loss": 0.4081386625766754
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 26.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.00048762060440213096,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9122955.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0053498269990086555,
+ "skip_count": 0.0,
+ "step": 5656,
+ "text_loss": 0.31027838587760925
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0004873111851572075,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9125635.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004556098487228155,
+ "skip_count": 0.0,
+ "step": 5658,
+ "text_loss": 0.25703540444374084
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0004870017707746617,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9128906.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031165245454758406,
+ "skip_count": 2.0,
+ "step": 5660,
+ "text_loss": 0.20663656294345856
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0004866923613730617,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 9132030.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004887583665549755,
+ "skip_count": 2.0,
+ "step": 5662,
+ "text_loss": 0.6062649488449097
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0004863829570709741,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9135274.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021857863757759333,
+ "skip_count": 0.0,
+ "step": 5664,
+ "text_loss": 0.49644309282302856
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 26.601115350748458,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0004860735579869631,
+ "loss": 0.0088,
+ "macro_f1": 0.925203263759613,
+ "num_tokens": 9139735.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.05413912236690521,
+ "skip_count": 5.0,
+ "step": 5666,
+ "text_loss": 0.25161290168762207
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.610507778103905,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00048576416423959097,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9142419.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002229376696050167,
+ "skip_count": 0.0,
+ "step": 5668,
+ "text_loss": 0.5332949161529541
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 26.61990020545935,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0004854547759474179,
+ "loss": 0.0045,
+ "macro_f1": 1.0,
+ "num_tokens": 9145443.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005968933925032616,
+ "skip_count": 4.0,
+ "step": 5670,
+ "text_loss": 0.5282154083251953
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.629292632814792,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060302734375,
+ "learning_rate": 0.0004851453932290021,
+ "loss": 0.0085,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9147754.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.04015754163265228,
+ "skip_count": 1.0,
+ "step": 5672,
+ "text_loss": 0.8564629554748535
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.63868506017024,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00048483601620289974,
+ "loss": 0.0058,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 9151714.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.019172413274645805,
+ "skip_count": 2.0,
+ "step": 5674,
+ "text_loss": 0.4149441123008728
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 26.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0004845266449876645,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9154524.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005025535821914673,
+ "skip_count": 0.0,
+ "step": 5676,
+ "text_loss": 0.26525792479515076
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.000484217279701848,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9158546.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012200147612020373,
+ "skip_count": 0.0,
+ "step": 5678,
+ "text_loss": 0.5532271862030029
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.666862342236573,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0004839079204639998,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9161003.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013485675444826484,
+ "skip_count": 1.0,
+ "step": 5680,
+ "text_loss": 0.36826151609420776
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.0004835985673926668,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9164741.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00532014574855566,
+ "skip_count": 2.0,
+ "step": 5682,
+ "text_loss": 0.16154609620571136
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0004832892206063938,
+ "loss": 0.0075,
+ "macro_f1": 1.0,
+ "num_tokens": 9168079.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.007782323285937309,
+ "skip_count": 3.0,
+ "step": 5684,
+ "text_loss": 0.4323575496673584
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.0004829798802237228,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9171352.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024159469176083803,
+ "skip_count": 2.0,
+ "step": 5686,
+ "text_loss": 0.3163119852542877
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.000482670546363194,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9175197.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002455134643241763,
+ "skip_count": 0.0,
+ "step": 5688,
+ "text_loss": 0.59735506772995
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.713824479013795,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0004823612191433443,
+ "loss": 0.0042,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 9177648.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.015524548478424549,
+ "skip_count": 2.0,
+ "step": 5690,
+ "text_loss": 0.759812593460083
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.00048205189868270887,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9180694.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002112736226990819,
+ "skip_count": 2.0,
+ "step": 5692,
+ "text_loss": 0.3516882061958313
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 28.0,
+ "epoch": 26.732609333724685,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.00048174258509981973,
+ "loss": 0.0063,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 9183502.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03100527822971344,
+ "skip_count": 3.0,
+ "step": 5694,
+ "text_loss": 0.3722715973854065
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0004814332785132064,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9186417.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009176591411232948,
+ "skip_count": 2.0,
+ "step": 5696,
+ "text_loss": 0.33363673090934753
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.751394188435572,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0004811239790413958,
+ "loss": 0.0076,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9189478.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023586507886648178,
+ "skip_count": 1.0,
+ "step": 5698,
+ "text_loss": 0.19698107242584229
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00048081468680291194,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9192115.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005083440337330103,
+ "skip_count": 1.0,
+ "step": 5700,
+ "text_loss": 0.3476336896419525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0004805054019162764,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9195176.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007766073569655418,
+ "skip_count": 1.0,
+ "step": 5702,
+ "text_loss": 0.27114811539649963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0004801961245000076,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9199091.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009058842551894486,
+ "skip_count": 0.0,
+ "step": 5704,
+ "text_loss": 0.6249846816062927
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0004798868546726212,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9202003.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005479823332279921,
+ "skip_count": 0.0,
+ "step": 5706,
+ "text_loss": 0.47223609685897827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0166015625,
+ "learning_rate": 0.00047957759255263014,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9205277.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001055705244652927,
+ "skip_count": 0.0,
+ "step": 5708,
+ "text_loss": 0.677215576171875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.00047926833825854377,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9208844.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003291431115940213,
+ "skip_count": 2.0,
+ "step": 5710,
+ "text_loss": 0.12439999729394913
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.817141179923688,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06591796875,
+ "learning_rate": 0.0004789590919088696,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9211619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005120242480188608,
+ "skip_count": 2.0,
+ "step": 5712,
+ "text_loss": 0.5771954655647278
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0004786498536221111,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 9214914.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004877795465290546,
+ "skip_count": 2.0,
+ "step": 5714,
+ "text_loss": 0.6432198882102966
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.00047834062351676893,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9218186.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026507999282330275,
+ "skip_count": 0.0,
+ "step": 5716,
+ "text_loss": 0.23814935982227325
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.00047803140171134075,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9221754.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002605629386380315,
+ "skip_count": 1.0,
+ "step": 5718,
+ "text_loss": 0.2910388708114624
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 26.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0004777221883243208,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9224502.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0048494706861674786,
+ "skip_count": 3.0,
+ "step": 5720,
+ "text_loss": 0.6195104122161865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0004774129834742004,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9227350.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003092368133366108,
+ "skip_count": 0.0,
+ "step": 5722,
+ "text_loss": 0.35447990894317627
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.00047710378727946725,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9230166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012780336663126945,
+ "skip_count": 2.0,
+ "step": 5724,
+ "text_loss": 0.27581867575645447
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00047679459985860604,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9233029.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005429140292108059,
+ "skip_count": 1.0,
+ "step": 5726,
+ "text_loss": 0.2636827826499939
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.00047648542133009794,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9236317.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023909916635602713,
+ "skip_count": 0.0,
+ "step": 5728,
+ "text_loss": 0.4801979064941406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.00047617625181242077,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9239796.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003603481687605381,
+ "skip_count": 0.0,
+ "step": 5730,
+ "text_loss": 0.8374754786491394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0004758670914240488,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9243489.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004478964954614639,
+ "skip_count": 2.0,
+ "step": 5732,
+ "text_loss": 0.3870154917240143
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.000475557940283453,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9246758.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00312575395219028,
+ "skip_count": 1.0,
+ "step": 5734,
+ "text_loss": 0.42341071367263794
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 26.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.00047524879850910026,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9250053.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010855631902813911,
+ "skip_count": 4.0,
+ "step": 5736,
+ "text_loss": 0.25729796290397644
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 26.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0004749396662194549,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9253691.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009250419097952545,
+ "skip_count": 0.0,
+ "step": 5738,
+ "text_loss": 0.6151770949363708
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0004746305435329767,
+ "loss": 0.0064,
+ "macro_f1": 1.0,
+ "num_tokens": 9256866.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007521102204918861,
+ "skip_count": 3.0,
+ "step": 5740,
+ "text_loss": 0.3094986379146576
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0004743214305681221,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9259790.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022241887636482716,
+ "skip_count": 1.0,
+ "step": 5742,
+ "text_loss": 0.5418204069137573
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 26.967420017610802,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.00047401232744334376,
+ "loss": 0.0071,
+ "macro_f1": 1.0,
+ "num_tokens": 9263205.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008611299097537994,
+ "skip_count": 2.0,
+ "step": 5744,
+ "text_loss": 0.35824623703956604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 26.976812444966246,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0004737032342770906,
+ "loss": 0.0062,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 9266126.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010788857005536556,
+ "skip_count": 2.0,
+ "step": 5746,
+ "text_loss": 0.2172674983739853
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0004733941511878074,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9269308.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005309196189045906,
+ "skip_count": 2.0,
+ "step": 5748,
+ "text_loss": 0.1696814000606537
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 26.995597299677137,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.00047308507829393594,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9272801.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009940510615706444,
+ "skip_count": 2.0,
+ "step": 5750,
+ "text_loss": 0.24295592308044434
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00047277601571391314,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9276197.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000687236781232059,
+ "skip_count": 0.0,
+ "step": 5752,
+ "text_loss": 0.8511804342269897
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.014088641033165,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00047246696356617254,
+ "loss": 0.0059,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 9278965.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009816894307732582,
+ "skip_count": 1.0,
+ "step": 5754,
+ "text_loss": 0.45420053601264954
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.0004721579219691434,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9282076.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015747188590466976,
+ "skip_count": 0.0,
+ "step": 5756,
+ "text_loss": 0.21671754121780396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0004718488910412511,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9285465.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008654040284454823,
+ "skip_count": 2.0,
+ "step": 5758,
+ "text_loss": 0.25920194387435913
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.00047153987090091674,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9288156.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011430777376517653,
+ "skip_count": 0.0,
+ "step": 5760,
+ "text_loss": 0.7655444741249084
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.051658350454947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0004712308616665576,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9291529.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003674200503155589,
+ "skip_count": 2.0,
+ "step": 5762,
+ "text_loss": 0.269486665725708
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0004709218634565866,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9294699.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003249827306717634,
+ "skip_count": 1.0,
+ "step": 5764,
+ "text_loss": 0.5073734521865845
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.070443205165834,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.00047061287638941235,
+ "loss": 0.0068,
+ "macro_f1": 1.0,
+ "num_tokens": 9297863.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002763139782473445,
+ "skip_count": 2.0,
+ "step": 5766,
+ "text_loss": 0.2572014033794403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 27.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.00047030390058343935,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9301124.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007100266870111227,
+ "skip_count": 3.0,
+ "step": 5768,
+ "text_loss": 0.4147387742996216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0186767578125,
+ "learning_rate": 0.0004699949361570676,
+ "loss": 0.0034,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9304330.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005467240232974291,
+ "skip_count": 1.0,
+ "step": 5770,
+ "text_loss": 0.21510964632034302
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.098620487232168,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.000469685983228693,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9306882.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003167890477925539,
+ "skip_count": 0.0,
+ "step": 5772,
+ "text_loss": 0.45717427134513855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.108012914587615,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00046937704191670675,
+ "loss": 0.0057,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 9309767.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.014881107024848461,
+ "skip_count": 2.0,
+ "step": 5774,
+ "text_loss": 0.3464985191822052
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.11740534194306,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0004690681123394959,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9313045.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00379011663608253,
+ "skip_count": 2.0,
+ "step": 5776,
+ "text_loss": 0.33194616436958313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.126797769298502,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00046875919461544265,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9315736.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016733441734686494,
+ "skip_count": 0.0,
+ "step": 5778,
+ "text_loss": 0.5009998679161072
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.00046845028886292493,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9318456.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005318894516676664,
+ "skip_count": 1.0,
+ "step": 5780,
+ "text_loss": 0.17702752351760864
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.145582624009393,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044921875,
+ "learning_rate": 0.00046814139520031615,
+ "loss": 0.006,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 9323152.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01133672520518303,
+ "skip_count": 2.0,
+ "step": 5782,
+ "text_loss": 0.2886650860309601
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.0004678325137459845,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9326318.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002458433620631695,
+ "skip_count": 0.0,
+ "step": 5784,
+ "text_loss": 0.5832745432853699
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0004675236446182946,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9329779.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005402310052886605,
+ "skip_count": 0.0,
+ "step": 5786,
+ "text_loss": 0.5699237585067749
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00046721478793560525,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9333360.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002638917067088187,
+ "skip_count": 0.0,
+ "step": 5788,
+ "text_loss": 0.6555714011192322
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00046690594381627106,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9336498.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003998351749032736,
+ "skip_count": 2.0,
+ "step": 5790,
+ "text_loss": 0.2076750248670578
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.192544760786618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.00046659711237864157,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9339724.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0045847659930586815,
+ "skip_count": 1.0,
+ "step": 5792,
+ "text_loss": 0.22027169167995453
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0186767578125,
+ "learning_rate": 0.00046628829374106167,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9342835.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014064523857086897,
+ "skip_count": 1.0,
+ "step": 5794,
+ "text_loss": 0.5120179057121277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0004659794880218712,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9346757.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011155207175761461,
+ "skip_count": 1.0,
+ "step": 5796,
+ "text_loss": 0.6415372490882874
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.220722042852948,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0004656706953394051,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9349652.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020385095849633217,
+ "skip_count": 0.0,
+ "step": 5798,
+ "text_loss": 0.5410398840904236
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 27.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0004653619158119933,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9354286.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0012847178149968386,
+ "skip_count": 0.0,
+ "step": 5800,
+ "text_loss": 0.4386860728263855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.00046505314955796074,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9357682.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035008061677217484,
+ "skip_count": 2.0,
+ "step": 5802,
+ "text_loss": 0.13655950129032135
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.248899324919282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00046474439669562715,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9361058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020033426117151976,
+ "skip_count": 1.0,
+ "step": 5804,
+ "text_loss": 0.6293444037437439
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00046443565734330714,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9364173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004935986362397671,
+ "skip_count": 0.0,
+ "step": 5806,
+ "text_loss": 0.2923166751861572
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0004641269316193104,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9366980.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001654456602409482,
+ "skip_count": 0.0,
+ "step": 5808,
+ "text_loss": 0.7273373007774353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0004638182196419411,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9370581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017011919990181923,
+ "skip_count": 0.0,
+ "step": 5810,
+ "text_loss": 0.6029995083808899
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 27.286469034341064,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0004635095215294984,
+ "loss": 0.0072,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 9374233.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01361197978258133,
+ "skip_count": 3.0,
+ "step": 5812,
+ "text_loss": 0.14051523804664612
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.00046320083740027584,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9377217.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004597014281898737,
+ "skip_count": 0.0,
+ "step": 5814,
+ "text_loss": 0.2766880691051483
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 27.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00046289216737256184,
+ "loss": 0.0041,
+ "macro_f1": 1.0,
+ "num_tokens": 9380336.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.006628422066569328,
+ "skip_count": 1.0,
+ "step": 5816,
+ "text_loss": 0.8092381954193115
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.314646316407398,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.0004625835115646393,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9382968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002737772185355425,
+ "skip_count": 0.0,
+ "step": 5818,
+ "text_loss": 0.22090643644332886
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 27.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0004622748700947856,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 9386203.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004552177153527737,
+ "skip_count": 1.0,
+ "step": 5820,
+ "text_loss": 0.42869850993156433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0004619662430812729,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9388968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003149240743368864,
+ "skip_count": 2.0,
+ "step": 5822,
+ "text_loss": 0.45137661695480347
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0004616576306423677,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9392487.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008133690571412444,
+ "skip_count": 0.0,
+ "step": 5824,
+ "text_loss": 0.638685941696167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0004613490328963307,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9395665.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00042717234464362264,
+ "skip_count": 0.0,
+ "step": 5826,
+ "text_loss": 0.8134317398071289
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.00046104044996141716,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9398831.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0084775285795331,
+ "skip_count": 2.0,
+ "step": 5828,
+ "text_loss": 0.19263958930969238
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 27.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0004607318819558768,
+ "loss": 0.0087,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9403118.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030239911284297705,
+ "skip_count": 0.0,
+ "step": 5830,
+ "text_loss": 0.45556432008743286
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 27.38039330789551,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 0.00046042332899795313,
+ "loss": 0.0075,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 9406206.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.026389889419078827,
+ "skip_count": 2.0,
+ "step": 5832,
+ "text_loss": 0.26458361744880676
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.0004601147912058845,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9409806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013476534513756633,
+ "skip_count": 0.0,
+ "step": 5834,
+ "text_loss": 0.7443689107894897
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.399178162606397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0004598062686979033,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9412737.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004275512881577015,
+ "skip_count": 1.0,
+ "step": 5836,
+ "text_loss": 0.2808683514595032
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00045949776159223563,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9415818.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027225434314459562,
+ "skip_count": 0.0,
+ "step": 5838,
+ "text_loss": 0.6283587217330933
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.0004591892700071022,
+ "loss": 0.0056,
+ "macro_f1": 1.0,
+ "num_tokens": 9419119.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01574302278459072,
+ "skip_count": 2.0,
+ "step": 5840,
+ "text_loss": 0.33239027857780457
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.00045888079406071746,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9422257.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007227854221127927,
+ "skip_count": 0.0,
+ "step": 5842,
+ "text_loss": 0.6658740043640137
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.00045857233387129,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9425071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020696306601166725,
+ "skip_count": 2.0,
+ "step": 5844,
+ "text_loss": 0.5773820877075195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.0004582638895570224,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9427980.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019764541648328304,
+ "skip_count": 0.0,
+ "step": 5846,
+ "text_loss": 0.3388919532299042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.455532726739065,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.000457955461236111,
+ "loss": 0.0058,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9430733.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04235004261136055,
+ "skip_count": 0.0,
+ "step": 5848,
+ "text_loss": 0.44346582889556885
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.464925154094512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0004576470490267462,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9433347.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000801609072368592,
+ "skip_count": 0.0,
+ "step": 5850,
+ "text_loss": 0.5825944542884827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.0004573386530471121,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9436172.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018224078230559826,
+ "skip_count": 2.0,
+ "step": 5852,
+ "text_loss": 0.8111652135848999
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.0004570302734153866,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9439040.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006614950485527515,
+ "skip_count": 2.0,
+ "step": 5854,
+ "text_loss": 0.31270334124565125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05859375,
+ "learning_rate": 0.0004567219102497412,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9442138.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012984242057427764,
+ "skip_count": 0.0,
+ "step": 5856,
+ "text_loss": 0.6126856803894043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0004564135636683416,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9445600.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008388847345486283,
+ "skip_count": 0.0,
+ "step": 5858,
+ "text_loss": 0.8526380658149719
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046875,
+ "learning_rate": 0.0004561052337893467,
+ "loss": 0.0108,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9449609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008125773631036282,
+ "skip_count": 2.0,
+ "step": 5860,
+ "text_loss": 0.2843833863735199
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.000455796920730909,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9452756.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019371749367564917,
+ "skip_count": 0.0,
+ "step": 5862,
+ "text_loss": 0.5293750166893005
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0004554886246111746,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 9455467.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005594742484390736,
+ "skip_count": 2.0,
+ "step": 5864,
+ "text_loss": 0.572329044342041
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 27.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0004551803455482833,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9458953.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005960086826235056,
+ "skip_count": 3.0,
+ "step": 5866,
+ "text_loss": 0.19459208846092224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00045487208366036807,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9462130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034781871363520622,
+ "skip_count": 1.0,
+ "step": 5868,
+ "text_loss": 0.20467053353786469
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.00045456383906555554,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9465590.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012246103724464774,
+ "skip_count": 0.0,
+ "step": 5870,
+ "text_loss": 0.6086251735687256
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00045425561188196565,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9468092.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002874316181987524,
+ "skip_count": 1.0,
+ "step": 5872,
+ "text_loss": 0.3430633544921875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.57763428235985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0004539474022277115,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9471433.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004340244457125664,
+ "skip_count": 2.0,
+ "step": 5874,
+ "text_loss": 0.28219133615493774
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.0004536392102208997,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9474363.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007322742021642625,
+ "skip_count": 0.0,
+ "step": 5876,
+ "text_loss": 0.7305856943130493
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.0004533310359796299,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9478469.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018631393322721124,
+ "skip_count": 0.0,
+ "step": 5878,
+ "text_loss": 0.5821442604064941
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 28.0,
+ "epoch": 27.60581156442618,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0004530228796219952,
+ "loss": 0.0088,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 9481200.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.026109615340828896,
+ "skip_count": 3.0,
+ "step": 5880,
+ "text_loss": 0.3962891101837158
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.00045271474126608167,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9484200.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004716445691883564,
+ "skip_count": 0.0,
+ "step": 5882,
+ "text_loss": 0.31901776790618896
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0004524066210299685,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9488939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003797562967520207,
+ "skip_count": 0.0,
+ "step": 5884,
+ "text_loss": 0.3992912471294403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.0004520985190317279,
+ "loss": 0.0032,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9492010.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005681614391505718,
+ "skip_count": 1.0,
+ "step": 5886,
+ "text_loss": 0.5318995118141174
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0004517904353894253,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9494770.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021422000136226416,
+ "skip_count": 0.0,
+ "step": 5888,
+ "text_loss": 0.435088187456131
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.652773701203404,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.0004514823702211187,
+ "loss": 0.0052,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 9497327.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01593884639441967,
+ "skip_count": 2.0,
+ "step": 5890,
+ "text_loss": 0.5068450570106506
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.662166128558848,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.00045117432364485927,
+ "loss": 0.0075,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 9500488.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0729660913348198,
+ "skip_count": 2.0,
+ "step": 5892,
+ "text_loss": 0.42718732357025146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.00045086629577869127,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9503593.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007092897780239582,
+ "skip_count": 2.0,
+ "step": 5894,
+ "text_loss": 0.4264345169067383
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.00045055828674065134,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9507188.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004088073968887329,
+ "skip_count": 2.0,
+ "step": 5896,
+ "text_loss": 0.20932413637638092
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 27.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.00045025029664876926,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9510126.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0026970503386110067,
+ "skip_count": 0.0,
+ "step": 5898,
+ "text_loss": 0.47661110758781433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0164794921875,
+ "learning_rate": 0.0004499423256210673,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9513891.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003428407246246934,
+ "skip_count": 0.0,
+ "step": 5900,
+ "text_loss": 0.18232668936252594
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.00044963437377556066,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9516718.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020270352251827717,
+ "skip_count": 0.0,
+ "step": 5902,
+ "text_loss": 0.16833586990833282
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.000449326441230257,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9520248.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019144838443025947,
+ "skip_count": 0.0,
+ "step": 5904,
+ "text_loss": 0.44434574246406555
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.00044901852810315634,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9523651.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0044578867964446545,
+ "skip_count": 2.0,
+ "step": 5906,
+ "text_loss": 0.1248839721083641
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.0004487106345122522,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9527235.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000827222247608006,
+ "skip_count": 0.0,
+ "step": 5908,
+ "text_loss": 0.6052893996238708
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 27.74669797475785,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0004484027605755296,
+ "loss": 0.0065,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 9530407.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.029739778488874435,
+ "skip_count": 0.0,
+ "step": 5910,
+ "text_loss": 0.7625715732574463
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 0.00044809490641096653,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9533229.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025658784434199333,
+ "skip_count": 0.0,
+ "step": 5912,
+ "text_loss": 0.27842655777931213
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 27.76548282946874,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.00044778707213653324,
+ "loss": 0.0069,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 9537397.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010157953947782516,
+ "skip_count": 3.0,
+ "step": 5914,
+ "text_loss": 0.45196083188056946
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.0004474792578701924,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 9540564.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.011994685977697372,
+ "skip_count": 5.0,
+ "step": 5916,
+ "text_loss": 0.22617442905902863
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.784267684179632,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.000447171463729899,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9543602.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022214490454643965,
+ "skip_count": 0.0,
+ "step": 5918,
+ "text_loss": 0.5089073777198792
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0004468636898336003,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 9546829.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009353389963507652,
+ "skip_count": 2.0,
+ "step": 5920,
+ "text_loss": 0.7560386657714844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.057373046875,
+ "learning_rate": 0.00044655593629923596,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9550259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005637963302433491,
+ "skip_count": 0.0,
+ "step": 5922,
+ "text_loss": 0.17084793746471405
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.812444966245963,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.00044624820324473766,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 9554376.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008556432090699673,
+ "skip_count": 2.0,
+ "step": 5924,
+ "text_loss": 0.5906872749328613
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 27.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0004459404907880292,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9558348.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016659445827826858,
+ "skip_count": 0.0,
+ "step": 5926,
+ "text_loss": 0.8197194933891296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 27.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.00044563279904702674,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9561139.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01341368816792965,
+ "skip_count": 3.0,
+ "step": 5928,
+ "text_loss": 0.3264874815940857
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.840622248312297,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 0.000445325128139638,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9564387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005023977253586054,
+ "skip_count": 2.0,
+ "step": 5930,
+ "text_loss": 0.9055862426757812
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0004450174781837635,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9567053.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006051476229913533,
+ "skip_count": 0.0,
+ "step": 5932,
+ "text_loss": 0.6908539533615112
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0004447098492972951,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9570036.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003152312943711877,
+ "skip_count": 0.0,
+ "step": 5934,
+ "text_loss": 0.6321061849594116
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 0.0004444022415981167,
+ "loss": 0.0094,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9574146.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004859412554651499,
+ "skip_count": 1.0,
+ "step": 5936,
+ "text_loss": 0.5905604958534241
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 27.878191957734078,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.00044409465520410426,
+ "loss": 0.0071,
+ "macro_f1": 1.0,
+ "num_tokens": 9577071.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004376287572085857,
+ "skip_count": 1.0,
+ "step": 5938,
+ "text_loss": 0.6928377747535706
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.00044378709023312535,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9580537.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004038849379867315,
+ "skip_count": 1.0,
+ "step": 5940,
+ "text_loss": 0.2686770558357239
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0004434795468030396,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9583225.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005459951236844063,
+ "skip_count": 2.0,
+ "step": 5942,
+ "text_loss": 0.16855180263519287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 27.906369239800412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.000443172025031698,
+ "loss": 0.0037,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9586018.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032985717989504337,
+ "skip_count": 2.0,
+ "step": 5944,
+ "text_loss": 0.20335732400417328
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 27.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0004428645250369437,
+ "loss": 0.0037,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9589321.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003573323367163539,
+ "skip_count": 0.0,
+ "step": 5946,
+ "text_loss": 0.6318653225898743
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.00044255704693661117,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9592518.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002226749900728464,
+ "skip_count": 0.0,
+ "step": 5948,
+ "text_loss": 0.5320658683776855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.934546521866746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0004422495908485265,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9595664.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007805621717125177,
+ "skip_count": 0.0,
+ "step": 5950,
+ "text_loss": 0.6330106258392334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0004419421568905077,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9598885.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017050127498805523,
+ "skip_count": 0.0,
+ "step": 5952,
+ "text_loss": 0.6098045706748962
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00044163474518036375,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9603021.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025974081363528967,
+ "skip_count": 0.0,
+ "step": 5954,
+ "text_loss": 0.2655932903289795
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 27.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.00044132735583589567,
+ "loss": 0.0072,
+ "macro_f1": 1.0,
+ "num_tokens": 9605841.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010364850051701069,
+ "skip_count": 2.0,
+ "step": 5956,
+ "text_loss": 0.3028552532196045
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 27.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.015869140625,
+ "learning_rate": 0.00044101998897489553,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9608810.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015063622267916799,
+ "skip_count": 0.0,
+ "step": 5958,
+ "text_loss": 0.5602094531059265
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 27.981508658643968,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.00044071264471514683,
+ "loss": 0.0051,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 9611995.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011538165621459484,
+ "skip_count": 3.0,
+ "step": 5960,
+ "text_loss": 0.14332173764705658
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 27.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00044040532317442455,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9615434.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004693889059126377,
+ "skip_count": 0.0,
+ "step": 5962,
+ "text_loss": 0.334369033575058
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 28.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.00044009802447049474,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 9618056.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0045085870660841465,
+ "skip_count": 1.0,
+ "step": 5964,
+ "text_loss": 0.8163170218467712
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.00043979074872111507,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9621428.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018220023484900594,
+ "skip_count": 0.0,
+ "step": 5966,
+ "text_loss": 0.2513850927352905
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0004394834960440341,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 9625433.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.007051277905702591,
+ "skip_count": 5.0,
+ "step": 5968,
+ "text_loss": 0.6263421177864075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.00043917626655699154,
+ "loss": 0.0093,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9629508.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006454752874560654,
+ "skip_count": 0.0,
+ "step": 5970,
+ "text_loss": 0.645618736743927
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.037569709421778,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0004388690603777184,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9632504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004847112577408552,
+ "skip_count": 1.0,
+ "step": 5972,
+ "text_loss": 0.47306978702545166
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.00043856187762393665,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9636685.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006580828921869397,
+ "skip_count": 0.0,
+ "step": 5974,
+ "text_loss": 0.42226532101631165
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0004382547184133593,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9639958.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002188180573284626,
+ "skip_count": 0.0,
+ "step": 5976,
+ "text_loss": 0.4456600248813629
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0004379475828636901,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 9643228.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017135308589786291,
+ "skip_count": 2.0,
+ "step": 5978,
+ "text_loss": 0.6295822262763977
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.07513941884356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0004376404710926244,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9646746.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008841048111207783,
+ "skip_count": 0.0,
+ "step": 5980,
+ "text_loss": 0.5102712512016296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.00043733338321784784,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9649452.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006229099817574024,
+ "skip_count": 0.0,
+ "step": 5982,
+ "text_loss": 0.6944046020507812
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.000437026319357037,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9652700.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005293759983032942,
+ "skip_count": 2.0,
+ "step": 5984,
+ "text_loss": 0.6748214960098267
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00043671927962785946,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9655825.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013537590857595205,
+ "skip_count": 0.0,
+ "step": 5986,
+ "text_loss": 1.000306248664856
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0004364122641479733,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9658713.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004548195283859968,
+ "skip_count": 0.0,
+ "step": 5988,
+ "text_loss": 0.24580086767673492
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 28.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0004361052730350275,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9661535.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011149964295327663,
+ "skip_count": 4.0,
+ "step": 5990,
+ "text_loss": 0.5737863779067993
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 28.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00043579830640666154,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 9664406.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003783488878980279,
+ "skip_count": 1.0,
+ "step": 5992,
+ "text_loss": 0.7836558222770691
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.00043549136438050573,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9669050.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0050374288111925125,
+ "skip_count": 1.0,
+ "step": 5994,
+ "text_loss": 0.13072487711906433
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.150278837687114,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.00043518444707418076,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9672698.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004047670867294073,
+ "skip_count": 2.0,
+ "step": 5996,
+ "text_loss": 0.4748993217945099
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.00043487755460529796,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9676159.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008628991432487965,
+ "skip_count": 2.0,
+ "step": 5998,
+ "text_loss": 0.1921990066766739
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 28.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00043457068709145904,
+ "loss": 0.0072,
+ "macro_f1": 1.0,
+ "num_tokens": 9679528.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.01094671618193388,
+ "skip_count": 3.0,
+ "step": 6000,
+ "text_loss": 0.3651769459247589
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 28.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 0.00043426384465025604,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9682677.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0011284075444564223,
+ "skip_count": 0.0,
+ "step": 6002,
+ "text_loss": 0.28305181860923767
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.187848547108892,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.000433957027399272,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9685310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030473743099719286,
+ "skip_count": 1.0,
+ "step": 6004,
+ "text_loss": 0.3650054931640625
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.19724097446434,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.00043365023545607965,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 9687944.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011621905490756035,
+ "skip_count": 2.0,
+ "step": 6006,
+ "text_loss": 0.5409000515937805
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0004333434689382423,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9690932.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005297541501931846,
+ "skip_count": 0.0,
+ "step": 6008,
+ "text_loss": 0.4311029314994812
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.216025829175226,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.00043303672796331336,
+ "loss": 0.0058,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9693972.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.06166421249508858,
+ "skip_count": 0.0,
+ "step": 6010,
+ "text_loss": 0.2658997178077698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.00043273001264883655,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9697712.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018419031985104084,
+ "skip_count": 0.0,
+ "step": 6012,
+ "text_loss": 0.5813497304916382
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0004324233231123458,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9700746.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003635555040091276,
+ "skip_count": 0.0,
+ "step": 6014,
+ "text_loss": 0.24211904406547546
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 28.24420311124156,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0004321166594713651,
+ "loss": 0.0048,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 9704087.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021067705005407333,
+ "skip_count": 2.0,
+ "step": 6016,
+ "text_loss": 0.5908042788505554
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00043181002184340857,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9708695.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008712753187865019,
+ "skip_count": 0.0,
+ "step": 6018,
+ "text_loss": 0.7788549661636353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.26298796595245,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0004315034103459803,
+ "loss": 0.0054,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9711631.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03231092542409897,
+ "skip_count": 0.0,
+ "step": 6020,
+ "text_loss": 0.6127741932868958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.0004311968250965743,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9715526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020149527117609978,
+ "skip_count": 2.0,
+ "step": 6022,
+ "text_loss": 0.49970078468322754
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0004308902662126748,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9718475.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031795913819223642,
+ "skip_count": 0.0,
+ "step": 6024,
+ "text_loss": 0.3254713714122772
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.291165248018785,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.00043058373381175567,
+ "loss": 0.004,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9722194.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0148378387093544,
+ "skip_count": 1.0,
+ "step": 6026,
+ "text_loss": 0.17670343816280365
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0004302772280112806,
+ "loss": 0.0076,
+ "macro_f1": 1.0,
+ "num_tokens": 9725489.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005742347799241543,
+ "skip_count": 2.0,
+ "step": 6028,
+ "text_loss": 0.26184776425361633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.309950102729672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.00042997074892870335,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9729416.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023561837151646614,
+ "skip_count": 0.0,
+ "step": 6030,
+ "text_loss": 0.3026008605957031
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.31934253008512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0004296642966814673,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9732559.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010108393616974354,
+ "skip_count": 1.0,
+ "step": 6032,
+ "text_loss": 0.43198078870773315
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 28.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.00042935787138700525,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 9736324.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005443581845611334,
+ "skip_count": 2.0,
+ "step": 6034,
+ "text_loss": 0.24883155524730682
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0004290514731627403,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 9739630.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010645060800015926,
+ "skip_count": 2.0,
+ "step": 6036,
+ "text_loss": 0.24207182228565216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018798828125,
+ "learning_rate": 0.0004287451021260846,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9742221.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008162845042534173,
+ "skip_count": 0.0,
+ "step": 6038,
+ "text_loss": 0.33018553256988525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.356912239506897,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0004284387583944403,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9744925.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003782407147809863,
+ "skip_count": 1.0,
+ "step": 6040,
+ "text_loss": 0.6600399613380432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0004281324420851987,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9748103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009834285592660308,
+ "skip_count": 0.0,
+ "step": 6042,
+ "text_loss": 0.6402350664138794
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.375697094217788,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0004278261533157409,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9751128.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004100334830582142,
+ "skip_count": 2.0,
+ "step": 6044,
+ "text_loss": 0.1545136719942093
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.38508952157323,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0004275198922034372,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9754140.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017166603356599808,
+ "skip_count": 1.0,
+ "step": 6046,
+ "text_loss": 0.5875935554504395
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.394481948928675,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.00042721365886564766,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 9756945.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00915827602148056,
+ "skip_count": 2.0,
+ "step": 6048,
+ "text_loss": 0.3885214328765869
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.403874376284122,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.00042690745341972134,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9759738.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0057020667009055614,
+ "skip_count": 2.0,
+ "step": 6050,
+ "text_loss": 0.3107164204120636
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.00042660127598299647,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9762987.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004196313209831715,
+ "skip_count": 2.0,
+ "step": 6052,
+ "text_loss": 0.3073577582836151
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00042629512667280135,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9765828.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023119752295315266,
+ "skip_count": 1.0,
+ "step": 6054,
+ "text_loss": 0.8228643536567688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.432051658350456,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0004259890056064527,
+ "loss": 0.009,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9769129.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021007524337619543,
+ "skip_count": 1.0,
+ "step": 6056,
+ "text_loss": 0.8334706425666809
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0004256829129012568,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 9771821.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00671970471739769,
+ "skip_count": 2.0,
+ "step": 6058,
+ "text_loss": 0.17845536768436432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00042537684867450875,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9774566.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014770646812394261,
+ "skip_count": 0.0,
+ "step": 6060,
+ "text_loss": 0.4445459246635437
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 28.46022894041679,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00042507081304349315,
+ "loss": 0.0067,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 9777909.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.014822427183389664,
+ "skip_count": 0.0,
+ "step": 6062,
+ "text_loss": 0.45526158809661865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0004247648061254833,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9781159.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00568385748192668,
+ "skip_count": 1.0,
+ "step": 6064,
+ "text_loss": 0.18535588681697845
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.479013795127678,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.00042445882803774173,
+ "loss": 0.0046,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9784960.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0179694052785635,
+ "skip_count": 0.0,
+ "step": 6066,
+ "text_loss": 0.23591181635856628
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.488406222483125,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00042415287889751966,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9787941.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019039154285565019,
+ "skip_count": 0.0,
+ "step": 6068,
+ "text_loss": 0.9447930455207825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0004238469588220575,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9791096.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004039563238620758,
+ "skip_count": 0.0,
+ "step": 6070,
+ "text_loss": 0.3134256601333618
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.00042354106792858446,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9794082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018352365586906672,
+ "skip_count": 0.0,
+ "step": 6072,
+ "text_loss": 0.5681536197662354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.00042323520633431833,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9797303.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019325513858348131,
+ "skip_count": 0.0,
+ "step": 6074,
+ "text_loss": 0.2835809290409088
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00042292937415646574,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9800435.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002513401210308075,
+ "skip_count": 0.0,
+ "step": 6076,
+ "text_loss": 0.1931663602590561
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00042262357151222265,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9803873.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004864581860601902,
+ "skip_count": 0.0,
+ "step": 6078,
+ "text_loss": 0.25809767842292786
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 28.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0004223177985187728,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9806438.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004932792857289314,
+ "skip_count": 0.0,
+ "step": 6080,
+ "text_loss": 0.6409249305725098
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00042201205529328925,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9809400.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00590938376262784,
+ "skip_count": 1.0,
+ "step": 6082,
+ "text_loss": 0.31158050894737244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.00042170634195293314,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9813246.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006805860437452793,
+ "skip_count": 0.0,
+ "step": 6084,
+ "text_loss": 0.32945963740348816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0004214006586148545,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9816513.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010186503641307354,
+ "skip_count": 0.0,
+ "step": 6086,
+ "text_loss": 0.48659923672676086
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.0004210950053961917,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9819908.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00402973173186183,
+ "skip_count": 1.0,
+ "step": 6088,
+ "text_loss": 0.6249601244926453
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.00042078938241407174,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9822950.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00236532068811357,
+ "skip_count": 1.0,
+ "step": 6090,
+ "text_loss": 0.26589256525039673
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.601115350748458,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0004204837897856098,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 9826493.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003072192659601569,
+ "skip_count": 2.0,
+ "step": 6092,
+ "text_loss": 0.5216912627220154
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.610507778103905,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.0004201782276279096,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9829698.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027553171385079622,
+ "skip_count": 1.0,
+ "step": 6094,
+ "text_loss": 0.40127676725387573
+ },
+ {
+ "acc_repeat": 0.75,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.61990020545935,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 0.8571428656578064,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00041987269605806325,
+ "loss": 0.0045,
+ "macro_f1": 0.9442509412765503,
+ "num_tokens": 9833719.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.013845407404005527,
+ "skip_count": 4.0,
+ "step": 6096,
+ "text_loss": 0.23114071786403656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.629292632814792,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 0.0004195671951931509,
+ "loss": 0.0116,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9838235.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019887303933501244,
+ "skip_count": 2.0,
+ "step": 6098,
+ "text_loss": 0.7467341423034668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0004192617251502409,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9840867.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007213905337266624,
+ "skip_count": 0.0,
+ "step": 6100,
+ "text_loss": 0.6283472180366516
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.00041895628604639036,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9843827.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003863139310851693,
+ "skip_count": 1.0,
+ "step": 6102,
+ "text_loss": 0.3602744936943054
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.00041865087799864374,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9846939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013336286647245288,
+ "skip_count": 0.0,
+ "step": 6104,
+ "text_loss": 0.4182434678077698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.666862342236573,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.0004183455011240341,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9849827.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00038455065805464983,
+ "skip_count": 0.0,
+ "step": 6106,
+ "text_loss": 0.7122722864151001
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 28.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0004180401555395826,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 9853487.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.0038226440083235502,
+ "skip_count": 1.0,
+ "step": 6108,
+ "text_loss": 0.2521185576915741
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0004177348413622981,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9856321.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015809801407158375,
+ "skip_count": 0.0,
+ "step": 6110,
+ "text_loss": 0.423979252576828
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0004174295587091776,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9859238.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007586454739794135,
+ "skip_count": 0.0,
+ "step": 6112,
+ "text_loss": 0.4720100462436676
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 28.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00041712430769720593,
+ "loss": 0.0091,
+ "macro_f1": 1.0,
+ "num_tokens": 9862282.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0045816488564014435,
+ "skip_count": 1.0,
+ "step": 6114,
+ "text_loss": 0.279577374458313
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 28.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0004168190884433559,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 9865394.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004728195257484913,
+ "skip_count": 1.0,
+ "step": 6116,
+ "text_loss": 0.3826395571231842
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 28.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0179443359375,
+ "learning_rate": 0.0004165139010645881,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9869165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006160226184874773,
+ "skip_count": 3.0,
+ "step": 6118,
+ "text_loss": 0.4668935537338257
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 24.0,
+ "epoch": 28.732609333724685,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 0.0004162087456778509,
+ "loss": 0.0074,
+ "macro_f1": 0.9619450569152832,
+ "num_tokens": 9872381.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.027831824496388435,
+ "skip_count": 6.0,
+ "step": 6120,
+ "text_loss": 0.28708913922309875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0004159036224000804,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9875668.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030764432158321142,
+ "skip_count": 1.0,
+ "step": 6122,
+ "text_loss": 0.37078607082366943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.751394188435572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0004155985313482002,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9878533.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00043521137558855116,
+ "skip_count": 0.0,
+ "step": 6124,
+ "text_loss": 0.34975379705429077
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00041529347263912224,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9881478.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016251741908490658,
+ "skip_count": 0.0,
+ "step": 6126,
+ "text_loss": 0.39166271686553955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.770179043146463,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00041498844638974535,
+ "loss": 0.005,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 9884252.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.019553523510694504,
+ "skip_count": 0.0,
+ "step": 6128,
+ "text_loss": 0.2309480905532837
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 28.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0004146834527169562,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9887485.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0036251386627554893,
+ "skip_count": 0.0,
+ "step": 6130,
+ "text_loss": 0.4464457631111145
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.00041437849173762894,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9890711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008515548543073237,
+ "skip_count": 0.0,
+ "step": 6132,
+ "text_loss": 0.5012133717536926
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 28.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0004140735635686251,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9894458.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001084602321498096,
+ "skip_count": 0.0,
+ "step": 6134,
+ "text_loss": 0.32015663385391235
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.0004137686683267938,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9897634.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025203595869243145,
+ "skip_count": 0.0,
+ "step": 6136,
+ "text_loss": 0.15804508328437805
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.817141179923688,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.0004134638061289715,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9901157.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029381231870502234,
+ "skip_count": 0.0,
+ "step": 6138,
+ "text_loss": 0.14375236630439758
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.0004131589770919819,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9903958.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002789110178127885,
+ "skip_count": 0.0,
+ "step": 6140,
+ "text_loss": 0.2474033683538437
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0004128541813326361,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 9906799.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010770512744784355,
+ "skip_count": 3.0,
+ "step": 6142,
+ "text_loss": 0.2304249256849289
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 28.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0004125494189677325,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9909286.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003122122259810567,
+ "skip_count": 0.0,
+ "step": 6144,
+ "text_loss": 0.3781827688217163
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 28.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.00041224469011405643,
+ "loss": 0.0045,
+ "macro_f1": 1.0,
+ "num_tokens": 9912416.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008443298749625683,
+ "skip_count": 1.0,
+ "step": 6146,
+ "text_loss": 0.3004767596721649
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0004119399948883806,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9915290.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033219947945326567,
+ "skip_count": 1.0,
+ "step": 6148,
+ "text_loss": 0.748744547367096
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 28.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.0004116353334074647,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9918493.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005501769948750734,
+ "skip_count": 0.0,
+ "step": 6150,
+ "text_loss": 0.330759733915329
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.000411330705788056,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9921027.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013694261433556676,
+ "skip_count": 0.0,
+ "step": 6152,
+ "text_loss": 0.43070924282073975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0164794921875,
+ "learning_rate": 0.000411026112146888,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9924303.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00046192589798010886,
+ "skip_count": 0.0,
+ "step": 6154,
+ "text_loss": 0.5674887895584106
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 28.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0004107215526006817,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9927065.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004311304073780775,
+ "skip_count": 0.0,
+ "step": 6156,
+ "text_loss": 0.16138267517089844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0004104170272661449,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9930713.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035845425445586443,
+ "skip_count": 0.0,
+ "step": 6158,
+ "text_loss": 0.18728356063365936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.00041011253625997227,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9934393.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00247366214171052,
+ "skip_count": 0.0,
+ "step": 6160,
+ "text_loss": 0.3624019920825958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0004098080796988452,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9937457.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003240241203457117,
+ "skip_count": 0.0,
+ "step": 6162,
+ "text_loss": 0.12348521500825882
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.0004095036576994321,
+ "loss": 0.0035,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9940523.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001985874492675066,
+ "skip_count": 1.0,
+ "step": 6164,
+ "text_loss": 0.2688066363334656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 28.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.00040919927037838815,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9943802.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004264154937118292,
+ "skip_count": 3.0,
+ "step": 6166,
+ "text_loss": 0.49316367506980896
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0556640625,
+ "learning_rate": 0.00040889491785235513,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9946649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002545441733673215,
+ "skip_count": 0.0,
+ "step": 6168,
+ "text_loss": 0.4079313576221466
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.967420017610802,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0004085906002379614,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9949800.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009590961271896958,
+ "skip_count": 0.0,
+ "step": 6170,
+ "text_loss": 0.6166561245918274
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 28.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0004082863176518221,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9954008.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003795337164774537,
+ "skip_count": 2.0,
+ "step": 6172,
+ "text_loss": 0.4791361689567566
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 28.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044921875,
+ "learning_rate": 0.0004079820702105388,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9957153.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015634822193533182,
+ "skip_count": 0.0,
+ "step": 6174,
+ "text_loss": 0.7208777666091919
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 28.995597299677137,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0004076778580306999,
+ "loss": 0.0056,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 9960060.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03223998099565506,
+ "skip_count": 2.0,
+ "step": 6176,
+ "text_loss": 0.6617992520332336
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.00040737368122887983,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9963396.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033978577703237534,
+ "skip_count": 0.0,
+ "step": 6178,
+ "text_loss": 0.7339215278625488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.014088641033165,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00040706953992164,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9966364.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005358994239941239,
+ "skip_count": 0.0,
+ "step": 6180,
+ "text_loss": 0.44187214970588684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.023481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00040676543422552767,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9969813.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018544091144576669,
+ "skip_count": 1.0,
+ "step": 6182,
+ "text_loss": 0.6244927048683167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0004064613642570769,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9973015.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005692692007869482,
+ "skip_count": 0.0,
+ "step": 6184,
+ "text_loss": 0.18860043585300446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00040615733013280784,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9976201.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018737476784735918,
+ "skip_count": 0.0,
+ "step": 6186,
+ "text_loss": 0.21189232170581818
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.051658350454947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.00040585333196922687,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9979711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011945146135985851,
+ "skip_count": 2.0,
+ "step": 6188,
+ "text_loss": 0.2628154456615448
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.00040554936988282663,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9983003.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036045778542757034,
+ "skip_count": 1.0,
+ "step": 6190,
+ "text_loss": 0.5926038026809692
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.070443205165834,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0004052454439900861,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9986841.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004170368425548077,
+ "skip_count": 0.0,
+ "step": 6192,
+ "text_loss": 0.3088737726211548
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00040494155440747015,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 9989596.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002254750579595566,
+ "skip_count": 2.0,
+ "step": 6194,
+ "text_loss": 0.6309700012207031
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 29.089228059876724,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.00040463770125142987,
+ "loss": 0.0087,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 9992789.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04092822223901749,
+ "skip_count": 4.0,
+ "step": 6196,
+ "text_loss": 0.09625697880983353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.098620487232168,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.00040433388463840213,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9995782.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00029065192211419344,
+ "skip_count": 0.0,
+ "step": 6198,
+ "text_loss": 0.5600258111953735
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.108012914587615,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0004040301046848105,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 9998712.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005865268758498132,
+ "skip_count": 0.0,
+ "step": 6200,
+ "text_loss": 0.6426429748535156
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 29.11740534194306,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.0004037263615070638,
+ "loss": 0.0078,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 10002020.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.025357060134410858,
+ "skip_count": 3.0,
+ "step": 6202,
+ "text_loss": 0.25125735998153687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.126797769298502,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.000403422655221557,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10005381.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003139561740681529,
+ "skip_count": 1.0,
+ "step": 6204,
+ "text_loss": 0.3639419376850128
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.00040311898594467085,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10008348.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004091196693480015,
+ "skip_count": 2.0,
+ "step": 6206,
+ "text_loss": 0.1602363884449005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00040281535379277204,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10011171.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005771483760327101,
+ "skip_count": 0.0,
+ "step": 6208,
+ "text_loss": 0.5593504905700684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.000402511758882213,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10014374.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005212264601141214,
+ "skip_count": 1.0,
+ "step": 6210,
+ "text_loss": 0.15668229758739471
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0004022082013293319,
+ "loss": 0.0032,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10017327.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027585842180997133,
+ "skip_count": 1.0,
+ "step": 6212,
+ "text_loss": 0.21188466250896454
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.173759906075727,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.00040190468125045255,
+ "loss": 0.0061,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 10020518.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013210589066147804,
+ "skip_count": 1.0,
+ "step": 6214,
+ "text_loss": 0.2551073729991913
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 29.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01708984375,
+ "learning_rate": 0.00040160119876188436,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10023799.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001590219559147954,
+ "skip_count": 0.0,
+ "step": 6216,
+ "text_loss": 0.5634782314300537
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.192544760786618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0004012977539799224,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10027107.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003917343448847532,
+ "skip_count": 0.0,
+ "step": 6218,
+ "text_loss": 0.6412819027900696
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.0004009943470208473,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10030460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00874288845807314,
+ "skip_count": 2.0,
+ "step": 6220,
+ "text_loss": 0.13269923627376556
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.211329615497505,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.000400690978000925,
+ "loss": 0.0075,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 10034086.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03736349940299988,
+ "skip_count": 3.0,
+ "step": 6222,
+ "text_loss": 0.4956454336643219
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.220722042852948,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0004003876470364075,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10037312.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008481289260089397,
+ "skip_count": 2.0,
+ "step": 6224,
+ "text_loss": 0.2148810178041458
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0152587890625,
+ "learning_rate": 0.0004000843542435315,
+ "loss": 0.0028,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10040393.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002235144842416048,
+ "skip_count": 0.0,
+ "step": 6226,
+ "text_loss": 0.17645306885242462
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 29.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.0003997810997385195,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10044386.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004541373811662197,
+ "skip_count": 0.0,
+ "step": 6228,
+ "text_loss": 0.5098661184310913
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.248899324919282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.00039947788363757915,
+ "loss": 0.0088,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10049046.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019183673430234194,
+ "skip_count": 1.0,
+ "step": 6230,
+ "text_loss": 0.6953724026679993
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.00039917470605690334,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 10051787.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0032311067916452885,
+ "skip_count": 4.0,
+ "step": 6232,
+ "text_loss": 0.475127637386322
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 29.267684179630173,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.00039887156711267043,
+ "loss": 0.0079,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 10055396.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03247373178601265,
+ "skip_count": 0.0,
+ "step": 6234,
+ "text_loss": 0.4239100515842438
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 29.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.00039856846692104363,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10058395.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006287421099841595,
+ "skip_count": 3.0,
+ "step": 6236,
+ "text_loss": 0.24084535241127014
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 29.286469034341064,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.016357421875,
+ "learning_rate": 0.0003982654055981718,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 10061302.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0008686117362231016,
+ "skip_count": 1.0,
+ "step": 6238,
+ "text_loss": 0.4740419089794159
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.0003979623832601884,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10065318.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037686119321733713,
+ "skip_count": 2.0,
+ "step": 6240,
+ "text_loss": 0.43965795636177063
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0003976594000232123,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10068291.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005804901942610741,
+ "skip_count": 0.0,
+ "step": 6242,
+ "text_loss": 0.24424348771572113
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.314646316407398,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.00039735645600334714,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10071645.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002001055981963873,
+ "skip_count": 1.0,
+ "step": 6244,
+ "text_loss": 0.6524377465248108
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0003970535513166815,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10075136.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001252001617103815,
+ "skip_count": 0.0,
+ "step": 6246,
+ "text_loss": 0.22803714871406555
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0003967506860792893,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10078230.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004913780372589827,
+ "skip_count": 1.0,
+ "step": 6248,
+ "text_loss": 0.9835516214370728
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.000396447860407229,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10080852.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037437966093420982,
+ "skip_count": 2.0,
+ "step": 6250,
+ "text_loss": 0.4021640121936798
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05078125,
+ "learning_rate": 0.00039614507441654393,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10084139.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005433002021163702,
+ "skip_count": 2.0,
+ "step": 6252,
+ "text_loss": 0.23060470819473267
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.00039584232822326224,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10088501.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007705377647653222,
+ "skip_count": 0.0,
+ "step": 6254,
+ "text_loss": 0.5994830131530762
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 0.0003955396219433969,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10091506.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012310115853324533,
+ "skip_count": 0.0,
+ "step": 6256,
+ "text_loss": 0.4639038145542145
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0003952369556929455,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10096236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008964627049863338,
+ "skip_count": 2.0,
+ "step": 6258,
+ "text_loss": 0.24845287203788757
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0003949343295878903,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10099213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033088945783674717,
+ "skip_count": 0.0,
+ "step": 6260,
+ "text_loss": 0.6527073979377747
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 29.399178162606397,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00039463174374419817,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 10103160.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.003462672932073474,
+ "skip_count": 1.0,
+ "step": 6262,
+ "text_loss": 0.4209299683570862
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 29.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00039432919827782066,
+ "loss": 0.0036,
+ "macro_f1": 1.0,
+ "num_tokens": 10105881.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0027124532498419285,
+ "skip_count": 2.0,
+ "step": 6264,
+ "text_loss": 0.4442266821861267
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0172119140625,
+ "learning_rate": 0.00039402669330469367,
+ "loss": 0.0032,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10108596.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005055282264947891,
+ "skip_count": 2.0,
+ "step": 6266,
+ "text_loss": 0.3331456780433655
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.00039372422894073765,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10111673.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009340311517007649,
+ "skip_count": 0.0,
+ "step": 6268,
+ "text_loss": 0.7664456367492676
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.436747872028178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.00039342180530185745,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10116141.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00032052272581495345,
+ "skip_count": 0.0,
+ "step": 6270,
+ "text_loss": 0.47610244154930115
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00039311942250394274,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10119151.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015820999396964908,
+ "skip_count": 0.0,
+ "step": 6272,
+ "text_loss": 0.3815282881259918
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.0003928170806628669,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10122684.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007423736387863755,
+ "skip_count": 0.0,
+ "step": 6274,
+ "text_loss": 0.4630914628505707
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.464925154094512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00039251477989448797,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10126751.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006216703332029283,
+ "skip_count": 0.0,
+ "step": 6276,
+ "text_loss": 0.4342454671859741
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 29.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.00039221252031464816,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10129784.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004239698871970177,
+ "skip_count": 3.0,
+ "step": 6278,
+ "text_loss": 0.24661089479923248
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 29.4837100088054,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.0003919103020391738,
+ "loss": 0.006,
+ "macro_f1": 0.8803418874740601,
+ "num_tokens": 10133066.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.027879100292921066,
+ "skip_count": 7.0,
+ "step": 6280,
+ "text_loss": 0.4705188274383545
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.00039160812518387574,
+ "loss": 0.0099,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10136860.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002533538034185767,
+ "skip_count": 0.0,
+ "step": 6282,
+ "text_loss": 0.1953880786895752
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00039130598986454845,
+ "loss": 0.005,
+ "macro_f1": 1.0,
+ "num_tokens": 10140066.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002462630858644843,
+ "skip_count": 2.0,
+ "step": 6284,
+ "text_loss": 0.378487765789032
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 29.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.000391003896196971,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 10143646.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011922914534807205,
+ "skip_count": 1.0,
+ "step": 6286,
+ "text_loss": 0.2467316836118698
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 29.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00039070184429690607,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 10146507.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0059767309576272964,
+ "skip_count": 1.0,
+ "step": 6288,
+ "text_loss": 0.9603674411773682
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.0003903998342801006,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 10149301.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030056277755647898,
+ "skip_count": 2.0,
+ "step": 6290,
+ "text_loss": 0.36631715297698975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 29.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00039009786626228543,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10152158.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005298118572682142,
+ "skip_count": 3.0,
+ "step": 6292,
+ "text_loss": 0.2876455783843994
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0003897959403591751,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10155852.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004937763791531324,
+ "skip_count": 2.0,
+ "step": 6294,
+ "text_loss": 0.14649681746959686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0003894940566864683,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10159164.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021474575623869896,
+ "skip_count": 0.0,
+ "step": 6296,
+ "text_loss": 0.5694304704666138
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 29.568241855004402,
+ "f1_execute": 0.9583333134651184,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.08251953125,
+ "learning_rate": 0.00038919221535984753,
+ "loss": 0.0073,
+ "macro_f1": 0.875,
+ "num_tokens": 10161806.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.040340203791856766,
+ "skip_count": 3.0,
+ "step": 6298,
+ "text_loss": 0.1574537754058838
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.57763428235985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.00038889041649497894,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10165669.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028486931696534157,
+ "skip_count": 0.0,
+ "step": 6300,
+ "text_loss": 0.9158071279525757
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0003885886602075123,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10168945.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006565484683960676,
+ "skip_count": 2.0,
+ "step": 6302,
+ "text_loss": 0.3530846834182739
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.00038828694661308116,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10171914.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009084723424166441,
+ "skip_count": 0.0,
+ "step": 6304,
+ "text_loss": 0.4603337347507477
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0003879852758273029,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 10175737.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004121702630072832,
+ "skip_count": 2.0,
+ "step": 6306,
+ "text_loss": 0.5294032096862793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00038768364796577814,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10178543.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013208909658715129,
+ "skip_count": 0.0,
+ "step": 6308,
+ "text_loss": 0.41084006428718567
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 29.62459641913707,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00038738206314409144,
+ "loss": 0.0079,
+ "macro_f1": 0.9247862696647644,
+ "num_tokens": 10181880.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.03674180060625076,
+ "skip_count": 6.0,
+ "step": 6310,
+ "text_loss": 0.6920746564865112
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0003870805214778106,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10185173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00221974472515285,
+ "skip_count": 2.0,
+ "step": 6312,
+ "text_loss": 0.1376657634973526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.0003867790230824869,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10188642.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001809283159673214,
+ "skip_count": 0.0,
+ "step": 6314,
+ "text_loss": 0.5220870971679688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0003864775680736552,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10191750.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013956360053271055,
+ "skip_count": 0.0,
+ "step": 6316,
+ "text_loss": 0.4109838902950287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.662166128558848,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.00038617615656683356,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10194578.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002947692759335041,
+ "skip_count": 2.0,
+ "step": 6318,
+ "text_loss": 0.4818590581417084
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0003858747886775232,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10197131.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008140999125316739,
+ "skip_count": 2.0,
+ "step": 6320,
+ "text_loss": 0.4004709720611572
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.68095098326974,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 0.0003855734645212093,
+ "loss": 0.0089,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 10199965.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.013056626543402672,
+ "skip_count": 2.0,
+ "step": 6322,
+ "text_loss": 0.3367139995098114
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.00038527218421335977,
+ "loss": 0.0087,
+ "macro_f1": 1.0,
+ "num_tokens": 10203184.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038112467154860497,
+ "skip_count": 2.0,
+ "step": 6324,
+ "text_loss": 0.5747989416122437
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0003849709478694255,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10206436.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001232540002092719,
+ "skip_count": 0.0,
+ "step": 6326,
+ "text_loss": 0.4981732964515686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.00038466975560484115,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10209889.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004343799781054258,
+ "skip_count": 0.0,
+ "step": 6328,
+ "text_loss": 0.2160186469554901
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.000384368607535024,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10212520.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014161963481456041,
+ "skip_count": 1.0,
+ "step": 6330,
+ "text_loss": 0.3556232154369354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0185546875,
+ "learning_rate": 0.0003840675037753745,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10215456.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014989010524004698,
+ "skip_count": 0.0,
+ "step": 6332,
+ "text_loss": 0.8510926961898804
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0003837664444412762,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10218558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006702739745378494,
+ "skip_count": 0.0,
+ "step": 6334,
+ "text_loss": 0.3995226323604584
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0003834654296480958,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10221862.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00826781615614891,
+ "skip_count": 2.0,
+ "step": 6336,
+ "text_loss": 0.3534671664237976
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0003831644595111825,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10224820.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002143894787877798,
+ "skip_count": 0.0,
+ "step": 6338,
+ "text_loss": 0.20216144621372223
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 29.76548282946874,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 0.0003828635341458687,
+ "loss": 0.0064,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 10227479.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012319118715822697,
+ "skip_count": 2.0,
+ "step": 6340,
+ "text_loss": 0.26248639822006226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.774875256824185,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.0003825626536674697,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10231347.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00334449321962893,
+ "skip_count": 0.0,
+ "step": 6342,
+ "text_loss": 0.6357201337814331
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.784267684179632,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.000382261818191283,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10234347.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027788348961621523,
+ "skip_count": 0.0,
+ "step": 6344,
+ "text_loss": 0.2813846468925476
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.00038196102783258996,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10237105.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001545077539049089,
+ "skip_count": 0.0,
+ "step": 6346,
+ "text_loss": 0.47612661123275757
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.060791015625,
+ "learning_rate": 0.0003816602827066537,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10240249.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005602670833468437,
+ "skip_count": 2.0,
+ "step": 6348,
+ "text_loss": 0.18197228014469147
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.812444966245963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0003813595829287204,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10243417.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004317959537729621,
+ "skip_count": 0.0,
+ "step": 6350,
+ "text_loss": 0.3818575143814087
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 0.0003810589286140186,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10246824.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002225276781246066,
+ "skip_count": 0.0,
+ "step": 6352,
+ "text_loss": 0.14129821956157684
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 29.831229820956853,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0003807583198777599,
+ "loss": 0.0062,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 10249836.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.02445496805012226,
+ "skip_count": 1.0,
+ "step": 6354,
+ "text_loss": 0.3237064480781555
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.840622248312297,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00038045775683513786,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10252900.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009264222462661564,
+ "skip_count": 0.0,
+ "step": 6356,
+ "text_loss": 0.6777551174163818
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 29.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.0003801572396013289,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 10255526.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007189550437033176,
+ "skip_count": 5.0,
+ "step": 6358,
+ "text_loss": 0.25438982248306274
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.859407103023187,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00037985676829149187,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10258865.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014201018493622541,
+ "skip_count": 0.0,
+ "step": 6360,
+ "text_loss": 0.5063154101371765
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 29.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0003795563430207678,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10261677.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035477925557643175,
+ "skip_count": 3.0,
+ "step": 6362,
+ "text_loss": 0.4815357029438019
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.878191957734078,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.0003792559639042803,
+ "loss": 0.0049,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 10264805.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013723359443247318,
+ "skip_count": 1.0,
+ "step": 6364,
+ "text_loss": 0.5563676357269287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 0.0003789556310571351,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10267885.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028159532230347395,
+ "skip_count": 0.0,
+ "step": 6366,
+ "text_loss": 0.7284183502197266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0003786553445944204,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10270934.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005918835522606969,
+ "skip_count": 0.0,
+ "step": 6368,
+ "text_loss": 0.7387746572494507
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.906369239800412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0003783551046312067,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10273818.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011416864581406116,
+ "skip_count": 0.0,
+ "step": 6370,
+ "text_loss": 0.5360285043716431
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 29.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.00037805491128254645,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 10276494.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.002382483799010515,
+ "skip_count": 1.0,
+ "step": 6372,
+ "text_loss": 0.7536854147911072
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.00037775476466347414,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10279719.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021104486659169197,
+ "skip_count": 1.0,
+ "step": 6374,
+ "text_loss": 0.6807253956794739
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.934546521866746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0003774546648890066,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10283000.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003148776013404131,
+ "skip_count": 2.0,
+ "step": 6376,
+ "text_loss": 0.30774110555648804
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 29.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0003771546120741426,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 10285666.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007700880523771048,
+ "skip_count": 1.0,
+ "step": 6378,
+ "text_loss": 0.4476076364517212
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 29.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0003768546063338631,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10289127.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023625255562365055,
+ "skip_count": 1.0,
+ "step": 6380,
+ "text_loss": 0.4350969195365906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0179443359375,
+ "learning_rate": 0.0003765546477831307,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10292485.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001428726245649159,
+ "skip_count": 0.0,
+ "step": 6382,
+ "text_loss": 0.49078530073165894
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 29.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0003762547365368902,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10295361.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027160397730767727,
+ "skip_count": 2.0,
+ "step": 6384,
+ "text_loss": 0.3476370573043823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 29.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.00037595487271006807,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10298717.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002456068294122815,
+ "skip_count": 0.0,
+ "step": 6386,
+ "text_loss": 0.3634916841983795
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 29.99090108599941,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.0003756550564175727,
+ "loss": 0.0049,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 10302102.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02546076290309429,
+ "skip_count": 3.0,
+ "step": 6388,
+ "text_loss": 0.2422582060098648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.00037535528777429426,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10305060.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001045907847583294,
+ "skip_count": 0.0,
+ "step": 6390,
+ "text_loss": 0.5563194155693054
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.009392427355444,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0003750555668951045,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 10307903.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007391332648694515,
+ "skip_count": 2.0,
+ "step": 6392,
+ "text_loss": 0.3423991799354553
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 30.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.00037475589389485744,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 10311396.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0029360291082412004,
+ "skip_count": 1.0,
+ "step": 6394,
+ "text_loss": 0.9877024292945862
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.028177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.00037445626888838807,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10314250.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014932662015780807,
+ "skip_count": 0.0,
+ "step": 6396,
+ "text_loss": 0.3978523313999176
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 30.037569709421778,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0003741566919905133,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 10316894.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007003722712397575,
+ "skip_count": 5.0,
+ "step": 6398,
+ "text_loss": 0.2945566475391388
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 30.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00037385716331603155,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 10319603.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006710570305585861,
+ "skip_count": 1.0,
+ "step": 6400,
+ "text_loss": 0.2984389662742615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0179443359375,
+ "learning_rate": 0.00037355768297972275,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10322670.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00048738415353000164,
+ "skip_count": 0.0,
+ "step": 6402,
+ "text_loss": 0.483262300491333
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 30.065746991488112,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 0.00037325825109634837,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 10326280.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001625525183044374,
+ "skip_count": 1.0,
+ "step": 6404,
+ "text_loss": 0.42678722739219666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.07513941884356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0003729588677806513,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10329008.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004408636130392551,
+ "skip_count": 0.0,
+ "step": 6406,
+ "text_loss": 0.2264070063829422
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.084531846199003,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0003726595331473557,
+ "loss": 0.0032,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10332533.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038099216762930155,
+ "skip_count": 2.0,
+ "step": 6408,
+ "text_loss": 0.6670092940330505
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.093924273554446,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0003723602473111672,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10335643.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003097689710557461,
+ "skip_count": 0.0,
+ "step": 6410,
+ "text_loss": 0.45228812098503113
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.00037206101038677274,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10338522.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005268602631986141,
+ "skip_count": 1.0,
+ "step": 6412,
+ "text_loss": 0.7288079857826233
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.112709128265337,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0003717618224888405,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10341516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004640138708055019,
+ "skip_count": 2.0,
+ "step": 6414,
+ "text_loss": 0.22850871086120605
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.00037146268373201954,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10344831.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006379318656399846,
+ "skip_count": 0.0,
+ "step": 6416,
+ "text_loss": 0.7864460945129395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.0003711635942309408,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10348499.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004005273221991956,
+ "skip_count": 0.0,
+ "step": 6418,
+ "text_loss": 0.605839192867279
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0157470703125,
+ "learning_rate": 0.0003708645541002159,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10351722.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001061634044162929,
+ "skip_count": 0.0,
+ "step": 6420,
+ "text_loss": 0.8226510286331177
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 30.150278837687114,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.0003705655634544374,
+ "loss": 0.0052,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 10355275.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013980664312839508,
+ "skip_count": 2.0,
+ "step": 6422,
+ "text_loss": 0.2709597647190094
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.159671265042558,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.0003702666224081792,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10359702.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013196271611377597,
+ "skip_count": 0.0,
+ "step": 6424,
+ "text_loss": 0.6451483368873596
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00036996773107599604,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10363364.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028023163322359324,
+ "skip_count": 1.0,
+ "step": 6426,
+ "text_loss": 0.2770799398422241
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01373291015625,
+ "learning_rate": 0.0003696688895724235,
+ "loss": 0.0029,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10366554.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011023655533790588,
+ "skip_count": 0.0,
+ "step": 6428,
+ "text_loss": 0.5466503500938416
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.187848547108892,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.0003693700980119784,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10369733.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00230707717128098,
+ "skip_count": 0.0,
+ "step": 6430,
+ "text_loss": 0.45667049288749695
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.19724097446434,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00036907135650915824,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10373382.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036784098483622074,
+ "skip_count": 2.0,
+ "step": 6432,
+ "text_loss": 0.13856995105743408
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.206633401819783,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.00036877266517844115,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10376202.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008461157558485866,
+ "skip_count": 0.0,
+ "step": 6434,
+ "text_loss": 0.27238601446151733
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.216025829175226,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.0003684740241342863,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10380748.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0052765593864023685,
+ "skip_count": 0.0,
+ "step": 6436,
+ "text_loss": 0.6182295083999634
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.225418256530673,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.00036817543349113355,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 10386148.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005562922917306423,
+ "skip_count": 2.0,
+ "step": 6438,
+ "text_loss": 0.5591027140617371
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.234810683886117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0003678768933634033,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10389385.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008686366491019726,
+ "skip_count": 0.0,
+ "step": 6440,
+ "text_loss": 0.5158660411834717
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0003675784038654968,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10391893.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022222092375159264,
+ "skip_count": 1.0,
+ "step": 6442,
+ "text_loss": 0.2865697741508484
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.253595538597008,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0003672799651117958,
+ "loss": 0.0099,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10395082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030799773521721363,
+ "skip_count": 2.0,
+ "step": 6444,
+ "text_loss": 0.21298295259475708
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 30.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0003669815772166625,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10398015.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035721305757761,
+ "skip_count": 3.0,
+ "step": 6446,
+ "text_loss": 0.5286803841590881
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 30.272380393307895,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 0.00036668324029443975,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10400749.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00741040613502264,
+ "skip_count": 4.0,
+ "step": 6448,
+ "text_loss": 0.3922366201877594
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.281772820663342,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.064453125,
+ "learning_rate": 0.0003663849544594507,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10404439.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002974750241264701,
+ "skip_count": 2.0,
+ "step": 6450,
+ "text_loss": 0.21894219517707825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 0.00036608671982599927,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10408476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004810616374015808,
+ "skip_count": 0.0,
+ "step": 6452,
+ "text_loss": 0.3928622305393219
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0003657885365083694,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10411533.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005527745466679335,
+ "skip_count": 0.0,
+ "step": 6454,
+ "text_loss": 0.22816279530525208
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.309950102729672,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052734375,
+ "learning_rate": 0.00036549040462082556,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10414501.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021297158673405647,
+ "skip_count": 0.0,
+ "step": 6456,
+ "text_loss": 0.20487719774246216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 30.31934253008512,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0003651923242776124,
+ "loss": 0.0082,
+ "macro_f1": 0.6592592597007751,
+ "num_tokens": 10418296.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.046412210911512375,
+ "skip_count": 5.0,
+ "step": 6458,
+ "text_loss": 0.2890419065952301
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.328734957440563,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.00036489429559295484,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10421211.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004002603702247143,
+ "skip_count": 0.0,
+ "step": 6460,
+ "text_loss": 0.23165544867515564
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.338127384796007,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0003645963186810581,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10424231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003480088198557496,
+ "skip_count": 1.0,
+ "step": 6462,
+ "text_loss": 0.6286683082580566
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0003642983936561075,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10427387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009358933195471764,
+ "skip_count": 2.0,
+ "step": 6464,
+ "text_loss": 0.3258316218852997
+ },
+ {
+ "acc_repeat": 0.800000011920929,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.356912239506897,
+ "f1_execute": 0.9729729890823364,
+ "f1_repeat": 0.888888955116272,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.00036400052063226816,
+ "loss": 0.0048,
+ "macro_f1": 0.9539539813995361,
+ "num_tokens": 10430813.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.03567950055003166,
+ "skip_count": 5.0,
+ "step": 6466,
+ "text_loss": 0.7278715968132019
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00036370269972368615,
+ "loss": 0.008,
+ "macro_f1": 1.0,
+ "num_tokens": 10434175.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00226925453171134,
+ "skip_count": 2.0,
+ "step": 6468,
+ "text_loss": 0.5652450919151306
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.375697094217788,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0174560546875,
+ "learning_rate": 0.0003634049310444867,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10437393.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013644809368997812,
+ "skip_count": 0.0,
+ "step": 6470,
+ "text_loss": 0.5985191464424133
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.38508952157323,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.0003631072147087753,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10440412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003114990540780127,
+ "skip_count": 0.0,
+ "step": 6472,
+ "text_loss": 0.5588209629058838
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.394481948928675,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.00036280955083063747,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10443471.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005486322334036231,
+ "skip_count": 0.0,
+ "step": 6474,
+ "text_loss": 0.6969016194343567
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.403874376284122,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00036251193952413865,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 10446548.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008256378583610058,
+ "skip_count": 2.0,
+ "step": 6476,
+ "text_loss": 0.27083566784858704
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.0003622143809033239,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10449478.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001008771825581789,
+ "skip_count": 0.0,
+ "step": 6478,
+ "text_loss": 0.1689433604478836
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00036191687508221827,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10453017.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0014678959269076586,
+ "skip_count": 0.0,
+ "step": 6480,
+ "text_loss": 0.9571998715400696
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.432051658350456,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 0.0003616194221748267,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10456061.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001516164978966117,
+ "skip_count": 0.0,
+ "step": 6482,
+ "text_loss": 0.5750429034233093
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.4414440857059,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.0003613220222951335,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10459130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031315975356847048,
+ "skip_count": 0.0,
+ "step": 6484,
+ "text_loss": 0.47120073437690735
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.450836513061343,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0003610246755571029,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10462190.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006079549202695489,
+ "skip_count": 0.0,
+ "step": 6486,
+ "text_loss": 0.8426173329353333
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.000360727382074679,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10465233.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00596054969355464,
+ "skip_count": 0.0,
+ "step": 6488,
+ "text_loss": 0.18435880541801453
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.469621367772234,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00036043014196178463,
+ "loss": 0.0046,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 10468135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008584967814385891,
+ "skip_count": 1.0,
+ "step": 6490,
+ "text_loss": 0.3827758729457855
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 30.479013795127678,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 0.00036013295533232344,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 10471032.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005076571833342314,
+ "skip_count": 5.0,
+ "step": 6492,
+ "text_loss": 0.1215854063630104
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 30.488406222483125,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.0003598358223001776,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10474779.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.005972118582576513,
+ "skip_count": 0.0,
+ "step": 6494,
+ "text_loss": 0.22768665850162506
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.49779864983857,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.0003595387429792091,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10478015.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004733685404062271,
+ "skip_count": 1.0,
+ "step": 6496,
+ "text_loss": 0.5013535618782043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.507191077194012,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.00035924171748325916,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10481113.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01148980576545,
+ "skip_count": 2.0,
+ "step": 6498,
+ "text_loss": 0.3281762897968292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.516583504549455,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0003589447459261487,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10484049.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007726775947958231,
+ "skip_count": 2.0,
+ "step": 6500,
+ "text_loss": 0.46294569969177246
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.525975931904902,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00035864782842167763,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10487443.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013331319205462933,
+ "skip_count": 0.0,
+ "step": 6502,
+ "text_loss": 0.5122153759002686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.00035835096508362544,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10490535.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011629529763013124,
+ "skip_count": 0.0,
+ "step": 6504,
+ "text_loss": 0.40683525800704956
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00035805415602575054,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10493575.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004780632443726063,
+ "skip_count": 0.0,
+ "step": 6506,
+ "text_loss": 0.37263134121894836
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.554153213971237,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.00035775740136179075,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10496193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018355643842369318,
+ "skip_count": 0.0,
+ "step": 6508,
+ "text_loss": 0.2074306458234787
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.56354564132668,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.00035746070120546314,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10500135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004067617934197187,
+ "skip_count": 1.0,
+ "step": 6510,
+ "text_loss": 0.26313406229019165
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.572938068682124,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00035716405567046383,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10503533.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005438363179564476,
+ "skip_count": 0.0,
+ "step": 6512,
+ "text_loss": 0.3448122441768646
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.00035686746487046767,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10506207.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012895528925582767,
+ "skip_count": 0.0,
+ "step": 6514,
+ "text_loss": 0.43096476793289185
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0003565709289191291,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10509257.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003141741268336773,
+ "skip_count": 0.0,
+ "step": 6516,
+ "text_loss": 0.22349724173545837
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.601115350748458,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 0.0003562744479300811,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10512554.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005669888923875988,
+ "skip_count": 0.0,
+ "step": 6518,
+ "text_loss": 0.5319190621376038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.610507778103905,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.00035597802201693587,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10515720.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020814717281609774,
+ "skip_count": 0.0,
+ "step": 6520,
+ "text_loss": 0.20216144621372223
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.61990020545935,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0003556816512932841,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 10518517.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010716461576521397,
+ "skip_count": 3.0,
+ "step": 6522,
+ "text_loss": 0.15843836963176727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.629292632814792,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01806640625,
+ "learning_rate": 0.0003553853358726959,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10521414.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014748790999874473,
+ "skip_count": 0.0,
+ "step": 6524,
+ "text_loss": 0.393892377614975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00035508907586871984,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10524210.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004757299611810595,
+ "skip_count": 0.0,
+ "step": 6526,
+ "text_loss": 0.2557907700538635
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.648077487525683,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.00035479287139488327,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10527327.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002445317106321454,
+ "skip_count": 0.0,
+ "step": 6528,
+ "text_loss": 0.48338422179222107
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.0003544967225646922,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10530363.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015845977468416095,
+ "skip_count": 0.0,
+ "step": 6530,
+ "text_loss": 0.6474354267120361
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.666862342236573,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.00035420062949163166,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10533444.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002190655330196023,
+ "skip_count": 0.0,
+ "step": 6532,
+ "text_loss": 0.3789777457714081
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.676254769592017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0003539045922891649,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10536711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00317079434171319,
+ "skip_count": 0.0,
+ "step": 6534,
+ "text_loss": 0.25758084654808044
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.68564719694746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00035360861107073394,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10539849.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010938458144664764,
+ "skip_count": 0.0,
+ "step": 6536,
+ "text_loss": 0.9821014404296875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.695039624302908,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0003533126859497592,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10543004.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003071998478844762,
+ "skip_count": 2.0,
+ "step": 6538,
+ "text_loss": 0.6314182281494141
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0003530168170396401,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10545965.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006067665759474039,
+ "skip_count": 2.0,
+ "step": 6540,
+ "text_loss": 0.5021927356719971
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0167236328125,
+ "learning_rate": 0.000352721004453754,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10549188.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019109295681118965,
+ "skip_count": 0.0,
+ "step": 6542,
+ "text_loss": 0.3008780777454376
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 30.723216906369238,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00035242524830545683,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10552298.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007457790896296501,
+ "skip_count": 3.0,
+ "step": 6544,
+ "text_loss": 0.5675695538520813
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.732609333724685,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.0003521295487080829,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10555123.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007243642583489418,
+ "skip_count": 1.0,
+ "step": 6546,
+ "text_loss": 0.17955881357192993
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.00035183390577494476,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10559653.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004024330526590347,
+ "skip_count": 0.0,
+ "step": 6548,
+ "text_loss": 0.2634682357311249
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.751394188435572,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017578125,
+ "learning_rate": 0.0003515383196193336,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10563770.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010837121866643429,
+ "skip_count": 0.0,
+ "step": 6550,
+ "text_loss": 0.1608252227306366
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0003512427903545183,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10567117.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003473864868283272,
+ "skip_count": 0.0,
+ "step": 6552,
+ "text_loss": 0.231611430644989
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.770179043146463,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0003509473180937464,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10570622.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004441239405423403,
+ "skip_count": 1.0,
+ "step": 6554,
+ "text_loss": 0.3193909227848053
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.779571470501907,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0654296875,
+ "learning_rate": 0.0003506519029502433,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10573411.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008821079391054809,
+ "skip_count": 0.0,
+ "step": 6556,
+ "text_loss": 0.4478783905506134
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.788963897857354,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.0003503565450372128,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10576422.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0014448441797867417,
+ "skip_count": 0.0,
+ "step": 6558,
+ "text_loss": 0.46065983176231384
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.798356325212797,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0003500612444678365,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10579879.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007939066737890244,
+ "skip_count": 1.0,
+ "step": 6560,
+ "text_loss": 0.3299395740032196
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.80774875256824,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.000349766001355274,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10583067.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010073966346681118,
+ "skip_count": 2.0,
+ "step": 6562,
+ "text_loss": 0.278255820274353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.817141179923688,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.00034947081581266335,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10586276.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0062315030954778194,
+ "skip_count": 1.0,
+ "step": 6564,
+ "text_loss": 0.22706018388271332
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0003491756879531201,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 10589257.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.0023778853937983513,
+ "skip_count": 4.0,
+ "step": 6566,
+ "text_loss": 0.5567800998687744
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 0.0003488806178897377,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10592163.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004184350254945457,
+ "skip_count": 0.0,
+ "step": 6568,
+ "text_loss": 0.4027897119522095
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.845318461990022,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0003485856057355876,
+ "loss": 0.0027,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10595326.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035254736430943012,
+ "skip_count": 1.0,
+ "step": 6570,
+ "text_loss": 0.3044572174549103
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.854710889345466,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.000348290651603719,
+ "loss": 0.0029,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10598236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030894684605300426,
+ "skip_count": 0.0,
+ "step": 6572,
+ "text_loss": 0.23021161556243896
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 30.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.00034799575560715896,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10601653.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0036557347048074007,
+ "skip_count": 0.0,
+ "step": 6574,
+ "text_loss": 0.5437754392623901
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.0003477009178589121,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 10604581.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.021344119682908058,
+ "skip_count": 4.0,
+ "step": 6576,
+ "text_loss": 0.29078927636146545
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 30.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0003474061384719608,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 10607676.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037169242277741432,
+ "skip_count": 1.0,
+ "step": 6578,
+ "text_loss": 1.1790896654129028
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.892280598767243,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.0003471114175592649,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 10611269.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005873420741409063,
+ "skip_count": 4.0,
+ "step": 6580,
+ "text_loss": 0.36204129457473755
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.901673026122687,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0003468167552337624,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 10614335.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01030842587351799,
+ "skip_count": 2.0,
+ "step": 6582,
+ "text_loss": 0.20400437712669373
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.911065453478134,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.061767578125,
+ "learning_rate": 0.00034652215160836826,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10617565.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025721401907503605,
+ "skip_count": 0.0,
+ "step": 6584,
+ "text_loss": 0.44676345586776733
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 30.920457880833577,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.00034622760679597507,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10620706.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005751762073487043,
+ "skip_count": 1.0,
+ "step": 6586,
+ "text_loss": 0.4733653664588928
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 30.92985030818902,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.00034593312090945306,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10623916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029759553726762533,
+ "skip_count": 3.0,
+ "step": 6588,
+ "text_loss": 0.49876922369003296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.939242735544468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0003456386940616498,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10628093.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010031822603195906,
+ "skip_count": 0.0,
+ "step": 6590,
+ "text_loss": 0.42708611488342285
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.00034534432636539004,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10631739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014793311711400747,
+ "skip_count": 0.0,
+ "step": 6592,
+ "text_loss": 0.18193726241588593
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0003450500179334762,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10634862.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0059733521193265915,
+ "skip_count": 2.0,
+ "step": 6594,
+ "text_loss": 0.28596529364585876
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.967420017610802,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0003447557688786879,
+ "loss": 0.0043,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 10637758.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0076768649742007256,
+ "skip_count": 1.0,
+ "step": 6596,
+ "text_loss": 0.39428210258483887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.976812444966246,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00034446157931378185,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10640440.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015128811355680227,
+ "skip_count": 0.0,
+ "step": 6598,
+ "text_loss": 0.45584383606910706
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 30.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043701171875,
+ "learning_rate": 0.00034416744935149193,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10643600.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000757391273509711,
+ "skip_count": 0.0,
+ "step": 6600,
+ "text_loss": 0.503209114074707
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 30.995597299677137,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.060302734375,
+ "learning_rate": 0.0003438733791045294,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10646907.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025944956578314304,
+ "skip_count": 2.0,
+ "step": 6602,
+ "text_loss": 0.4370735287666321
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.00469621367772,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00034357936868558255,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10649995.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006543452036567032,
+ "skip_count": 0.0,
+ "step": 6604,
+ "text_loss": 0.4125586748123169
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.014088641033165,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00034328541820731663,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10653251.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00027016724925488234,
+ "skip_count": 1.0,
+ "step": 6606,
+ "text_loss": 0.7309898734092712
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 31.023481068388612,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 0.00034299152778237413,
+ "loss": 0.0062,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 10657229.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01905548945069313,
+ "skip_count": 2.0,
+ "step": 6608,
+ "text_loss": 0.42367079854011536
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.0003426976975233744,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10660524.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004718089767266065,
+ "skip_count": 0.0,
+ "step": 6610,
+ "text_loss": 0.6613664627075195
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.00034240392754291343,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10663908.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0027069442439824343,
+ "skip_count": 0.0,
+ "step": 6612,
+ "text_loss": 0.859471321105957
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.051658350454947,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.000342110217953565,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10667814.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015497280983254313,
+ "skip_count": 0.0,
+ "step": 6614,
+ "text_loss": 0.18337638676166534
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.06105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.0003418165688678788,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10671630.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013396464055404067,
+ "skip_count": 0.0,
+ "step": 6616,
+ "text_loss": 0.860016405582428
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 31.070443205165834,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.0003415229803983819,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10675308.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007542039267718792,
+ "skip_count": 3.0,
+ "step": 6618,
+ "text_loss": 0.15481022000312805
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0003412294526575779,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10678092.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002029839437454939,
+ "skip_count": 2.0,
+ "step": 6620,
+ "text_loss": 0.5121933221817017
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00034093598575794706,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10681382.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013001341139897704,
+ "skip_count": 0.0,
+ "step": 6622,
+ "text_loss": 0.4555061161518097
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.098620487232168,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00034064257981194655,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10684255.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007926415419206023,
+ "skip_count": 0.0,
+ "step": 6624,
+ "text_loss": 0.7298227548599243
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.108012914587615,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0003403492349320101,
+ "loss": 0.0031,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10686904.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021080176811665297,
+ "skip_count": 1.0,
+ "step": 6626,
+ "text_loss": 0.45434215664863586
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.11740534194306,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.000340055951230548,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10690311.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004011874087154865,
+ "skip_count": 0.0,
+ "step": 6628,
+ "text_loss": 0.15496443212032318
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.126797769298502,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00033976272881994707,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10693395.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031893099658191204,
+ "skip_count": 2.0,
+ "step": 6630,
+ "text_loss": 0.5291517972946167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0003394695678125708,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10697046.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033124347683042288,
+ "skip_count": 1.0,
+ "step": 6632,
+ "text_loss": 0.2893230617046356
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.145582624009393,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.00033917646832075886,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10700111.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002547801472246647,
+ "skip_count": 0.0,
+ "step": 6634,
+ "text_loss": 0.10363512486219406
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 31.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.0003388834304568275,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10703939.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0019040531478822231,
+ "skip_count": 0.0,
+ "step": 6636,
+ "text_loss": 0.5185034275054932
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.164367478720283,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00033859045433306975,
+ "loss": 0.0034,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10707187.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0074104927480220795,
+ "skip_count": 2.0,
+ "step": 6638,
+ "text_loss": 0.1618153154850006
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.173759906075727,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048583984375,
+ "learning_rate": 0.0003382975400617543,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10710029.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013861875049769878,
+ "skip_count": 1.0,
+ "step": 6640,
+ "text_loss": 0.6674485206604004
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.18315233343117,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0003380046877551266,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10713318.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034452753607183695,
+ "skip_count": 0.0,
+ "step": 6642,
+ "text_loss": 0.39299124479293823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.192544760786618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0003377118975254082,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10716130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006802885327488184,
+ "skip_count": 2.0,
+ "step": 6644,
+ "text_loss": 0.12942606210708618
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.20193718814206,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0003374191694847968,
+ "loss": 0.0052,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 10719400.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03718209266662598,
+ "skip_count": 2.0,
+ "step": 6646,
+ "text_loss": 0.34327754378318787
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0003371265037454663,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10722108.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006016947794705629,
+ "skip_count": 2.0,
+ "step": 6648,
+ "text_loss": 0.15644726157188416
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.220722042852948,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00033683390041956663,
+ "loss": 0.0075,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 10725709.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04308273270726204,
+ "skip_count": 2.0,
+ "step": 6650,
+ "text_loss": 0.1875772923231125
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 31.230114470208395,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0003365413596192243,
+ "loss": 0.0037,
+ "macro_f1": 1.0,
+ "num_tokens": 10728717.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006372809875756502,
+ "skip_count": 1.0,
+ "step": 6652,
+ "text_loss": 0.4948291778564453
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00033624888145654137,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10732082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014530479675158858,
+ "skip_count": 0.0,
+ "step": 6654,
+ "text_loss": 0.44932305812835693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.248899324919282,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00033595646604359585,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10734663.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001924810465425253,
+ "skip_count": 0.0,
+ "step": 6656,
+ "text_loss": 0.45626893639564514
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00033566411349244206,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10737470.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0040014320984482765,
+ "skip_count": 0.0,
+ "step": 6658,
+ "text_loss": 0.2700682580471039
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.267684179630173,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.00033537182391510996,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10740228.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008573737577535212,
+ "skip_count": 0.0,
+ "step": 6660,
+ "text_loss": 0.5626822113990784
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.277076606985617,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0003350795974236055,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10742883.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011166860349476337,
+ "skip_count": 1.0,
+ "step": 6662,
+ "text_loss": 0.23357805609703064
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 31.286469034341064,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00033478743412991037,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 10746459.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01719980500638485,
+ "skip_count": 6.0,
+ "step": 6664,
+ "text_loss": 0.150017648935318
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.295861461696507,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.00033449533414598223,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10749984.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038280142471194267,
+ "skip_count": 2.0,
+ "step": 6666,
+ "text_loss": 0.6312657594680786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.30525388905195,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.00033420329758375423,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10752792.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007688060286454856,
+ "skip_count": 1.0,
+ "step": 6668,
+ "text_loss": 0.6794863939285278
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.314646316407398,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.00033391132455513537,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10756125.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003196930279955268,
+ "skip_count": 2.0,
+ "step": 6670,
+ "text_loss": 0.22897565364837646
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0003336194151720102,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10759296.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026212623342871666,
+ "skip_count": 0.0,
+ "step": 6672,
+ "text_loss": 0.5236268639564514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.0003333275695462391,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10762574.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007855101488530636,
+ "skip_count": 2.0,
+ "step": 6674,
+ "text_loss": 0.2971038818359375
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.342823598473732,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0003330357877896577,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10765758.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004191791173070669,
+ "skip_count": 2.0,
+ "step": 6676,
+ "text_loss": 0.17358586192131042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.352216025829176,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.0003327440700140774,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10769396.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004101858474314213,
+ "skip_count": 1.0,
+ "step": 6678,
+ "text_loss": 0.28932204842567444
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.000332452416331285,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10772605.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008305918308906257,
+ "skip_count": 0.0,
+ "step": 6680,
+ "text_loss": 0.47090092301368713
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 0.0003321608268530427,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10776576.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003022305201739073,
+ "skip_count": 1.0,
+ "step": 6682,
+ "text_loss": 0.4467788338661194
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.00033186930169108795,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10779648.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0021474999375641346,
+ "skip_count": 0.0,
+ "step": 6684,
+ "text_loss": 0.6249470710754395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.389785735250953,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.054931640625,
+ "learning_rate": 0.00033157784095713417,
+ "loss": 0.009,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10782665.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025120675563812256,
+ "skip_count": 1.0,
+ "step": 6686,
+ "text_loss": 0.6763803958892822
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.399178162606397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0003312864447628695,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10785789.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013111691223457456,
+ "skip_count": 1.0,
+ "step": 6688,
+ "text_loss": 0.6609058380126953
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.408570589961844,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.00033099511321995744,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10788846.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012354454956948757,
+ "skip_count": 0.0,
+ "step": 6690,
+ "text_loss": 0.4421829283237457
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.417963017317287,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0003307038464400368,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10791611.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035219944547861814,
+ "skip_count": 2.0,
+ "step": 6692,
+ "text_loss": 0.16222824156284332
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.42735544467273,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.00033041264453472153,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10794868.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0007216202793642879,
+ "skip_count": 0.0,
+ "step": 6694,
+ "text_loss": 0.37388721108436584
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 31.436747872028178,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0003301215076156008,
+ "loss": 0.0063,
+ "macro_f1": 0.8803418874740601,
+ "num_tokens": 10797737.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.025403080508112907,
+ "skip_count": 7.0,
+ "step": 6696,
+ "text_loss": 0.5086690187454224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0003298304357942389,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10800972.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010532539337873459,
+ "skip_count": 2.0,
+ "step": 6698,
+ "text_loss": 0.22500646114349365
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.00032953942918217494,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10803654.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009591903653927147,
+ "skip_count": 0.0,
+ "step": 6700,
+ "text_loss": 0.6256277561187744
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.464925154094512,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0003292484878909232,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10807506.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003801517654210329,
+ "skip_count": 2.0,
+ "step": 6702,
+ "text_loss": 0.522081196308136
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.474317581449956,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0189208984375,
+ "learning_rate": 0.00032895761203197317,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10810163.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002608039416372776,
+ "skip_count": 2.0,
+ "step": 6704,
+ "text_loss": 0.3600201904773712
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00032866680171678874,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10813202.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026464913971722126,
+ "skip_count": 0.0,
+ "step": 6706,
+ "text_loss": 0.2513798773288727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.493102436160846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.00032837605705680895,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10816484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027157769072800875,
+ "skip_count": 0.0,
+ "step": 6708,
+ "text_loss": 0.34391456842422485
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 31.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 0.0003280853781634481,
+ "loss": 0.0041,
+ "macro_f1": 1.0,
+ "num_tokens": 10819794.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016086180694401264,
+ "skip_count": 1.0,
+ "step": 6710,
+ "text_loss": 0.6535179615020752
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0003277947651480946,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10823033.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002368347719311714,
+ "skip_count": 0.0,
+ "step": 6712,
+ "text_loss": 0.5596423745155334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.0003275042181221119,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10826276.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003124286886304617,
+ "skip_count": 0.0,
+ "step": 6714,
+ "text_loss": 0.6584402322769165
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.530672145582624,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0003272137371968382,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10828846.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006088328082114458,
+ "skip_count": 0.0,
+ "step": 6716,
+ "text_loss": 0.4602710008621216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.540064572938068,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.00032692332248358645,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10832025.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002511275466531515,
+ "skip_count": 2.0,
+ "step": 6718,
+ "text_loss": 0.42790886759757996
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.000326632974093644,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10835110.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01076667383313179,
+ "skip_count": 0.0,
+ "step": 6720,
+ "text_loss": 0.5659847855567932
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 31.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.0003263426921382728,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 10838279.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.004973042290657759,
+ "skip_count": 2.0,
+ "step": 6722,
+ "text_loss": 0.675341010093689
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.568241855004402,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.00032605247672870964,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10841381.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013990222942084074,
+ "skip_count": 0.0,
+ "step": 6724,
+ "text_loss": 0.5389315485954285
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.57763428235985,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.00032576232797616554,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10844583.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003186358604580164,
+ "skip_count": 1.0,
+ "step": 6726,
+ "text_loss": 0.5603348016738892
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.587026709715293,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0003254722459918261,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10847670.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001443870598450303,
+ "skip_count": 0.0,
+ "step": 6728,
+ "text_loss": 0.6922405362129211
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.596419137070736,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0003251822308868512,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10851479.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004294445738196373,
+ "skip_count": 0.0,
+ "step": 6730,
+ "text_loss": 0.7145437002182007
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.00032489228277237514,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10854489.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032078945077955723,
+ "skip_count": 0.0,
+ "step": 6732,
+ "text_loss": 0.4077773094177246
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.615203991781627,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.00032460240175950664,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 10856954.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038214854430407286,
+ "skip_count": 2.0,
+ "step": 6734,
+ "text_loss": 0.32071781158447266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0003243125879593286,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10860016.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013407845981419086,
+ "skip_count": 0.0,
+ "step": 6736,
+ "text_loss": 0.45335495471954346
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0003240228414828984,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10863021.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010989385191351175,
+ "skip_count": 0.0,
+ "step": 6738,
+ "text_loss": 0.562619149684906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 0.0003237331624412473,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10866548.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006139552686363459,
+ "skip_count": 0.0,
+ "step": 6740,
+ "text_loss": 0.14510060846805573
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.652773701203404,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00032344355094538087,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10869402.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004785746335983276,
+ "skip_count": 0.0,
+ "step": 6742,
+ "text_loss": 0.5655979514122009
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.662166128558848,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.00032315400710627876,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10874165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0052397786639630795,
+ "skip_count": 0.0,
+ "step": 6744,
+ "text_loss": 0.4785873591899872
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 31.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.0003228645310348948,
+ "loss": 0.0036,
+ "macro_f1": 1.0,
+ "num_tokens": 10876919.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.00460197776556015,
+ "skip_count": 1.0,
+ "step": 6746,
+ "text_loss": 0.5683879256248474
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.0003225751228421566,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10880179.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032690472435206175,
+ "skip_count": 0.0,
+ "step": 6748,
+ "text_loss": 0.5268497467041016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.690343410625182,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 0.00032228578263896607,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10883711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036305058747529984,
+ "skip_count": 0.0,
+ "step": 6750,
+ "text_loss": 0.16675594449043274
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.69973583798063,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.0003219965105361989,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10887041.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002453352091833949,
+ "skip_count": 1.0,
+ "step": 6752,
+ "text_loss": 0.7010246515274048
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.709128265336073,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.00032170730664470465,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10890053.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020381701178848743,
+ "skip_count": 0.0,
+ "step": 6754,
+ "text_loss": 0.46637895703315735
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.718520692691516,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0003214181710753069,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10893501.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004525696858763695,
+ "skip_count": 0.0,
+ "step": 6756,
+ "text_loss": 0.1768684983253479
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.727913120046964,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0003211291039388026,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10896480.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038154330104589462,
+ "skip_count": 0.0,
+ "step": 6758,
+ "text_loss": 0.7908347845077515
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.737305547402407,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.00032084010534596326,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10899158.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004711449146270752,
+ "skip_count": 2.0,
+ "step": 6760,
+ "text_loss": 0.37209007143974304
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 31.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0003205511754075335,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 10901791.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0025003373157233,
+ "skip_count": 1.0,
+ "step": 6762,
+ "text_loss": 0.8081201314926147
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 31.756090402113298,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.00032026231423423204,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10904817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007387075573205948,
+ "skip_count": 3.0,
+ "step": 6764,
+ "text_loss": 0.30355480313301086
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.76548282946874,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0003199735219367507,
+ "loss": 0.0061,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 10908018.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04275592789053917,
+ "skip_count": 0.0,
+ "step": 6766,
+ "text_loss": 0.26562029123306274
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.774875256824185,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0003196847986257553,
+ "loss": 0.008,
+ "macro_f1": 0.9255813956260681,
+ "num_tokens": 10911264.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.034824032336473465,
+ "skip_count": 4.0,
+ "step": 6768,
+ "text_loss": 0.2761698067188263
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.784267684179632,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00031939614441188523,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10915964.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011179742868989706,
+ "skip_count": 0.0,
+ "step": 6770,
+ "text_loss": 0.4107927083969116
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.00031910755940575344,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10918678.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011521469568833709,
+ "skip_count": 0.0,
+ "step": 6772,
+ "text_loss": 0.43064895272254944
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 31.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01708984375,
+ "learning_rate": 0.000318819043717946,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 10921757.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002861087443307042,
+ "skip_count": 1.0,
+ "step": 6774,
+ "text_loss": 0.5945150852203369
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.812444966245963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.0003185305974590229,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10924767.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011365334503352642,
+ "skip_count": 0.0,
+ "step": 6776,
+ "text_loss": 0.36615172028541565
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 31.82183739360141,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0003182422207395171,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10927750.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0034391419030725956,
+ "skip_count": 0.0,
+ "step": 6778,
+ "text_loss": 0.17081251740455627
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.831229820956853,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0003179539136699351,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10930817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004941808991134167,
+ "skip_count": 2.0,
+ "step": 6780,
+ "text_loss": 0.7683762311935425
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 31.840622248312297,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.00031766567636075675,
+ "loss": 0.0061,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 10933882.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017502857372164726,
+ "skip_count": 2.0,
+ "step": 6782,
+ "text_loss": 0.38010457158088684
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0003173775089224353,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 10936909.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0035372809506952763,
+ "skip_count": 2.0,
+ "step": 6784,
+ "text_loss": 0.5760656595230103
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.859407103023187,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.00031708941146539707,
+ "loss": 0.0061,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 10940032.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02229934185743332,
+ "skip_count": 0.0,
+ "step": 6786,
+ "text_loss": 0.5767728090286255
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00031680138410004123,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10943217.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028649091254919767,
+ "skip_count": 1.0,
+ "step": 6788,
+ "text_loss": 0.9756367802619934
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.878191957734078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.00031651342693674066,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10947847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0039158593863248825,
+ "skip_count": 2.0,
+ "step": 6790,
+ "text_loss": 0.2504335045814514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.88758438508952,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.000316225540085841,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10950879.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022091215942054987,
+ "skip_count": 0.0,
+ "step": 6792,
+ "text_loss": 0.525842547416687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.896976812444965,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.00031593772365766105,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10954960.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006841494468972087,
+ "skip_count": 0.0,
+ "step": 6794,
+ "text_loss": 0.6383582353591919
+ },
+ {
+ "acc_repeat": 0.800000011920929,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 31.906369239800412,
+ "f1_execute": 0.9729729890823364,
+ "f1_repeat": 0.888888955116272,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0003156499777624926,
+ "loss": 0.006,
+ "macro_f1": 0.9539539813995361,
+ "num_tokens": 10958278.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.03810702636837959,
+ "skip_count": 5.0,
+ "step": 6796,
+ "text_loss": 0.5901661515235901
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01708984375,
+ "learning_rate": 0.0003153623025106005,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10962412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00046833412488922477,
+ "skip_count": 0.0,
+ "step": 6798,
+ "text_loss": 0.42693984508514404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00031507469801222233,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10966037.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006818041671067476,
+ "skip_count": 2.0,
+ "step": 6800,
+ "text_loss": 0.5326262712478638
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.934546521866746,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00031478716437756876,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10969369.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029889161232858896,
+ "skip_count": 0.0,
+ "step": 6802,
+ "text_loss": 0.49028220772743225
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.94393894922219,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0003144997017168232,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10972016.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038266500923782587,
+ "skip_count": 2.0,
+ "step": 6804,
+ "text_loss": 0.43391722440719604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.953331376577633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0189208984375,
+ "learning_rate": 0.0003142123101401417,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10975153.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005866789724677801,
+ "skip_count": 0.0,
+ "step": 6806,
+ "text_loss": 0.5888382196426392
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.00031392498975765353,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10977881.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002122384263202548,
+ "skip_count": 0.0,
+ "step": 6808,
+ "text_loss": 0.30313390493392944
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0003136377406794604,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10982025.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005535652744583786,
+ "skip_count": 0.0,
+ "step": 6810,
+ "text_loss": 0.5788959264755249
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 31.981508658643968,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0003133505630156365,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 10985419.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010623604990541935,
+ "skip_count": 2.0,
+ "step": 6812,
+ "text_loss": 0.18577243387699127
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 31.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.00031306345687622905,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10989116.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004721239674836397,
+ "skip_count": 0.0,
+ "step": 6814,
+ "text_loss": 0.4818301200866699
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0167236328125,
+ "learning_rate": 0.0003127764223712575,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 10992064.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004238430701661855,
+ "skip_count": 0.0,
+ "step": 6816,
+ "text_loss": 0.7482771277427673
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.00939242735544,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0003124894596107141,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 10994903.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005224394146353006,
+ "skip_count": 2.0,
+ "step": 6818,
+ "text_loss": 0.186603844165802
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.00031220256870456356,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 10998692.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0021751862950623035,
+ "skip_count": 2.0,
+ "step": 6820,
+ "text_loss": 0.45633986592292786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 32.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.00031191574976274284,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11001284.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004747046157717705,
+ "skip_count": 4.0,
+ "step": 6822,
+ "text_loss": 0.5651670694351196
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0003116290028951617,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11004293.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008316585444845259,
+ "skip_count": 0.0,
+ "step": 6824,
+ "text_loss": 0.3167279362678528
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055419921875,
+ "learning_rate": 0.000311342328211702,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11007080.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004732926026917994,
+ "skip_count": 0.0,
+ "step": 6826,
+ "text_loss": 0.49171411991119385
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.000311055725822218,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11010078.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004238729365170002,
+ "skip_count": 0.0,
+ "step": 6828,
+ "text_loss": 0.21484950184822083
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0003107691958365361,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11013368.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029175232630223036,
+ "skip_count": 2.0,
+ "step": 6830,
+ "text_loss": 0.3718266189098358
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0003104827383644555,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11016704.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00191891985014081,
+ "skip_count": 0.0,
+ "step": 6832,
+ "text_loss": 0.28772637248039246
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00031019635351574705,
+ "loss": 0.0035,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11019651.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004300855100154877,
+ "skip_count": 2.0,
+ "step": 6834,
+ "text_loss": 0.6583508849143982
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.09392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.000309910041400154,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11023847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00037701442488469183,
+ "skip_count": 0.0,
+ "step": 6836,
+ "text_loss": 0.36090534925460815
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 32.10331670090989,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.0003096238021273917,
+ "loss": 0.0077,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 11027804.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03601725772023201,
+ "skip_count": 3.0,
+ "step": 6838,
+ "text_loss": 0.24180401861667633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.11270912826534,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.00030933763580714757,
+ "loss": 0.0052,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 11030778.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023780640214681625,
+ "skip_count": 2.0,
+ "step": 6840,
+ "text_loss": 0.4978102743625641
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00030905154254908104,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11034863.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00565778324380517,
+ "skip_count": 0.0,
+ "step": 6842,
+ "text_loss": 0.558772623538971
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.00030876552246282356,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11038488.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010575232096016407,
+ "skip_count": 0.0,
+ "step": 6844,
+ "text_loss": 0.2955974340438843
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.0003084795756579787,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11041796.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015910190995782614,
+ "skip_count": 0.0,
+ "step": 6846,
+ "text_loss": 0.5009704828262329
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0003081937022441217,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11045141.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008034126949496567,
+ "skip_count": 0.0,
+ "step": 6848,
+ "text_loss": 0.3965311646461487
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 32.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0003079079023307999,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11047814.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00810160581022501,
+ "skip_count": 0.0,
+ "step": 6850,
+ "text_loss": 0.24341927468776703
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0003076221760275321,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11051330.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006590691395103931,
+ "skip_count": 0.0,
+ "step": 6852,
+ "text_loss": 0.5887606739997864
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00030733652344380936,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11055006.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005845054984092712,
+ "skip_count": 0.0,
+ "step": 6854,
+ "text_loss": 0.6621366739273071
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0003070509446890944,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11058470.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041051446460187435,
+ "skip_count": 1.0,
+ "step": 6856,
+ "text_loss": 0.31603100895881653
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0003067654398728214,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11061620.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001603201380930841,
+ "skip_count": 0.0,
+ "step": 6858,
+ "text_loss": 0.5167516469955444
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 0.00030648000910439636,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11064727.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024816282093524933,
+ "skip_count": 0.0,
+ "step": 6860,
+ "text_loss": 0.5869330167770386
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.21602582917523,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00030619465249319693,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11068208.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003121294779703021,
+ "skip_count": 0.0,
+ "step": 6862,
+ "text_loss": 0.3920222818851471
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 0.0003059093701485722,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11071315.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033239589538425207,
+ "skip_count": 1.0,
+ "step": 6864,
+ "text_loss": 0.4201887845993042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 0.00030562416217984296,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11074144.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016117560444399714,
+ "skip_count": 0.0,
+ "step": 6866,
+ "text_loss": 0.5283045172691345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.0003053390286963015,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11077152.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003879208816215396,
+ "skip_count": 0.0,
+ "step": 6868,
+ "text_loss": 0.16188788414001465
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.00030505396980721143,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11080200.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007632353343069553,
+ "skip_count": 1.0,
+ "step": 6870,
+ "text_loss": 0.25986847281455994
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00030476898562180793,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11083356.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004322016146034002,
+ "skip_count": 2.0,
+ "step": 6872,
+ "text_loss": 0.49556297063827515
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.0003044840762492974,
+ "loss": 0.0037,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11086354.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031272871419787407,
+ "skip_count": 2.0,
+ "step": 6874,
+ "text_loss": 0.1658666580915451
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.0003041992417988577,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11088850.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005371398758143187,
+ "skip_count": 2.0,
+ "step": 6876,
+ "text_loss": 0.22437214851379395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.0003039144823796378,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11091784.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025086402893066406,
+ "skip_count": 0.0,
+ "step": 6878,
+ "text_loss": 0.7293354868888855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.0003036297981007581,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11095204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015590827912092209,
+ "skip_count": 1.0,
+ "step": 6880,
+ "text_loss": 0.6406328678131104
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.0003033451890713103,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11098367.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013142531970515847,
+ "skip_count": 0.0,
+ "step": 6882,
+ "text_loss": 0.5209086537361145
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 32.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0003030606554003571,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11101047.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0018484699539840221,
+ "skip_count": 0.0,
+ "step": 6884,
+ "text_loss": 0.743188202381134
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.00030277619719693217,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11104269.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016667681047692895,
+ "skip_count": 0.0,
+ "step": 6886,
+ "text_loss": 0.7918420433998108
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.0003024918145700406,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11107248.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008098077378235757,
+ "skip_count": 0.0,
+ "step": 6888,
+ "text_loss": 0.3871288299560547
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.0003022075076286582,
+ "loss": 0.0031,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11111204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002324736909940839,
+ "skip_count": 0.0,
+ "step": 6890,
+ "text_loss": 0.3722921907901764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0003019232764817321,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11114363.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00254769716411829,
+ "skip_count": 0.0,
+ "step": 6892,
+ "text_loss": 0.418519526720047
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.00030163912123818006,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11117718.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000547234492842108,
+ "skip_count": 0.0,
+ "step": 6894,
+ "text_loss": 0.6087009310722351
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.0003013550420068909,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11120437.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00015221568173728883,
+ "skip_count": 0.0,
+ "step": 6896,
+ "text_loss": 0.6013991832733154
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 32.385089521573235,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.00030107103889672436,
+ "loss": 0.0085,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 11123708.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.024048971012234688,
+ "skip_count": 2.0,
+ "step": 6898,
+ "text_loss": 0.3612423837184906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0003007871120165111,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11127294.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013236473314464092,
+ "skip_count": 0.0,
+ "step": 6900,
+ "text_loss": 0.5277031064033508
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.00030050326147505226,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11130270.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028277861420065165,
+ "skip_count": 0.0,
+ "step": 6902,
+ "text_loss": 0.5726971626281738
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0003002194873811197,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11132955.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022369837388396263,
+ "skip_count": 0.0,
+ "step": 6904,
+ "text_loss": 0.18510448932647705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.00029993578984345673,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11136387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038351211696863174,
+ "skip_count": 0.0,
+ "step": 6906,
+ "text_loss": 0.28313153982162476
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.43205165835045,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0002996521689707764,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11139740.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00032925375853665173,
+ "skip_count": 0.0,
+ "step": 6908,
+ "text_loss": 0.7315025329589844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.441444085705896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0002993686248717629,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11142587.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002886304398998618,
+ "skip_count": 0.0,
+ "step": 6910,
+ "text_loss": 0.677378237247467
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00029908515765507084,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11145415.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038471966981887817,
+ "skip_count": 0.0,
+ "step": 6912,
+ "text_loss": 0.5207083225250244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0002988017674293254,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11148524.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023522782139480114,
+ "skip_count": 0.0,
+ "step": 6914,
+ "text_loss": 0.42507871985435486
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0189208984375,
+ "learning_rate": 0.0002985184543031222,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11152069.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012464249739423394,
+ "skip_count": 0.0,
+ "step": 6916,
+ "text_loss": 0.5694169998168945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.47901379512768,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 0.0002982352183850274,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11155675.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00828156154602766,
+ "skip_count": 2.0,
+ "step": 6918,
+ "text_loss": 0.22304373979568481
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.00029795205978357754,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11158555.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019234733190387487,
+ "skip_count": 0.0,
+ "step": 6920,
+ "text_loss": 0.5519064664840698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.0002976689786072795,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11161407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003542431222740561,
+ "skip_count": 0.0,
+ "step": 6922,
+ "text_loss": 0.6748810410499573
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0002973859749646104,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11166007.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004024899681098759,
+ "skip_count": 0.0,
+ "step": 6924,
+ "text_loss": 0.6613664627075195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 32.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.000297103048964018,
+ "loss": 0.0076,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11169007.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005519595462828875,
+ "skip_count": 3.0,
+ "step": 6926,
+ "text_loss": 0.3815552592277527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.00029682020071392,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11172939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016999440267682076,
+ "skip_count": 0.0,
+ "step": 6928,
+ "text_loss": 0.6727893352508545
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.535368359260346,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0002965374303227044,
+ "loss": 0.0055,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 11176232.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.030950307846069336,
+ "skip_count": 0.0,
+ "step": 6930,
+ "text_loss": 0.5577763915061951
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00029625473789872923,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11179775.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00525702815502882,
+ "skip_count": 1.0,
+ "step": 6932,
+ "text_loss": 0.5860039591789246
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.55415321397123,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.000295972123550323,
+ "loss": 0.005,
+ "macro_f1": 1.0,
+ "num_tokens": 11183262.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0048187971115112305,
+ "skip_count": 2.0,
+ "step": 6934,
+ "text_loss": 0.7328732013702393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.016357421875,
+ "learning_rate": 0.00029568958738578364,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11186591.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015159632312133908,
+ "skip_count": 0.0,
+ "step": 6936,
+ "text_loss": 0.40563541650772095
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 32.57293806868213,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.017333984375,
+ "learning_rate": 0.0002954071295133801,
+ "loss": 0.005,
+ "macro_f1": 1.0,
+ "num_tokens": 11190056.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011282073333859444,
+ "skip_count": 1.0,
+ "step": 6938,
+ "text_loss": 0.15986496210098267
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 0.0002951247500413504,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 11193504.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.010220487602055073,
+ "skip_count": 5.0,
+ "step": 6940,
+ "text_loss": 0.2604432702064514
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0002948424490779029,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11196725.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002620660001412034,
+ "skip_count": 1.0,
+ "step": 6942,
+ "text_loss": 0.48028868436813354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00029456022673121597,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11199303.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00042651945841498673,
+ "skip_count": 0.0,
+ "step": 6944,
+ "text_loss": 0.5135554671287537
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.0002942780831094377,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11202319.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005366047378629446,
+ "skip_count": 2.0,
+ "step": 6946,
+ "text_loss": 0.2809196710586548
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0002939960183206861,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11205622.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033479216508567333,
+ "skip_count": 0.0,
+ "step": 6948,
+ "text_loss": 0.2013140618801117
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.629292632814796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00029371403247304887,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11208637.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013508419506251812,
+ "skip_count": 0.0,
+ "step": 6950,
+ "text_loss": 0.4427332580089569
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0002934321256745833,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11211618.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020944071002304554,
+ "skip_count": 0.0,
+ "step": 6952,
+ "text_loss": 0.5406652688980103
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.00029315029803331704,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11214432.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012655078899115324,
+ "skip_count": 0.0,
+ "step": 6954,
+ "text_loss": 0.7720552086830139
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.00029286854965724686,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11218127.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009041395038366318,
+ "skip_count": 0.0,
+ "step": 6956,
+ "text_loss": 0.258109986782074
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 32.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0002925868806543391,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 11221440.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0034558263141661882,
+ "skip_count": 1.0,
+ "step": 6958,
+ "text_loss": 0.5378029942512512
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.00029230529113253,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11225391.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005263930186629295,
+ "skip_count": 2.0,
+ "step": 6960,
+ "text_loss": 0.3616539537906647
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.685647196947464,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0002920237811997251,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11228648.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003730480559170246,
+ "skip_count": 1.0,
+ "step": 6962,
+ "text_loss": 0.46682238578796387
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043701171875,
+ "learning_rate": 0.00029174235096379963,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11231828.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004831735976040363,
+ "skip_count": 1.0,
+ "step": 6964,
+ "text_loss": 0.5718355178833008
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 32.70443205165835,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.046875,
+ "learning_rate": 0.0002914610005325981,
+ "loss": 0.0102,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 11234984.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03880132734775543,
+ "skip_count": 2.0,
+ "step": 6966,
+ "text_loss": 0.3139013946056366
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0002911797300139345,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11239153.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006673726020380855,
+ "skip_count": 0.0,
+ "step": 6968,
+ "text_loss": 0.6040399074554443
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00029089853951559235,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11242178.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0028971200808882713,
+ "skip_count": 0.0,
+ "step": 6970,
+ "text_loss": 0.304967999458313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.73260933372468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.00029061742914532427,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11245865.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010410466929897666,
+ "skip_count": 0.0,
+ "step": 6972,
+ "text_loss": 0.47892290353775024
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.0002903363990108524,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11248806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002133697969838977,
+ "skip_count": 0.0,
+ "step": 6974,
+ "text_loss": 0.2561415433883667
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 32.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.0002900554492198677,
+ "loss": 0.011,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11251807.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.002402493730187416,
+ "skip_count": 0.0,
+ "step": 6976,
+ "text_loss": 0.652428388595581
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.0002897745798800311,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11254615.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006423915736377239,
+ "skip_count": 0.0,
+ "step": 6978,
+ "text_loss": 0.22414511442184448
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.77017904314646,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.000289493791098972,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11257721.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002536606043577194,
+ "skip_count": 0.0,
+ "step": 6980,
+ "text_loss": 0.1328018754720688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.00028921308298428933,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11260840.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000745086173992604,
+ "skip_count": 0.0,
+ "step": 6982,
+ "text_loss": 0.61724853515625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 0.0002889324556435509,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11264279.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005258981604129076,
+ "skip_count": 0.0,
+ "step": 6984,
+ "text_loss": 0.1664455235004425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.00028865190918429356,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11268096.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008756023598834872,
+ "skip_count": 0.0,
+ "step": 6986,
+ "text_loss": 0.45111921429634094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.00028837144371402336,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11270611.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008175788098014891,
+ "skip_count": 0.0,
+ "step": 6988,
+ "text_loss": 0.5332239270210266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.81714117992369,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.00028809105934021517,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11273826.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003494064789265394,
+ "skip_count": 0.0,
+ "step": 6990,
+ "text_loss": 0.20264241099357605
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.82653360727913,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0002878107561703127,
+ "loss": 0.0056,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 11276917.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.025257345288991928,
+ "skip_count": 3.0,
+ "step": 6992,
+ "text_loss": 0.18000070750713348
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.835926034634575,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.0002875305343117289,
+ "loss": 0.0044,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 11279637.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.019206687808036804,
+ "skip_count": 1.0,
+ "step": 6994,
+ "text_loss": 0.5872798562049866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.00028725039387184504,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11282717.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009358765557408333,
+ "skip_count": 1.0,
+ "step": 6996,
+ "text_loss": 0.3412095904350281
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 32.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.00028697033495801163,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 11285433.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038775671273469925,
+ "skip_count": 1.0,
+ "step": 6998,
+ "text_loss": 0.4316727817058563
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 32.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0002866903576775475,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11288414.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004292591474950314,
+ "skip_count": 0.0,
+ "step": 7000,
+ "text_loss": 0.45106515288352966
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.873495744056356,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046875,
+ "learning_rate": 0.0002864104621377409,
+ "loss": 0.007,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 11291811.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02195967361330986,
+ "skip_count": 2.0,
+ "step": 7002,
+ "text_loss": 0.29841285943984985
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.0002861306484458481,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11295179.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010119527578353882,
+ "skip_count": 0.0,
+ "step": 7004,
+ "text_loss": 0.5218569040298462
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.00028585091670909436,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11298182.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002615996403619647,
+ "skip_count": 0.0,
+ "step": 7006,
+ "text_loss": 0.20382621884346008
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00028557126703467316,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11301262.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002726050792261958,
+ "skip_count": 0.0,
+ "step": 7008,
+ "text_loss": 0.26718559861183167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0002852916995297471,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11304590.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005590448854491115,
+ "skip_count": 0.0,
+ "step": 7010,
+ "text_loss": 0.5392091274261475
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.92045788083358,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00028501221430144667,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11307690.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004541353322565556,
+ "skip_count": 2.0,
+ "step": 7012,
+ "text_loss": 0.16159705817699432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 32.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.00028473281145687137,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11310866.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029630991630256176,
+ "skip_count": 1.0,
+ "step": 7014,
+ "text_loss": 0.9148072600364685
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 28.0,
+ "epoch": 32.93924273554447,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0002844534911030888,
+ "loss": 0.0067,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 11314517.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.023258809000253677,
+ "skip_count": 3.0,
+ "step": 7016,
+ "text_loss": 0.3853590488433838
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.94863516289991,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060546875,
+ "learning_rate": 0.000284174253347135,
+ "loss": 0.0064,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 11317526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010060093365609646,
+ "skip_count": 1.0,
+ "step": 7018,
+ "text_loss": 0.3412325382232666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00028389509829601444,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11321684.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016713893273845315,
+ "skip_count": 0.0,
+ "step": 7020,
+ "text_loss": 0.9049796462059021
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00028361602605670003,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11324709.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004167001228779554,
+ "skip_count": 2.0,
+ "step": 7022,
+ "text_loss": 0.24364058673381805
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 32.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00028333703673613224,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11327449.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027954576071351767,
+ "skip_count": 4.0,
+ "step": 7024,
+ "text_loss": 0.2872125506401062
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 32.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.00028305813044122096,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11330846.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004644687287509441,
+ "skip_count": 0.0,
+ "step": 7026,
+ "text_loss": 0.1717570424079895
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 32.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06884765625,
+ "learning_rate": 0.00028277930727884336,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11333575.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00557848671451211,
+ "skip_count": 2.0,
+ "step": 7028,
+ "text_loss": 0.3501792550086975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00028250056735584496,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11336899.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005694970604963601,
+ "skip_count": 0.0,
+ "step": 7030,
+ "text_loss": 0.5541794300079346
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00028222191077903946,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11340163.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032896639313548803,
+ "skip_count": 0.0,
+ "step": 7032,
+ "text_loss": 0.5618721842765808
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 33.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00028194333765520853,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11343494.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005377276800572872,
+ "skip_count": 0.0,
+ "step": 7034,
+ "text_loss": 0.325153648853302
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.00028166484809110206,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11346126.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001204605447128415,
+ "skip_count": 0.0,
+ "step": 7036,
+ "text_loss": 0.5016651749610901
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.00028138644219343736,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11348879.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005026837810873985,
+ "skip_count": 2.0,
+ "step": 7038,
+ "text_loss": 0.2430499643087387
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.05165835045494,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00028110812006890064,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11352457.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019850607495754957,
+ "skip_count": 0.0,
+ "step": 7040,
+ "text_loss": 0.42376917600631714
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.061050777810394,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0179443359375,
+ "learning_rate": 0.00028082988182414524,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 11356602.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003362950636073947,
+ "skip_count": 2.0,
+ "step": 7042,
+ "text_loss": 0.4165397882461548
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.0002805517275657926,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11359451.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019725612364709377,
+ "skip_count": 1.0,
+ "step": 7044,
+ "text_loss": 0.5597621202468872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.0002802736574004319,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11363614.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013963640667498112,
+ "skip_count": 0.0,
+ "step": 7046,
+ "text_loss": 0.6112356185913086
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.00027999567143462015,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11367015.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005658161826431751,
+ "skip_count": 0.0,
+ "step": 7048,
+ "text_loss": 0.4920886754989624
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 33.09862048723217,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00027971776977488193,
+ "loss": 0.0064,
+ "macro_f1": 0.925203263759613,
+ "num_tokens": 11370489.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.03657131269574165,
+ "skip_count": 5.0,
+ "step": 7050,
+ "text_loss": 0.28003939986228943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01708984375,
+ "learning_rate": 0.00027943995252771017,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11373614.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004096088465303183,
+ "skip_count": 2.0,
+ "step": 7052,
+ "text_loss": 0.3145081400871277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.117405341943055,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.00027916221979956457,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11377631.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009888096246868372,
+ "skip_count": 0.0,
+ "step": 7054,
+ "text_loss": 0.4898056983947754
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.126797769298506,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.00027888457169687297,
+ "loss": 0.0065,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 11380620.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013347696512937546,
+ "skip_count": 1.0,
+ "step": 7056,
+ "text_loss": 0.7011964917182922
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.00027860700832603056,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11383297.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000849733711220324,
+ "skip_count": 1.0,
+ "step": 7058,
+ "text_loss": 0.4007014334201813
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.0002783295297934003,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11386460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001546313869766891,
+ "skip_count": 1.0,
+ "step": 7060,
+ "text_loss": 0.3992713689804077
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0002780521362053123,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11389605.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001045585609972477,
+ "skip_count": 0.0,
+ "step": 7062,
+ "text_loss": 0.4440680146217346
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 33.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.00027777482766806446,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11392105.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00752411549910903,
+ "skip_count": 0.0,
+ "step": 7064,
+ "text_loss": 0.20152349770069122
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 33.17375990607572,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0002774976042879218,
+ "loss": 0.0088,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 11396142.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019917849451303482,
+ "skip_count": 3.0,
+ "step": 7066,
+ "text_loss": 0.24365149438381195
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 33.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.00027722046617111696,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11398827.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015933843096718192,
+ "skip_count": 0.0,
+ "step": 7068,
+ "text_loss": 0.31948477029800415
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.19254476078662,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.00027694341342384977,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11402623.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018986845389008522,
+ "skip_count": 2.0,
+ "step": 7070,
+ "text_loss": 0.47721394896507263
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.00027666644615228727,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11405628.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002975719515234232,
+ "skip_count": 1.0,
+ "step": 7072,
+ "text_loss": 0.3972358703613281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0002763895644625637,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11409468.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005657708737999201,
+ "skip_count": 1.0,
+ "step": 7074,
+ "text_loss": 0.6004229187965393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0002761127684607811,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11412572.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038351903203874826,
+ "skip_count": 2.0,
+ "step": 7076,
+ "text_loss": 1.0837591886520386
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 33.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.00027583605825300795,
+ "loss": 0.0056,
+ "macro_f1": 1.0,
+ "num_tokens": 11416831.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005529445596039295,
+ "skip_count": 2.0,
+ "step": 7078,
+ "text_loss": 0.575986921787262
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.00027555943394528014,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11420557.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006243749521672726,
+ "skip_count": 0.0,
+ "step": 7080,
+ "text_loss": 0.606263279914856
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.248899324919286,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00027528289564360064,
+ "loss": 0.0058,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 11423471.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.031515009701251984,
+ "skip_count": 1.0,
+ "step": 7082,
+ "text_loss": 0.19393208622932434
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0002750064434539394,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11426732.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005052287015132606,
+ "skip_count": 0.0,
+ "step": 7084,
+ "text_loss": 0.7202399969100952
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00027473007748223357,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11429391.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005099403206259012,
+ "skip_count": 1.0,
+ "step": 7086,
+ "text_loss": 0.20651355385780334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00027445379783438685,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11432161.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001447655027732253,
+ "skip_count": 0.0,
+ "step": 7088,
+ "text_loss": 0.34758952260017395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.28646903434106,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.00027417760461627037,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11435417.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000808655982837081,
+ "skip_count": 0.0,
+ "step": 7090,
+ "text_loss": 0.7414838671684265
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.295861461696504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.00027390149793372177,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11438313.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005151710007339716,
+ "skip_count": 0.0,
+ "step": 7092,
+ "text_loss": 0.17792417109012604
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.00027362547789254574,
+ "loss": 0.0064,
+ "macro_f1": 1.0,
+ "num_tokens": 11441681.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037353152874857187,
+ "skip_count": 3.0,
+ "step": 7094,
+ "text_loss": 0.5577781796455383
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 0.0002733495445985135,
+ "loss": 0.0026,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11444521.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00038075417978689075,
+ "skip_count": 0.0,
+ "step": 7096,
+ "text_loss": 0.5052862167358398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.32403874376284,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.0002730736981573632,
+ "loss": 0.0033,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 11448481.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007313522044569254,
+ "skip_count": 1.0,
+ "step": 7098,
+ "text_loss": 0.5869139432907104
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0002727979386748001,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11452164.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020673887338489294,
+ "skip_count": 0.0,
+ "step": 7100,
+ "text_loss": 0.4354212284088135
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0002725222662564954,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11455995.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008315460290759802,
+ "skip_count": 0.0,
+ "step": 7102,
+ "text_loss": 0.8714128732681274
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 33.35221602582917,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0002722466810080874,
+ "loss": 0.0053,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 11458828.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010913078673183918,
+ "skip_count": 1.0,
+ "step": 7104,
+ "text_loss": 0.6226683855056763
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.36160845318462,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0002719711830351809,
+ "loss": 0.0076,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 11462448.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.040428292006254196,
+ "skip_count": 1.0,
+ "step": 7106,
+ "text_loss": 0.2543688118457794
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00027169577244334726,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11465796.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004473939072340727,
+ "skip_count": 1.0,
+ "step": 7108,
+ "text_loss": 0.12356872111558914
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 0.00027142044933812424,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11469176.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017961655976250768,
+ "skip_count": 0.0,
+ "step": 7110,
+ "text_loss": 0.6800211668014526
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 33.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 0.0002711452138250162,
+ "loss": 0.0065,
+ "macro_f1": 1.0,
+ "num_tokens": 11471983.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.003279087832197547,
+ "skip_count": 2.0,
+ "step": 7112,
+ "text_loss": 0.340279757976532
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.3991781626064,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.00027087006600949403,
+ "loss": 0.0065,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 11475656.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017024178057909012,
+ "skip_count": 1.0,
+ "step": 7114,
+ "text_loss": 0.3556337058544159
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0002705950059969948,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11479410.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015487123280763626,
+ "skip_count": 1.0,
+ "step": 7116,
+ "text_loss": 0.4404350817203522
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.00027032003389292194,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11483302.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011217560386285186,
+ "skip_count": 0.0,
+ "step": 7118,
+ "text_loss": 0.46771445870399475
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.427355444672735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.0002700451498026454,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11486212.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010832607513293624,
+ "skip_count": 0.0,
+ "step": 7120,
+ "text_loss": 0.6795281767845154
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00026977035383150106,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11489320.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002290027216076851,
+ "skip_count": 1.0,
+ "step": 7122,
+ "text_loss": 0.5304523706436157
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 33.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.00026949564608479164,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 11492056.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009950211271643639,
+ "skip_count": 6.0,
+ "step": 7124,
+ "text_loss": 0.21328973770141602
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 33.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0185546875,
+ "learning_rate": 0.0002692210266677855,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11495165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0079165268689394,
+ "skip_count": 3.0,
+ "step": 7126,
+ "text_loss": 0.19840657711029053
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.00026894649568571724,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11497636.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013852717820554972,
+ "skip_count": 0.0,
+ "step": 7128,
+ "text_loss": 0.3360055088996887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.00026867205324378776,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11500806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010151927126571536,
+ "skip_count": 0.0,
+ "step": 7130,
+ "text_loss": 0.6827390193939209
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.00026839769944716373,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11504187.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001110393786802888,
+ "skip_count": 0.0,
+ "step": 7132,
+ "text_loss": 0.5081584453582764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.49310243616085,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0002681234344009783,
+ "loss": 0.0071,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 11507900.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010587670840322971,
+ "skip_count": 1.0,
+ "step": 7134,
+ "text_loss": 0.28684356808662415
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00026784925821033014,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11510627.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006658690981566906,
+ "skip_count": 0.0,
+ "step": 7136,
+ "text_loss": 0.24232104420661926
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.00026757517098028417,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11513304.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014556109672412276,
+ "skip_count": 0.0,
+ "step": 7138,
+ "text_loss": 0.4718358516693115
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 33.52127971822718,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00026730117281587116,
+ "loss": 0.0062,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 11516593.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01590067707002163,
+ "skip_count": 3.0,
+ "step": 7140,
+ "text_loss": 0.2810344696044922
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.53067214558262,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00026702726382208774,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11519776.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014479428064078093,
+ "skip_count": 0.0,
+ "step": 7142,
+ "text_loss": 0.48876339197158813
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00026675344410389623,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11522499.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003729258431121707,
+ "skip_count": 2.0,
+ "step": 7144,
+ "text_loss": 0.5350890755653381
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 33.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.0002664797137662248,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 11525220.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015156447188928723,
+ "skip_count": 1.0,
+ "step": 7146,
+ "text_loss": 0.5742373466491699
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 33.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.00026620607291396773,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 11527926.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.004842780064791441,
+ "skip_count": 2.0,
+ "step": 7148,
+ "text_loss": 0.4994547665119171
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.5682418550044,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.00026593252165198455,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11531622.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026556351222097874,
+ "skip_count": 0.0,
+ "step": 7150,
+ "text_loss": 0.1567893922328949
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.00026565906008510064,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11535191.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008135059848427773,
+ "skip_count": 1.0,
+ "step": 7152,
+ "text_loss": 0.289173424243927
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 33.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.000265385688318107,
+ "loss": 0.0083,
+ "macro_f1": 1.0,
+ "num_tokens": 11539060.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0020754633005708456,
+ "skip_count": 1.0,
+ "step": 7154,
+ "text_loss": 0.35089045763015747
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 33.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.0002651124064557602,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11541662.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0023738413583487272,
+ "skip_count": 0.0,
+ "step": 7156,
+ "text_loss": 0.5026801228523254
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.00026483921460278227,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11544763.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003311366541311145,
+ "skip_count": 1.0,
+ "step": 7158,
+ "text_loss": 0.22975654900074005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.61520399178163,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.0002645661128638609,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11547649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008209354127757251,
+ "skip_count": 0.0,
+ "step": 7160,
+ "text_loss": 0.32840636372566223
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.00026429310134364926,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11550648.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028574815951287746,
+ "skip_count": 0.0,
+ "step": 7162,
+ "text_loss": 0.23239612579345703
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0177001953125,
+ "learning_rate": 0.00026402018014676584,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11553790.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005469404626637697,
+ "skip_count": 1.0,
+ "step": 7164,
+ "text_loss": 0.22877025604248047
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.0002637473493777943,
+ "loss": 0.0046,
+ "macro_f1": 1.0,
+ "num_tokens": 11556802.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0032242932356894016,
+ "skip_count": 2.0,
+ "step": 7166,
+ "text_loss": 0.6376226544380188
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.65277370120341,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.00026347460914128443,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 11559607.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0040627880953252316,
+ "skip_count": 2.0,
+ "step": 7168,
+ "text_loss": 0.6879657506942749
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.00026320195954175043,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 11562677.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.020494163036346436,
+ "skip_count": 4.0,
+ "step": 7170,
+ "text_loss": 0.3710069954395294
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06787109375,
+ "learning_rate": 0.00026292940068367224,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11565948.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002662271959707141,
+ "skip_count": 0.0,
+ "step": 7172,
+ "text_loss": 0.15041157603263855
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00026265693267149494,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11568836.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0039914860390126705,
+ "skip_count": 1.0,
+ "step": 7174,
+ "text_loss": 0.5372130870819092
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 0.00026238455560962884,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11572542.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034708199091255665,
+ "skip_count": 0.0,
+ "step": 7176,
+ "text_loss": 0.2956286072731018
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.699735837980626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 0.00026211226960244914,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11575352.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007794995326548815,
+ "skip_count": 2.0,
+ "step": 7178,
+ "text_loss": 0.3691073954105377
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.0002618400747542964,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11579110.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009694626205600798,
+ "skip_count": 0.0,
+ "step": 7180,
+ "text_loss": 0.6523211598396301
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.71852069269152,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0002615679711694764,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11582476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004227840341627598,
+ "skip_count": 1.0,
+ "step": 7182,
+ "text_loss": 0.1997286081314087
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.72791312004696,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 0.00026129595895225965,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11585685.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00126146269030869,
+ "skip_count": 0.0,
+ "step": 7184,
+ "text_loss": 0.486299604177475
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 33.73730554740241,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.0002610240382068818,
+ "loss": 0.006,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 11588804.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04553814232349396,
+ "skip_count": 4.0,
+ "step": 7186,
+ "text_loss": 0.1622236669063568
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 0.00026075220903754324,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11591822.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002460496500134468,
+ "skip_count": 2.0,
+ "step": 7188,
+ "text_loss": 0.5573232173919678
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0002604804715484095,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11594899.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006854622159153223,
+ "skip_count": 1.0,
+ "step": 7190,
+ "text_loss": 0.4753095507621765
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.00026020882584361094,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11598333.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001945660449564457,
+ "skip_count": 1.0,
+ "step": 7192,
+ "text_loss": 0.8912903666496277
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 31.0,
+ "epoch": 33.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061767578125,
+ "learning_rate": 0.0002599372720272426,
+ "loss": 0.0064,
+ "macro_f1": 1.0,
+ "num_tokens": 11601814.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.005749753676354885,
+ "skip_count": 1.0,
+ "step": 7194,
+ "text_loss": 0.6041871905326843
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0002596658102033643,
+ "loss": 0.0097,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11604661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025942171923816204,
+ "skip_count": 1.0,
+ "step": 7196,
+ "text_loss": 0.4760607182979584
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 33.793660111535075,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00025939444047600114,
+ "loss": 0.0075,
+ "macro_f1": 0.8807588815689087,
+ "num_tokens": 11608459.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.020141327753663063,
+ "skip_count": 6.0,
+ "step": 7198,
+ "text_loss": 0.6670252084732056
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0002591231629491423,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11611489.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005721202120184898,
+ "skip_count": 1.0,
+ "step": 7200,
+ "text_loss": 0.31318753957748413
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00025885197772674174,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11615234.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027279339265078306,
+ "skip_count": 1.0,
+ "step": 7202,
+ "text_loss": 0.25728851556777954
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00025858088491271825,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11618892.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006987092201597989,
+ "skip_count": 0.0,
+ "step": 7204,
+ "text_loss": 0.5504243969917297
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00025830988461095504,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11622237.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029056845232844353,
+ "skip_count": 0.0,
+ "step": 7206,
+ "text_loss": 0.5319080948829651
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.8406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 0.0002580389769253001,
+ "loss": 0.0041,
+ "macro_f1": 1.0,
+ "num_tokens": 11624713.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.007346974220126867,
+ "skip_count": 5.0,
+ "step": 7208,
+ "text_loss": 0.8925374746322632
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 0.0002577681619595655,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11628689.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004166684520896524,
+ "skip_count": 0.0,
+ "step": 7210,
+ "text_loss": 0.37282413244247437
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.00025749743981752824,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11631581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013194780796766281,
+ "skip_count": 2.0,
+ "step": 7212,
+ "text_loss": 0.220115065574646
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.0002572268106029295,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11634503.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009112557163462043,
+ "skip_count": 0.0,
+ "step": 7214,
+ "text_loss": 0.5631879568099976
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.878191957734074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.00025695627441947496,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11637790.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011178883723914623,
+ "skip_count": 2.0,
+ "step": 7216,
+ "text_loss": 0.24482154846191406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.887584385089525,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00025668583137083447,
+ "loss": 0.0047,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 11640806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01877705194056034,
+ "skip_count": 2.0,
+ "step": 7218,
+ "text_loss": 0.2229214459657669
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0002564154815606422,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11644479.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030277224723249674,
+ "skip_count": 0.0,
+ "step": 7220,
+ "text_loss": 0.6025711894035339
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.00025614522509249715,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11647340.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002354414900764823,
+ "skip_count": 1.0,
+ "step": 7222,
+ "text_loss": 0.6497155427932739
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0002558750620699618,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 11650433.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009801039472222328,
+ "skip_count": 2.0,
+ "step": 7224,
+ "text_loss": 0.32049307227134705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.0002556049925965632,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11654451.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002949854824692011,
+ "skip_count": 0.0,
+ "step": 7226,
+ "text_loss": 0.17923395335674286
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 33.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00025533501677579254,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 11657440.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0032915703486651182,
+ "skip_count": 1.0,
+ "step": 7228,
+ "text_loss": 0.60064297914505
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 33.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.0002550651347111049,
+ "loss": 0.0046,
+ "macro_f1": 1.0,
+ "num_tokens": 11660599.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00594533933326602,
+ "skip_count": 1.0,
+ "step": 7230,
+ "text_loss": 0.32829397916793823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 33.95333137657764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00025479534650591976,
+ "loss": 0.0032,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11663387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014214308466762304,
+ "skip_count": 0.0,
+ "step": 7232,
+ "text_loss": 0.7317177653312683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 33.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 0.00025452565226362036,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11666729.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0056374757550656796,
+ "skip_count": 2.0,
+ "step": 7234,
+ "text_loss": 0.3394623398780823
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 33.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0152587890625,
+ "learning_rate": 0.00025425605208755406,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11669871.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006422565318644047,
+ "skip_count": 3.0,
+ "step": 7236,
+ "text_loss": 0.1725512444972992
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 33.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0002539865460810322,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11673008.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0023537934757769108,
+ "skip_count": 0.0,
+ "step": 7238,
+ "text_loss": 0.8873519897460938
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 33.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.00025371713434733,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11675988.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026300614699721336,
+ "skip_count": 1.0,
+ "step": 7240,
+ "text_loss": 0.4877084195613861
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 34.0,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0002534478169896864,
+ "loss": 0.0052,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 11679068.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.019549336284399033,
+ "skip_count": 3.0,
+ "step": 7242,
+ "text_loss": 0.15101417899131775
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.00939242735544,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0002531785941113044,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11682205.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007769173942506313,
+ "skip_count": 1.0,
+ "step": 7244,
+ "text_loss": 0.4035153090953827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0002529094658153508,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11685162.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003636054927483201,
+ "skip_count": 0.0,
+ "step": 7246,
+ "text_loss": 0.21048080921173096
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048583984375,
+ "learning_rate": 0.00025264043220495606,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11688512.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013363865436986089,
+ "skip_count": 0.0,
+ "step": 7248,
+ "text_loss": 0.6582038402557373
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00025237149338321437,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11691753.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005587349878624082,
+ "skip_count": 0.0,
+ "step": 7250,
+ "text_loss": 0.6899203658103943
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0002521026494531835,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11694689.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006221035961061716,
+ "skip_count": 0.0,
+ "step": 7252,
+ "text_loss": 0.17377600073814392
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.000251833900517885,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11697950.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004368607886135578,
+ "skip_count": 1.0,
+ "step": 7254,
+ "text_loss": 0.4147649109363556
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.000251565246680304,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11701214.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038269520737230778,
+ "skip_count": 2.0,
+ "step": 7256,
+ "text_loss": 0.42076823115348816
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.00025129668804338906,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11703935.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011755652958527207,
+ "skip_count": 0.0,
+ "step": 7258,
+ "text_loss": 0.5484340190887451
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.00025102822471005247,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 11706818.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00735129788517952,
+ "skip_count": 2.0,
+ "step": 7260,
+ "text_loss": 0.29214802384376526
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.09392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00025075985678316983,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11709979.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0011552777141332626,
+ "skip_count": 0.0,
+ "step": 7262,
+ "text_loss": 0.6514551639556885
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 34.10331670090989,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0002504915843655802,
+ "loss": 0.0067,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 11714075.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01438678614795208,
+ "skip_count": 4.0,
+ "step": 7264,
+ "text_loss": 0.5144859552383423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0002502234075600862,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11717610.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027831171173602343,
+ "skip_count": 0.0,
+ "step": 7266,
+ "text_loss": 0.6494308114051819
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00024995532646945336,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11721415.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012327058939263225,
+ "skip_count": 0.0,
+ "step": 7268,
+ "text_loss": 0.5111991763114929
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 34.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.0002496873411964113,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 11724488.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.003060065908357501,
+ "skip_count": 1.0,
+ "step": 7270,
+ "text_loss": 0.5780492424964905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.0002494194518436523,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11727708.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001369593315757811,
+ "skip_count": 0.0,
+ "step": 7272,
+ "text_loss": 0.3151950240135193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.00024915165851383203,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11730897.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005724756047129631,
+ "skip_count": 0.0,
+ "step": 7274,
+ "text_loss": 0.5267965197563171
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.00024888396130956947,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11733870.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010036137886345387,
+ "skip_count": 0.0,
+ "step": 7276,
+ "text_loss": 0.5330777168273926
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00024861636033344657,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11737413.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008341848850250244,
+ "skip_count": 2.0,
+ "step": 7278,
+ "text_loss": 0.25949522852897644
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.0002483488556880087,
+ "loss": 0.0061,
+ "macro_f1": 1.0,
+ "num_tokens": 11740691.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008208763785660267,
+ "skip_count": 2.0,
+ "step": 7280,
+ "text_loss": 0.1867891401052475
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.000248081447475764,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11743715.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038434381131082773,
+ "skip_count": 0.0,
+ "step": 7282,
+ "text_loss": 0.4835410416126251
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0002478141357991838,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11746818.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019067893736064434,
+ "skip_count": 0.0,
+ "step": 7284,
+ "text_loss": 0.5959038734436035
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.00024754692076070256,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11750160.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007199060171842575,
+ "skip_count": 0.0,
+ "step": 7286,
+ "text_loss": 0.5068115592002869
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.21602582917523,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.0002472798024627175,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11752836.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014214382972568274,
+ "skip_count": 0.0,
+ "step": 7288,
+ "text_loss": 0.5742631554603577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0002470127810075889,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11756276.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018025166355073452,
+ "skip_count": 0.0,
+ "step": 7290,
+ "text_loss": 0.6616888642311096
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.00024674585649763983,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11760235.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024077212437987328,
+ "skip_count": 0.0,
+ "step": 7292,
+ "text_loss": 0.7984768748283386
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 0.00024647902903515614,
+ "loss": 0.009,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11763430.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007843999192118645,
+ "skip_count": 1.0,
+ "step": 7294,
+ "text_loss": 0.1943647861480713
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0002462122987223869,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11766583.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019727738108485937,
+ "skip_count": 0.0,
+ "step": 7296,
+ "text_loss": 0.43924200534820557
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6000000238418579,
+ "avg_layers": 27.0,
+ "epoch": 34.26298796595245,
+ "f1_execute": 0.9545454382896423,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.75,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.0002459456656615436,
+ "loss": 0.0069,
+ "macro_f1": 0.9015151858329773,
+ "num_tokens": 11770360.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04594529792666435,
+ "skip_count": 5.0,
+ "step": 7298,
+ "text_loss": 0.32582250237464905
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.0002456791299548004,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11773239.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0011880286037921906,
+ "skip_count": 0.0,
+ "step": 7300,
+ "text_loss": 0.7723727226257324
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.00024541269170429435,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11776945.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010577787179499865,
+ "skip_count": 0.0,
+ "step": 7302,
+ "text_loss": 0.8173839449882507
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0002451463510121252,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11780121.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019757342524826527,
+ "skip_count": 0.0,
+ "step": 7304,
+ "text_loss": 0.4015064239501953
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.000244880107980355,
+ "loss": 0.0106,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11783172.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002577328821644187,
+ "skip_count": 0.0,
+ "step": 7306,
+ "text_loss": 0.5465171933174133
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 0.00024461396271100876,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11788608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004162502940744162,
+ "skip_count": 0.0,
+ "step": 7308,
+ "text_loss": 0.2419646978378296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.0002443479153060735,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11791912.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003301614662632346,
+ "skip_count": 0.0,
+ "step": 7310,
+ "text_loss": 0.2568489909172058
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.00024408196586749964,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11794849.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019893983844667673,
+ "skip_count": 0.0,
+ "step": 7312,
+ "text_loss": 0.7044196128845215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0002438161144971992,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11797587.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006637922488152981,
+ "skip_count": 1.0,
+ "step": 7314,
+ "text_loss": 0.6863232254981995
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.000243550361297047,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11800173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003078785724937916,
+ "skip_count": 2.0,
+ "step": 7316,
+ "text_loss": 0.2868897616863251
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.00024328470636888005,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11802889.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011882453691214323,
+ "skip_count": 0.0,
+ "step": 7318,
+ "text_loss": 0.5522798299789429
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0002430191498144979,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11805607.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008720619371160865,
+ "skip_count": 0.0,
+ "step": 7320,
+ "text_loss": 0.5531370639801025
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.00024275369173566236,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11808838.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003213440766558051,
+ "skip_count": 0.0,
+ "step": 7322,
+ "text_loss": 0.5252627730369568
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.385089521573235,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.00024248833223409715,
+ "loss": 0.0102,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11811965.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004736232105642557,
+ "skip_count": 1.0,
+ "step": 7324,
+ "text_loss": 0.6033701300621033
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.00024222307141148907,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11814832.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007559265359304845,
+ "skip_count": 0.0,
+ "step": 7326,
+ "text_loss": 0.5607737302780151
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.00024195790936948626,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11818802.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005338212475180626,
+ "skip_count": 2.0,
+ "step": 7328,
+ "text_loss": 0.20618735253810883
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 34.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0002416928462096994,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11821998.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001919696107506752,
+ "skip_count": 3.0,
+ "step": 7330,
+ "text_loss": 0.42486369609832764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.00024142788203370107,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11824505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013797834981232882,
+ "skip_count": 0.0,
+ "step": 7332,
+ "text_loss": 0.48403388261795044
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.43205165835045,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00024116301694302621,
+ "loss": 0.0053,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 11828504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008978237397968769,
+ "skip_count": 1.0,
+ "step": 7334,
+ "text_loss": 0.43872755765914917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.441444085705896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.00024089825103917152,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11831171.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004589964635670185,
+ "skip_count": 1.0,
+ "step": 7336,
+ "text_loss": 0.5126842260360718
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.00024063358442359572,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11834387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002857893006876111,
+ "skip_count": 0.0,
+ "step": 7338,
+ "text_loss": 0.7521272301673889
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.0002403690171977197,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11838693.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009023012826219201,
+ "skip_count": 0.0,
+ "step": 7340,
+ "text_loss": 0.6335242390632629
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00024010454946292586,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11841882.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010992717929184437,
+ "skip_count": 0.0,
+ "step": 7342,
+ "text_loss": 0.64045649766922
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.47901379512768,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0002398401813205592,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11845181.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002247930970042944,
+ "skip_count": 2.0,
+ "step": 7344,
+ "text_loss": 0.31022098660469055
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.00023957591287192577,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11848537.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003184020286425948,
+ "skip_count": 2.0,
+ "step": 7346,
+ "text_loss": 0.5709269642829895
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.00023931174421829376,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 11851437.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006582654081285,
+ "skip_count": 4.0,
+ "step": 7348,
+ "text_loss": 0.3547070026397705
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.00023904767546089318,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11854161.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0022124287206679583,
+ "skip_count": 0.0,
+ "step": 7350,
+ "text_loss": 0.6984702348709106
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.00023878370670091565,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11856811.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0029868825804442167,
+ "skip_count": 0.0,
+ "step": 7352,
+ "text_loss": 0.25389090180397034
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01708984375,
+ "learning_rate": 0.00023851983803951444,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11860110.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028468978125602007,
+ "skip_count": 1.0,
+ "step": 7354,
+ "text_loss": 0.5729252099990845
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.00023825606957780454,
+ "loss": 0.0041,
+ "macro_f1": 1.0,
+ "num_tokens": 11863058.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003115740604698658,
+ "skip_count": 2.0,
+ "step": 7356,
+ "text_loss": 0.60753333568573
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.00023799240141686258,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11865865.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022254586219787598,
+ "skip_count": 0.0,
+ "step": 7358,
+ "text_loss": 0.2568866014480591
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.55415321397123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.00023772883365772658,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11869133.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017388637643307447,
+ "skip_count": 0.0,
+ "step": 7360,
+ "text_loss": 0.7657097578048706
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00023746536640139633,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11872988.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002158832037821412,
+ "skip_count": 0.0,
+ "step": 7362,
+ "text_loss": 0.19717472791671753
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.57293806868213,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00023720199974883294,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11875810.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001037398586049676,
+ "skip_count": 0.0,
+ "step": 7364,
+ "text_loss": 0.47334593534469604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 34.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.00023693873380095876,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11878558.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011853457428514957,
+ "skip_count": 5.0,
+ "step": 7366,
+ "text_loss": 0.2567826211452484
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01806640625,
+ "learning_rate": 0.00023667556865865824,
+ "loss": 0.0034,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11881473.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015339091187343001,
+ "skip_count": 0.0,
+ "step": 7368,
+ "text_loss": 0.40981143712997437
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.00023641250442277655,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11885033.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010062574408948421,
+ "skip_count": 0.0,
+ "step": 7370,
+ "text_loss": 0.3183043301105499
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 0.00023614954119412042,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11889136.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010769609361886978,
+ "skip_count": 0.0,
+ "step": 7372,
+ "text_loss": 0.5279555916786194
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 34.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 0.00023588667907345785,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11893102.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032862431835383177,
+ "skip_count": 3.0,
+ "step": 7374,
+ "text_loss": 0.5425930023193359
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 34.629292632814796,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.00023562391816151808,
+ "loss": 0.0057,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 11895841.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02405562624335289,
+ "skip_count": 3.0,
+ "step": 7376,
+ "text_loss": 0.26054954528808594
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00023536125855899153,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 11899594.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008315852843225002,
+ "skip_count": 3.0,
+ "step": 7378,
+ "text_loss": 0.19068174064159393
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 34.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.00023509870036652998,
+ "loss": 0.0065,
+ "macro_f1": 1.0,
+ "num_tokens": 11902843.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006180883850902319,
+ "skip_count": 4.0,
+ "step": 7380,
+ "text_loss": 0.18461982905864716
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.00023483624368474614,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11905786.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008856299100443721,
+ "skip_count": 0.0,
+ "step": 7382,
+ "text_loss": 0.5216618180274963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.66686234223657,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00023457388861421397,
+ "loss": 0.0059,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 11908706.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04762765392661095,
+ "skip_count": 1.0,
+ "step": 7384,
+ "text_loss": 0.25329193472862244
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 34.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.00023431163525546833,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 11911862.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.000989250373095274,
+ "skip_count": 1.0,
+ "step": 7386,
+ "text_loss": 0.2657507658004761
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.685647196947464,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01708984375,
+ "learning_rate": 0.0002340494837090053,
+ "loss": 0.0032,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11915483.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008857969660311937,
+ "skip_count": 0.0,
+ "step": 7388,
+ "text_loss": 0.5136669874191284
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.00023378743407528164,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11918778.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041572838090360165,
+ "skip_count": 1.0,
+ "step": 7390,
+ "text_loss": 0.5212553143501282
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.00023352548645471556,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11921916.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010537431808188558,
+ "skip_count": 0.0,
+ "step": 7392,
+ "text_loss": 0.48122525215148926
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00023326364094768576,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11924273.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004077036865055561,
+ "skip_count": 0.0,
+ "step": 7394,
+ "text_loss": 0.2128690630197525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00023300189765453194,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11927424.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005371362902224064,
+ "skip_count": 2.0,
+ "step": 7396,
+ "text_loss": 0.19448284804821014
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.73260933372468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00023274025667555464,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11930919.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002137752715498209,
+ "skip_count": 0.0,
+ "step": 7398,
+ "text_loss": 0.7537064552307129
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 0.00023247871811101512,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11933680.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002398790093138814,
+ "skip_count": 0.0,
+ "step": 7400,
+ "text_loss": 0.5589297413825989
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.751394188435576,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 0.00023221728206113546,
+ "loss": 0.008,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 11937090.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019718777388334274,
+ "skip_count": 1.0,
+ "step": 7402,
+ "text_loss": 0.8014751672744751
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.0002319559486260985,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11940581.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001230534864589572,
+ "skip_count": 0.0,
+ "step": 7404,
+ "text_loss": 0.5218383073806763
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.77017904314646,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.0002316947179060477,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11943832.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016393321566283703,
+ "skip_count": 0.0,
+ "step": 7406,
+ "text_loss": 0.17122556269168854
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.00023143359000108704,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11947025.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005269679240882397,
+ "skip_count": 2.0,
+ "step": 7408,
+ "text_loss": 0.2015499323606491
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 34.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.00023117256501128136,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 11950077.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005140089895576239,
+ "skip_count": 2.0,
+ "step": 7410,
+ "text_loss": 0.39068636298179626
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.00023091164303665592,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11953800.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005578748416155577,
+ "skip_count": 0.0,
+ "step": 7412,
+ "text_loss": 0.18851874768733978
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.068359375,
+ "learning_rate": 0.00023065082417719624,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11956383.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006410991190932691,
+ "skip_count": 0.0,
+ "step": 7414,
+ "text_loss": 0.5663703083992004
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 34.81714117992369,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.0002303901085328491,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11959554.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005902954144403338,
+ "skip_count": 5.0,
+ "step": 7416,
+ "text_loss": 0.5225661993026733
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0002301294962035209,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11962582.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00045644037891179323,
+ "skip_count": 0.0,
+ "step": 7418,
+ "text_loss": 0.40572360157966614
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 0.0002298689872890789,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11965649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01017778366804123,
+ "skip_count": 2.0,
+ "step": 7420,
+ "text_loss": 0.12190715968608856
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.00022960858188935052,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11968850.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008010792662389576,
+ "skip_count": 0.0,
+ "step": 7422,
+ "text_loss": 0.5606820583343506
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0002293482801041236,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11972064.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001889281440526247,
+ "skip_count": 0.0,
+ "step": 7424,
+ "text_loss": 0.44142210483551025
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.00022908808203314635,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11975466.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00647713290527463,
+ "skip_count": 2.0,
+ "step": 7426,
+ "text_loss": 0.23273423314094543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.0002288279877761271,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11979875.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004027119372040033,
+ "skip_count": 0.0,
+ "step": 7428,
+ "text_loss": 0.5608086585998535
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.0002285679974327345,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11982808.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009015435934998095,
+ "skip_count": 0.0,
+ "step": 7430,
+ "text_loss": 0.3976539373397827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0002283081111025973,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11985978.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00047143330448307097,
+ "skip_count": 0.0,
+ "step": 7432,
+ "text_loss": 0.4280148446559906
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.00022804832888530447,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11988925.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004895820748060942,
+ "skip_count": 0.0,
+ "step": 7434,
+ "text_loss": 0.5137463808059692
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 0.000227788650880405,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11991631.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008349024574272335,
+ "skip_count": 0.0,
+ "step": 7436,
+ "text_loss": 0.4306720197200775
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.92045788083358,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00022752907718740807,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 11995476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038723985198885202,
+ "skip_count": 0.0,
+ "step": 7438,
+ "text_loss": 0.6413722038269043
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 34.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043701171875,
+ "learning_rate": 0.00022726960790578248,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 11998846.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004433541093021631,
+ "skip_count": 0.0,
+ "step": 7440,
+ "text_loss": 0.6424159407615662
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 23.0,
+ "epoch": 34.93924273554447,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.0002270102431349579,
+ "loss": 0.0062,
+ "macro_f1": 0.6289562582969666,
+ "num_tokens": 12002228.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023979803547263145,
+ "skip_count": 6.0,
+ "step": 7442,
+ "text_loss": 0.16657918691635132
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 34.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00022675098297432307,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 12005003.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.005645833443850279,
+ "skip_count": 1.0,
+ "step": 7444,
+ "text_loss": 0.6388722658157349
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00022649182752322705,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12007657.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001629356062039733,
+ "skip_count": 2.0,
+ "step": 7446,
+ "text_loss": 0.35670006275177
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 34.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00022623277688097864,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12010652.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006375396624207497,
+ "skip_count": 2.0,
+ "step": 7448,
+ "text_loss": 0.24273613095283508
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0002259738311468466,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12014042.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003734540194272995,
+ "skip_count": 0.0,
+ "step": 7450,
+ "text_loss": 0.4262580871582031
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 34.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.0002257149904200592,
+ "loss": 0.0076,
+ "macro_f1": 1.0,
+ "num_tokens": 12016987.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0027926203329116106,
+ "skip_count": 1.0,
+ "step": 7452,
+ "text_loss": 0.366216778755188
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 34.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.00022545625479980508,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12021584.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008985420572571456,
+ "skip_count": 0.0,
+ "step": 7454,
+ "text_loss": 0.533937394618988
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.00022519762438523205,
+ "loss": 0.0029,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12024142.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005394646432250738,
+ "skip_count": 1.0,
+ "step": 7456,
+ "text_loss": 0.2401239275932312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0002249390992754477,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12027262.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00275063537992537,
+ "skip_count": 0.0,
+ "step": 7458,
+ "text_loss": 0.21824975311756134
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00022468067956951944,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12030528.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008951274212449789,
+ "skip_count": 1.0,
+ "step": 7460,
+ "text_loss": 0.610903263092041
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00022442236536647408,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12033699.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004062872380018234,
+ "skip_count": 2.0,
+ "step": 7462,
+ "text_loss": 0.26921433210372925
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.00022416415676529823,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12037402.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023089025635272264,
+ "skip_count": 1.0,
+ "step": 7464,
+ "text_loss": 0.4746153950691223
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.05165835045494,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.00022390605386493756,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12041129.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021355501376092434,
+ "skip_count": 2.0,
+ "step": 7466,
+ "text_loss": 0.4265538454055786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.061050777810394,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.00022364805676429816,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12044356.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0061582159250974655,
+ "skip_count": 1.0,
+ "step": 7468,
+ "text_loss": 0.12020833045244217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.00022339016556224467,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12047158.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003753372235223651,
+ "skip_count": 1.0,
+ "step": 7470,
+ "text_loss": 0.6406939625740051
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 35.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.00022313238035760158,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 12050149.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005371729377657175,
+ "skip_count": 5.0,
+ "step": 7472,
+ "text_loss": 0.5184400677680969
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.0002228747012491526,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12053560.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000824139395263046,
+ "skip_count": 0.0,
+ "step": 7474,
+ "text_loss": 0.32644152641296387
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0002226171283356409,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12056309.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0044801668263971806,
+ "skip_count": 1.0,
+ "step": 7476,
+ "text_loss": 0.7027081847190857
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.00022235966171576887,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12059191.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007496353704482317,
+ "skip_count": 2.0,
+ "step": 7478,
+ "text_loss": 0.28705671429634094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.117405341943055,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.0002221023014881982,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12062365.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018641395727172494,
+ "skip_count": 1.0,
+ "step": 7480,
+ "text_loss": 0.715477466583252
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.126797769298506,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.00022184504775154984,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12065508.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005825075786560774,
+ "skip_count": 0.0,
+ "step": 7482,
+ "text_loss": 0.7481293678283691
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00022158790060440394,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12068043.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028906071092933416,
+ "skip_count": 0.0,
+ "step": 7484,
+ "text_loss": 0.6151962876319885
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.00022133086014529968,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12070897.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030862605199217796,
+ "skip_count": 1.0,
+ "step": 7486,
+ "text_loss": 0.4923575222492218
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.00022107392647273527,
+ "loss": 0.009,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12074644.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011101154377683997,
+ "skip_count": 0.0,
+ "step": 7488,
+ "text_loss": 0.5217859148979187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.00022081709968516867,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12077718.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004303969442844391,
+ "skip_count": 0.0,
+ "step": 7490,
+ "text_loss": 0.18933317065238953
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.17375990607572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.00022056037988101612,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12080509.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019941304344683886,
+ "skip_count": 1.0,
+ "step": 7492,
+ "text_loss": 0.6760565042495728
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 0.00022030376715865313,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12083580.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017090907786041498,
+ "skip_count": 0.0,
+ "step": 7494,
+ "text_loss": 0.4140956401824951
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.19254476078662,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.0002200472616164142,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12086923.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005131757352501154,
+ "skip_count": 1.0,
+ "step": 7496,
+ "text_loss": 0.43287888169288635
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00021979086335259269,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12090003.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007472267607226968,
+ "skip_count": 0.0,
+ "step": 7498,
+ "text_loss": 0.6692602038383484
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00021953457246544095,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12092936.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012374494690448046,
+ "skip_count": 0.0,
+ "step": 7500,
+ "text_loss": 0.5170100331306458
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00021927838905317016,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12096395.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006784295197576284,
+ "skip_count": 2.0,
+ "step": 7502,
+ "text_loss": 0.340880811214447
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 0.00021902231321395017,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12099743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0058755455538630486,
+ "skip_count": 1.0,
+ "step": 7504,
+ "text_loss": 0.5299809575080872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00021876634504590985,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12103121.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010622406378388405,
+ "skip_count": 2.0,
+ "step": 7506,
+ "text_loss": 0.1817338913679123
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 35.248899324919286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.00021851048464713662,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12105883.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004382388666272163,
+ "skip_count": 3.0,
+ "step": 7508,
+ "text_loss": 0.5718557834625244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.00021825473211567665,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12108936.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001638208981603384,
+ "skip_count": 0.0,
+ "step": 7510,
+ "text_loss": 0.4684678316116333
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 0.00021799908754953468,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12112060.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007894381997175515,
+ "skip_count": 2.0,
+ "step": 7512,
+ "text_loss": 0.5146099328994751
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.00021774355104667455,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12115636.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01400370616465807,
+ "skip_count": 2.0,
+ "step": 7514,
+ "text_loss": 0.19512294232845306
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 35.28646903434106,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00021748812270501805,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12119116.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005261222366243601,
+ "skip_count": 3.0,
+ "step": 7516,
+ "text_loss": 0.17316904664039612
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.295861461696504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 0.0002172328026224459,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12122070.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01021486520767212,
+ "skip_count": 2.0,
+ "step": 7518,
+ "text_loss": 0.2777172029018402
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00021697759089679713,
+ "loss": 0.0056,
+ "macro_f1": 1.0,
+ "num_tokens": 12125386.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005217147525399923,
+ "skip_count": 2.0,
+ "step": 7520,
+ "text_loss": 0.49744322896003723
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00021672248762586948,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12128753.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003868246916681528,
+ "skip_count": 0.0,
+ "step": 7522,
+ "text_loss": 0.4209211468696594
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 35.32403874376284,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00021646749290741895,
+ "loss": 0.009,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 12132425.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.044205982238054276,
+ "skip_count": 3.0,
+ "step": 7524,
+ "text_loss": 0.4180344343185425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.00021621260683916005,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12135740.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032584366854280233,
+ "skip_count": 2.0,
+ "step": 7526,
+ "text_loss": 0.21219655871391296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.00021595782951876552,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12139239.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002418758114799857,
+ "skip_count": 2.0,
+ "step": 7528,
+ "text_loss": 0.40800613164901733
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.35221602582917,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0186767578125,
+ "learning_rate": 0.0002157031610438665,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 12142572.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005265383515506983,
+ "skip_count": 1.0,
+ "step": 7530,
+ "text_loss": 0.7539705634117126
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.0002154486015120525,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 12145737.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006648020353168249,
+ "skip_count": 2.0,
+ "step": 7532,
+ "text_loss": 0.7824432253837585
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.371000880540066,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0002151941510208712,
+ "loss": 0.0049,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 12149376.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01692759431898594,
+ "skip_count": 0.0,
+ "step": 7534,
+ "text_loss": 0.4476291239261627
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 35.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0002149398096678283,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12152191.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013883143663406372,
+ "skip_count": 0.0,
+ "step": 7536,
+ "text_loss": 0.14996720850467682
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.058837890625,
+ "learning_rate": 0.00021468557755038826,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 12155084.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009390740655362606,
+ "skip_count": 2.0,
+ "step": 7538,
+ "text_loss": 0.23685340583324432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.3991781626064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0002144314547659731,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12159366.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025363171007484198,
+ "skip_count": 0.0,
+ "step": 7540,
+ "text_loss": 0.6687407493591309
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.00021417744141196315,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12162545.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004230613354593515,
+ "skip_count": 1.0,
+ "step": 7542,
+ "text_loss": 0.24885894358158112
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 35.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 0.00021392353758569694,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12165381.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008058524690568447,
+ "skip_count": 0.0,
+ "step": 7544,
+ "text_loss": 0.15833988785743713
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.427355444672735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0002136697433844707,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12168304.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018041770672425628,
+ "skip_count": 0.0,
+ "step": 7546,
+ "text_loss": 0.6046217083930969
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.00021341605890553894,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 12171040.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008584463968873024,
+ "skip_count": 2.0,
+ "step": 7548,
+ "text_loss": 0.3001522719860077
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.00021316248424611408,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12174702.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010506469989195466,
+ "skip_count": 0.0,
+ "step": 7550,
+ "text_loss": 0.2998376488685608
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0162353515625,
+ "learning_rate": 0.00021290901950336627,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12178388.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012753128539770842,
+ "skip_count": 0.0,
+ "step": 7552,
+ "text_loss": 0.8125656843185425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.00021265566477442384,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12181863.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004343052394688129,
+ "skip_count": 2.0,
+ "step": 7554,
+ "text_loss": 0.14004671573638916
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 35.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.00021240242015637268,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12185485.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0005794052849523723,
+ "skip_count": 0.0,
+ "step": 7556,
+ "text_loss": 0.7116519808769226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.4837100088054,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.00021214928574625664,
+ "loss": 0.0063,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 12188914.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01066325418651104,
+ "skip_count": 0.0,
+ "step": 7558,
+ "text_loss": 0.4664429724216461
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.49310243616085,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00021189626164107718,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12193042.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011769415577873588,
+ "skip_count": 0.0,
+ "step": 7560,
+ "text_loss": 0.672637403011322
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.00021164334793779388,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 12195675.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008653911761939526,
+ "skip_count": 1.0,
+ "step": 7562,
+ "text_loss": 0.5301182866096497
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.00021139054473332357,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12198638.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0058176578022539616,
+ "skip_count": 0.0,
+ "step": 7564,
+ "text_loss": 0.1889677792787552
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.000211137852124541,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12202312.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004154018242843449,
+ "skip_count": 0.0,
+ "step": 7566,
+ "text_loss": 0.3610386848449707
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.53067214558262,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.00021088527020827848,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12205112.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014722816413268447,
+ "skip_count": 0.0,
+ "step": 7568,
+ "text_loss": 0.15214823186397552
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.0002106327990813257,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12208103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015596678713336587,
+ "skip_count": 0.0,
+ "step": 7570,
+ "text_loss": 0.5034125447273254
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 35.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.00021038043884043022,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12211208.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007482443004846573,
+ "skip_count": 0.0,
+ "step": 7572,
+ "text_loss": 0.6760116219520569
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.00021012818958229696,
+ "loss": 0.0031,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12214463.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003875598544254899,
+ "skip_count": 2.0,
+ "step": 7574,
+ "text_loss": 0.3278147876262665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.5682418550044,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.00020987605140358824,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12218199.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007918627932667732,
+ "skip_count": 2.0,
+ "step": 7576,
+ "text_loss": 0.23850615322589874
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.00020962402440092388,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12221151.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005424308590590954,
+ "skip_count": 1.0,
+ "step": 7578,
+ "text_loss": 0.5670642256736755
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0002093721086708812,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 12224789.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0066504343412816525,
+ "skip_count": 1.0,
+ "step": 7580,
+ "text_loss": 0.30404478311538696
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 35.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.00020912030430999452,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12228134.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008815597742795944,
+ "skip_count": 0.0,
+ "step": 7582,
+ "text_loss": 0.32522889971733093
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 35.60581156442618,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.05126953125,
+ "learning_rate": 0.0002088686114147561,
+ "loss": 0.0098,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 12231335.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03785836696624756,
+ "skip_count": 2.0,
+ "step": 7584,
+ "text_loss": 0.6277920603752136
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.61520399178163,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.00020861703008161504,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12234619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016183801926672459,
+ "skip_count": 0.0,
+ "step": 7586,
+ "text_loss": 0.38319316506385803
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.00020836556040697767,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 12237296.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013077575713396072,
+ "skip_count": 1.0,
+ "step": 7588,
+ "text_loss": 0.297571063041687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 0.00020811420248720769,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12240633.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002858756808564067,
+ "skip_count": 0.0,
+ "step": 7590,
+ "text_loss": 0.2506035268306732
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.000207862956418626,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12244118.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032624071463942528,
+ "skip_count": 1.0,
+ "step": 7592,
+ "text_loss": 0.19843827188014984
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.65277370120341,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056640625,
+ "learning_rate": 0.00020761182229751045,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 12247367.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005885142367333174,
+ "skip_count": 3.0,
+ "step": 7594,
+ "text_loss": 0.3347153067588806
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 35.66216612855885,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.00020736080022009602,
+ "loss": 0.0088,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 12250487.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.021491389721632004,
+ "skip_count": 4.0,
+ "step": 7596,
+ "text_loss": 0.6777212619781494
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 35.671558555914295,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04052734375,
+ "learning_rate": 0.00020710989028257514,
+ "loss": 0.0061,
+ "macro_f1": 0.6595745086669922,
+ "num_tokens": 12253834.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.014164486899971962,
+ "skip_count": 4.0,
+ "step": 7598,
+ "text_loss": 0.741127610206604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0002068590925810968,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12257289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012773120542988181,
+ "skip_count": 0.0,
+ "step": 7600,
+ "text_loss": 0.5336982607841492
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.0002066084072117672,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12260825.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013102042488753796,
+ "skip_count": 2.0,
+ "step": 7602,
+ "text_loss": 0.30410775542259216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.699735837980626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.00020635783427064942,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12264609.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002602101070806384,
+ "skip_count": 0.0,
+ "step": 7604,
+ "text_loss": 0.29835572838783264
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00020610737385376348,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12267537.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0053265830501914024,
+ "skip_count": 0.0,
+ "step": 7606,
+ "text_loss": 0.2095658779144287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.71852069269152,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.00020585702605708628,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12271175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000614096992649138,
+ "skip_count": 0.0,
+ "step": 7608,
+ "text_loss": 0.8146751523017883
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.72791312004696,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.00020560679097655137,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12274067.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013201923575252295,
+ "skip_count": 0.0,
+ "step": 7610,
+ "text_loss": 0.40818271040916443
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.73730554740241,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.0002053566687080497,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12276946.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004304401110857725,
+ "skip_count": 1.0,
+ "step": 7612,
+ "text_loss": 0.7063660025596619
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.0002051066593474284,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12279760.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032060579396784306,
+ "skip_count": 1.0,
+ "step": 7614,
+ "text_loss": 0.23671887814998627
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00020485676299049154,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12282737.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005103024188429117,
+ "skip_count": 2.0,
+ "step": 7616,
+ "text_loss": 0.17571020126342773
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00020460697973299986,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 12286290.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007189507596194744,
+ "skip_count": 1.0,
+ "step": 7618,
+ "text_loss": 0.30872994661331177
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.0002043573096706708,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12289458.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010217712260782719,
+ "skip_count": 0.0,
+ "step": 7620,
+ "text_loss": 0.5155487060546875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.0002041077528991784,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12292846.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022399788722395897,
+ "skip_count": 1.0,
+ "step": 7622,
+ "text_loss": 0.717949390411377
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.0002038583095141532,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12295673.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018168877577409148,
+ "skip_count": 0.0,
+ "step": 7624,
+ "text_loss": 0.560361385345459
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 0.00020360897961118246,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12298624.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008487844606861472,
+ "skip_count": 0.0,
+ "step": 7626,
+ "text_loss": 0.6391524076461792
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00020335976328580984,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12302136.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006127831293269992,
+ "skip_count": 0.0,
+ "step": 7628,
+ "text_loss": 0.5932226777076721
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.07373046875,
+ "learning_rate": 0.00020311066063353556,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12305152.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018765819258987904,
+ "skip_count": 0.0,
+ "step": 7630,
+ "text_loss": 0.37831631302833557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00020286167174981618,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12307771.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025384656619280577,
+ "skip_count": 0.0,
+ "step": 7632,
+ "text_loss": 0.34806445240974426
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.8406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 0.0002026127967300645,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12310921.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008239032700657845,
+ "skip_count": 2.0,
+ "step": 7634,
+ "text_loss": 0.34859901666641235
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00020236403566965027,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12314200.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029505928978323936,
+ "skip_count": 2.0,
+ "step": 7636,
+ "text_loss": 0.2647531032562256
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 35.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.0002021153886638991,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12319221.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0014016951899975538,
+ "skip_count": 0.0,
+ "step": 7638,
+ "text_loss": 0.42428603768348694
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 35.86879953037863,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.00020186685580809288,
+ "loss": 0.0059,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 12322204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01761031709611416,
+ "skip_count": 2.0,
+ "step": 7640,
+ "text_loss": 0.25929757952690125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.878191957734074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.00020161843719746997,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12324750.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023674629628658295,
+ "skip_count": 0.0,
+ "step": 7642,
+ "text_loss": 0.567159116268158
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.887584385089525,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0002013701329272248,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12327933.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004534341394901276,
+ "skip_count": 0.0,
+ "step": 7644,
+ "text_loss": 0.4765215516090393
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.00020112194309250797,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12330847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003144246758893132,
+ "skip_count": 2.0,
+ "step": 7646,
+ "text_loss": 0.39837369322776794
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.00020087386778842642,
+ "loss": 0.0046,
+ "macro_f1": 1.0,
+ "num_tokens": 12333782.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008137194439768791,
+ "skip_count": 1.0,
+ "step": 7648,
+ "text_loss": 0.42175763845443726
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.00020062590711004296,
+ "loss": 0.0034,
+ "macro_f1": 1.0,
+ "num_tokens": 12336837.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006499455776065588,
+ "skip_count": 1.0,
+ "step": 7650,
+ "text_loss": 0.18695278465747833
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 0.00020037806115237667,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12340414.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001548365456983447,
+ "skip_count": 0.0,
+ "step": 7652,
+ "text_loss": 0.1981094628572464
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 35.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00020013033001040255,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12343209.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008136926218867302,
+ "skip_count": 2.0,
+ "step": 7654,
+ "text_loss": 0.2231602668762207
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.00019988271377905165,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12346158.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00370375020429492,
+ "skip_count": 1.0,
+ "step": 7656,
+ "text_loss": 0.4809921383857727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 35.95333137657764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.00019963521255321077,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12349279.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00690054427832365,
+ "skip_count": 3.0,
+ "step": 7658,
+ "text_loss": 0.40473970770835876
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 35.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.0001993878264277233,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 12352848.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004367961548268795,
+ "skip_count": 1.0,
+ "step": 7660,
+ "text_loss": 0.3646799921989441
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 0.00019914055549738775,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12356737.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000662159756757319,
+ "skip_count": 0.0,
+ "step": 7662,
+ "text_loss": 0.3703214228153229
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 35.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0001988933998569589,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12360085.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023262565955519676,
+ "skip_count": 0.0,
+ "step": 7664,
+ "text_loss": 0.12910836935043335
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 35.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.0001986463596011473,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12363296.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002686078194528818,
+ "skip_count": 1.0,
+ "step": 7666,
+ "text_loss": 0.39628392457962036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.00019839943482461914,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12366072.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007100159768015146,
+ "skip_count": 1.0,
+ "step": 7668,
+ "text_loss": 0.6588287949562073
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.00939242735544,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00019815262562199648,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12368940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004194926470518112,
+ "skip_count": 0.0,
+ "step": 7670,
+ "text_loss": 0.36411619186401367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0189208984375,
+ "learning_rate": 0.00019790593208785713,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12372031.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041313013061881065,
+ "skip_count": 0.0,
+ "step": 7672,
+ "text_loss": 0.23270413279533386
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 36.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00019765935431673444,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12375115.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003343774238601327,
+ "skip_count": 0.0,
+ "step": 7674,
+ "text_loss": 0.1686355322599411
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 36.03756970942178,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.00019741289240311755,
+ "loss": 0.0058,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 12379089.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021328814327716827,
+ "skip_count": 4.0,
+ "step": 7676,
+ "text_loss": 0.9312577247619629
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00019716654644145104,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12383115.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004511173174250871,
+ "skip_count": 0.0,
+ "step": 7678,
+ "text_loss": 0.3305695056915283
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 0.00019692031652613522,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12386064.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006190002430230379,
+ "skip_count": 0.0,
+ "step": 7680,
+ "text_loss": 0.4829687178134918
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 36.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.00019667420275152575,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 12389743.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.004575030412524939,
+ "skip_count": 1.0,
+ "step": 7682,
+ "text_loss": 0.5751548409461975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.0001964282052119341,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12392481.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002718796720728278,
+ "skip_count": 0.0,
+ "step": 7684,
+ "text_loss": 0.5349925756454468
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.0001961823240016269,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12395207.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027528523933142424,
+ "skip_count": 0.0,
+ "step": 7686,
+ "text_loss": 0.5322592258453369
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 36.09392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.00019593655921482624,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12398232.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008105970919132233,
+ "skip_count": 0.0,
+ "step": 7688,
+ "text_loss": 0.3192061185836792
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 36.10331670090989,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.00019569091094570967,
+ "loss": 0.0069,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 12400862.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.024075545370578766,
+ "skip_count": 1.0,
+ "step": 7690,
+ "text_loss": 0.3189752697944641
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 36.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.0001954453792884101,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12404039.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007513802964240313,
+ "skip_count": 3.0,
+ "step": 7692,
+ "text_loss": 0.5985093712806702
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.0001951999643370157,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 12407085.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009606506675481796,
+ "skip_count": 2.0,
+ "step": 7694,
+ "text_loss": 0.2050790935754776
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.00019495466618556996,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12411377.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007978329667821527,
+ "skip_count": 0.0,
+ "step": 7696,
+ "text_loss": 0.4705570638179779
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00019470948492807154,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12414427.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010737364646047354,
+ "skip_count": 0.0,
+ "step": 7698,
+ "text_loss": 0.6105324029922485
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 0.00019446442065847448,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12417442.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001762967323884368,
+ "skip_count": 0.0,
+ "step": 7700,
+ "text_loss": 0.5638618469238281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00019421947347068774,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12420862.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015798417152836919,
+ "skip_count": 0.0,
+ "step": 7702,
+ "text_loss": 0.1939864307641983
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00019397464345857562,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12423876.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005659835878759623,
+ "skip_count": 1.0,
+ "step": 7704,
+ "text_loss": 0.20829300582408905
+ },
+ {
+ "acc_repeat": 0.75,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 36.17845611975345,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.8571428656578064,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 0.00019372993071595723,
+ "loss": 0.0072,
+ "macro_f1": 0.9449735879898071,
+ "num_tokens": 12427639.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.018665846437215805,
+ "skip_count": 2.0,
+ "step": 7706,
+ "text_loss": 0.47913849353790283
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00019348533533660727,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12431520.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006690093432553113,
+ "skip_count": 0.0,
+ "step": 7708,
+ "text_loss": 0.494870662689209
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00019324085741425511,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12434213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004067352041602135,
+ "skip_count": 1.0,
+ "step": 7710,
+ "text_loss": 0.7631711959838867
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 36.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 0.00019299649704258504,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12437437.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01157623715698719,
+ "skip_count": 0.0,
+ "step": 7712,
+ "text_loss": 0.3145926296710968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.21602582917523,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 0.0001927522543152364,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12440507.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001888492377474904,
+ "skip_count": 0.0,
+ "step": 7714,
+ "text_loss": 0.576301097869873
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00019250812932580352,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12443484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00042988534551113844,
+ "skip_count": 0.0,
+ "step": 7716,
+ "text_loss": 0.5716445446014404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.00019226412216783557,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12446460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005063199903815985,
+ "skip_count": 1.0,
+ "step": 7718,
+ "text_loss": 0.2700924873352051
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.0001920202329348365,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12449346.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010775640839710832,
+ "skip_count": 0.0,
+ "step": 7720,
+ "text_loss": 0.5162558555603027
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.00019177646172026513,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12452680.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014514096546918154,
+ "skip_count": 0.0,
+ "step": 7722,
+ "text_loss": 0.5753642916679382
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0177001953125,
+ "learning_rate": 0.00019153280861753497,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12455348.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002202774863690138,
+ "skip_count": 1.0,
+ "step": 7724,
+ "text_loss": 0.5751997232437134
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00019128927372001454,
+ "loss": 0.0032,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12458098.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005171069409698248,
+ "skip_count": 0.0,
+ "step": 7726,
+ "text_loss": 0.22252975404262543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00019104585712102678,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12460958.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041033923625946045,
+ "skip_count": 0.0,
+ "step": 7728,
+ "text_loss": 0.18611937761306763
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 36.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.00019080255891384945,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12463596.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0012201941572129726,
+ "skip_count": 0.0,
+ "step": 7730,
+ "text_loss": 0.47347909212112427
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 36.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0001905593791917148,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 12467021.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005837214644998312,
+ "skip_count": 2.0,
+ "step": 7732,
+ "text_loss": 0.2055564969778061
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.00019031631804780974,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12469743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010269953636452556,
+ "skip_count": 0.0,
+ "step": 7734,
+ "text_loss": 0.45995602011680603
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00019007337557527582,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12473082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00436213007196784,
+ "skip_count": 1.0,
+ "step": 7736,
+ "text_loss": 0.4515823721885681
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.00018983055186720888,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12476100.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003051829058676958,
+ "skip_count": 2.0,
+ "step": 7738,
+ "text_loss": 0.12298467755317688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.0001895878470166597,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12480231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008164191618561745,
+ "skip_count": 2.0,
+ "step": 7740,
+ "text_loss": 0.17456457018852234
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.347519812151454,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 0.00018934526111663314,
+ "loss": 0.0069,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 12483894.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008653721772134304,
+ "skip_count": 1.0,
+ "step": 7742,
+ "text_loss": 0.7125775814056396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 36.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.00018910279426008857,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12488077.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005024447571486235,
+ "skip_count": 6.0,
+ "step": 7744,
+ "text_loss": 0.833778977394104
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.00018886044653993966,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12490999.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002690888475626707,
+ "skip_count": 0.0,
+ "step": 7746,
+ "text_loss": 0.15594039857387543
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00018861821804905466,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12494765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006087568122893572,
+ "skip_count": 0.0,
+ "step": 7748,
+ "text_loss": 0.2696777880191803
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.385089521573235,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00018837610888025586,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12497741.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014629303477704525,
+ "skip_count": 0.0,
+ "step": 7750,
+ "text_loss": 0.6801294684410095
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.11865234375,
+ "learning_rate": 0.00018813411912631996,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12500585.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001163579523563385,
+ "skip_count": 0.0,
+ "step": 7752,
+ "text_loss": 0.41069695353507996
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 36.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.00018789224887997796,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12503579.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009436148218810558,
+ "skip_count": 0.0,
+ "step": 7754,
+ "text_loss": 0.6993107795715332
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.00018765049823391472,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 12506698.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002098206663504243,
+ "skip_count": 2.0,
+ "step": 7756,
+ "text_loss": 0.5704247951507568
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00018740886728077,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12509869.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002066673245280981,
+ "skip_count": 1.0,
+ "step": 7758,
+ "text_loss": 0.7605635523796082
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.43205165835045,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.00018716735611313707,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12513433.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023439819924533367,
+ "skip_count": 1.0,
+ "step": 7760,
+ "text_loss": 0.4746153950691223
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.441444085705896,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.00018692596482356333,
+ "loss": 0.0057,
+ "macro_f1": 0.9255813956260681,
+ "num_tokens": 12516817.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.039019811898469925,
+ "skip_count": 4.0,
+ "step": 7762,
+ "text_loss": 0.3105330467224121
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 0.00018668469350455048,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12519357.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002269966993480921,
+ "skip_count": 0.0,
+ "step": 7764,
+ "text_loss": 0.3700210452079773
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00018644354224855414,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12522072.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001265842467546463,
+ "skip_count": 0.0,
+ "step": 7766,
+ "text_loss": 0.6737633943557739
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00018620251114798386,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12524999.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006547329016029835,
+ "skip_count": 1.0,
+ "step": 7768,
+ "text_loss": 0.24906545877456665
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.47901379512768,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 0.0001859616002952033,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 12527785.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010791841894388199,
+ "skip_count": 3.0,
+ "step": 7770,
+ "text_loss": 0.3069820702075958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.0001857208097825299,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12530801.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00492103723809123,
+ "skip_count": 2.0,
+ "step": 7772,
+ "text_loss": 0.2524295151233673
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.0001854801397022351,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12533919.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001942967064678669,
+ "skip_count": 0.0,
+ "step": 7774,
+ "text_loss": 0.7855241894721985
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 36.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00018523959014654407,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 12537265.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00987488217651844,
+ "skip_count": 2.0,
+ "step": 7776,
+ "text_loss": 0.2767317593097687
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 0.00018499916120763582,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12539695.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0054283770732581615,
+ "skip_count": 1.0,
+ "step": 7778,
+ "text_loss": 0.43287888169288635
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 36.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00018475885297764306,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12542881.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00797359924763441,
+ "skip_count": 0.0,
+ "step": 7780,
+ "text_loss": 0.3738224506378174
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.0001845186655486527,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12546530.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0045951665379107,
+ "skip_count": 0.0,
+ "step": 7782,
+ "text_loss": 0.2511517107486725
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 36.54476078661579,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00018427859901270482,
+ "loss": 0.0055,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 12549439.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02312052994966507,
+ "skip_count": 4.0,
+ "step": 7784,
+ "text_loss": 0.3837030827999115
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
+ "epoch": 36.55415321397123,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.059814453125,
+ "learning_rate": 0.00018403865346179344,
+ "loss": 0.0066,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 12553211.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.014698561280965805,
+ "skip_count": 3.0,
+ "step": 7786,
+ "text_loss": 0.510159432888031
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 36.563545641326684,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 0.00018379882898786603,
+ "loss": 0.0075,
+ "macro_f1": 0.8803418874740601,
+ "num_tokens": 12556497.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.023926246911287308,
+ "skip_count": 7.0,
+ "step": 7788,
+ "text_loss": 0.44811317324638367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.57293806868213,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.00018355912568282384,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12559778.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011187797645106912,
+ "skip_count": 0.0,
+ "step": 7790,
+ "text_loss": 0.32099616527557373
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00018331954363852166,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12562610.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005356677575036883,
+ "skip_count": 0.0,
+ "step": 7792,
+ "text_loss": 0.9754356145858765
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 36.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 0.0001830800829467677,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12565886.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0017101728590205312,
+ "skip_count": 0.0,
+ "step": 7794,
+ "text_loss": 0.4234761595726013
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.00018284074369932386,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12568728.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012841494753956795,
+ "skip_count": 0.0,
+ "step": 7796,
+ "text_loss": 0.41109147667884827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0001826015259879053,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12572231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022388407960534096,
+ "skip_count": 0.0,
+ "step": 7798,
+ "text_loss": 0.5459926128387451
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.00018236242990418074,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12574968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019992550369352102,
+ "skip_count": 0.0,
+ "step": 7800,
+ "text_loss": 0.5028481483459473
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.629292632814796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.0001821234555397722,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12579074.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002936388598755002,
+ "skip_count": 2.0,
+ "step": 7802,
+ "text_loss": 0.2377086579799652
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 36.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.00018188460298625503,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12581912.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0026762608904391527,
+ "skip_count": 0.0,
+ "step": 7804,
+ "text_loss": 0.13887254893779755
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 30.0,
+ "epoch": 36.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00018164587233515824,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 12585020.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.003901638789102435,
+ "skip_count": 1.0,
+ "step": 7806,
+ "text_loss": 0.35454171895980835
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.00018140726367796373,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12588310.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031358697451651096,
+ "skip_count": 2.0,
+ "step": 7808,
+ "text_loss": 0.3567306697368622
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.00018116877710610673,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12591735.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002310588024556637,
+ "skip_count": 1.0,
+ "step": 7810,
+ "text_loss": 0.45357072353363037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.00018093041271097582,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12595232.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005600228440016508,
+ "skip_count": 2.0,
+ "step": 7812,
+ "text_loss": 0.4179847836494446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.685647196947464,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.00018069217058391267,
+ "loss": 0.006,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 12598367.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04015933722257614,
+ "skip_count": 1.0,
+ "step": 7814,
+ "text_loss": 0.17874565720558167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.00018045405081621214,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12601864.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005119446665048599,
+ "skip_count": 1.0,
+ "step": 7816,
+ "text_loss": 0.6867854595184326
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00018021605349912207,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12605268.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005990012432448566,
+ "skip_count": 0.0,
+ "step": 7818,
+ "text_loss": 0.9084970355033875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.00017997817872384358,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12608093.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008712377399206161,
+ "skip_count": 1.0,
+ "step": 7820,
+ "text_loss": 0.19413328170776367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.00017974042658153066,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12611001.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007535711396485567,
+ "skip_count": 1.0,
+ "step": 7822,
+ "text_loss": 0.2672932744026184
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.73260933372468,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.0001795027971632905,
+ "loss": 0.0042,
+ "macro_f1": 1.0,
+ "num_tokens": 12614584.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006770546548068523,
+ "skip_count": 3.0,
+ "step": 7824,
+ "text_loss": 0.22805163264274597
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0189208984375,
+ "learning_rate": 0.00017926529056018297,
+ "loss": 0.0031,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12617519.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010458873584866524,
+ "skip_count": 0.0,
+ "step": 7826,
+ "text_loss": 0.385499507188797
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 36.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 0.00017902790686322102,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12621566.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00634258147329092,
+ "skip_count": 0.0,
+ "step": 7828,
+ "text_loss": 0.8044118285179138
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 36.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.00017879064616337076,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12624751.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0053052278235554695,
+ "skip_count": 3.0,
+ "step": 7830,
+ "text_loss": 0.264322966337204
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.77017904314646,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 0.00017855350855155088,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12628478.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028291696216911077,
+ "skip_count": 0.0,
+ "step": 7832,
+ "text_loss": 0.20611460506916046
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 36.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00017831649411863287,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12632027.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009586421074345708,
+ "skip_count": 1.0,
+ "step": 7834,
+ "text_loss": 0.4119716286659241
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00017807960295544118,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12635144.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012304541654884815,
+ "skip_count": 2.0,
+ "step": 7836,
+ "text_loss": 0.28647977113723755
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0001778428351527529,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12638719.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005212076939642429,
+ "skip_count": 2.0,
+ "step": 7838,
+ "text_loss": 0.630459189414978
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.0001776061908012979,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12642119.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00183707510586828,
+ "skip_count": 0.0,
+ "step": 7840,
+ "text_loss": 0.5905961990356445
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 36.81714117992369,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.0001773696699917588,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12645077.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0058263009414076805,
+ "skip_count": 0.0,
+ "step": 7842,
+ "text_loss": 0.41949576139450073
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.00017713327281477077,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12648964.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001586507773026824,
+ "skip_count": 0.0,
+ "step": 7844,
+ "text_loss": 0.5048848390579224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.00017689699936092163,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12651934.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002397194504737854,
+ "skip_count": 0.0,
+ "step": 7846,
+ "text_loss": 0.23879878222942352
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 36.84531846199002,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.0001766608497207518,
+ "loss": 0.0054,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 12654907.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016742069274187088,
+ "skip_count": 2.0,
+ "step": 7848,
+ "text_loss": 0.23400072753429413
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.0001764248239847544,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12658765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007037387229502201,
+ "skip_count": 2.0,
+ "step": 7850,
+ "text_loss": 0.26165497303009033
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 36.86410331670091,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.017822265625,
+ "learning_rate": 0.00017618892224337463,
+ "loss": 0.0044,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 12662024.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017352160066366196,
+ "skip_count": 2.0,
+ "step": 7852,
+ "text_loss": 0.23813043534755707
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 36.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.00017595314458701084,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12665751.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005349365528672934,
+ "skip_count": 3.0,
+ "step": 7854,
+ "text_loss": 0.14920757710933685
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00017571749110601337,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12668823.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037689812015742064,
+ "skip_count": 2.0,
+ "step": 7856,
+ "text_loss": 0.2198697030544281
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00017548196189068506,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12672367.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006363615393638611,
+ "skip_count": 0.0,
+ "step": 7858,
+ "text_loss": 0.5338839888572693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00017524655703128112,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12675217.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002691479865461588,
+ "skip_count": 0.0,
+ "step": 7860,
+ "text_loss": 0.17463763058185577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00017501127661800908,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12678796.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002262329449877143,
+ "skip_count": 0.0,
+ "step": 7862,
+ "text_loss": 0.4637797474861145
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.92045788083358,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 0.00017477612074102899,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12681631.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00115531450137496,
+ "skip_count": 0.0,
+ "step": 7864,
+ "text_loss": 0.6089238524436951
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00017454108949045295,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12685647.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00260268640704453,
+ "skip_count": 0.0,
+ "step": 7866,
+ "text_loss": 0.5876018404960632
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.93924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.00017430618295634514,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12688995.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002731681102886796,
+ "skip_count": 0.0,
+ "step": 7868,
+ "text_loss": 0.35076001286506653
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 36.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.00017407140122872262,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 12692100.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003314645728096366,
+ "skip_count": 1.0,
+ "step": 7870,
+ "text_loss": 0.5313478112220764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.958027590255355,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00017383674439755393,
+ "loss": 0.0069,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 12695117.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010385016910731792,
+ "skip_count": 1.0,
+ "step": 7872,
+ "text_loss": 0.5092368125915527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00017360221255276016,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12697678.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001273582922294736,
+ "skip_count": 0.0,
+ "step": 7874,
+ "text_loss": 0.5282881855964661
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 36.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.00017336780578421418,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12702132.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007510313298553228,
+ "skip_count": 0.0,
+ "step": 7876,
+ "text_loss": 0.49093571305274963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 36.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.0001731335241817412,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12705413.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005138787440955639,
+ "skip_count": 2.0,
+ "step": 7878,
+ "text_loss": 0.7503541111946106
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 36.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0001728993678351184,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12708310.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.004379773512482643,
+ "skip_count": 0.0,
+ "step": 7880,
+ "text_loss": 0.5942456126213074
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 0.0001726653368340747,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12711043.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005271450616419315,
+ "skip_count": 2.0,
+ "step": 7882,
+ "text_loss": 0.348360538482666
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 37.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00017243143126829163,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 12714473.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015764752170071006,
+ "skip_count": 1.0,
+ "step": 7884,
+ "text_loss": 0.45971861481666565
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.000172197651227402,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12717832.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00040649910806678236,
+ "skip_count": 0.0,
+ "step": 7886,
+ "text_loss": 0.5996841788291931
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 0.00017196399680099078,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12720479.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00473182974383235,
+ "skip_count": 2.0,
+ "step": 7888,
+ "text_loss": 0.40346208214759827
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00017173046807859483,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12723104.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020138369873166084,
+ "skip_count": 0.0,
+ "step": 7890,
+ "text_loss": 0.6878634095191956
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.05165835045494,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.0001714970651497027,
+ "loss": 0.005,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 12725967.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008381367661058903,
+ "skip_count": 1.0,
+ "step": 7892,
+ "text_loss": 0.9161711931228638
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 37.061050777810394,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 0.00017126378810375498,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12728819.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037658829241991043,
+ "skip_count": 0.0,
+ "step": 7894,
+ "text_loss": 0.4447716772556305
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00017103063703014372,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12731806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022742559667676687,
+ "skip_count": 0.0,
+ "step": 7896,
+ "text_loss": 0.9140825867652893
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.00017079761201821298,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12734649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002157264854758978,
+ "skip_count": 0.0,
+ "step": 7898,
+ "text_loss": 0.268303781747818
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 37.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.0001705647131572583,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 12737889.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01064873393625021,
+ "skip_count": 1.0,
+ "step": 7900,
+ "text_loss": 0.36009490489959717
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 37.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.00017033194053652685,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12740821.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0062920586206018925,
+ "skip_count": 0.0,
+ "step": 7902,
+ "text_loss": 0.5301805138587952
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 37.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 0.00017009929424521782,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 12743876.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0033694824669510126,
+ "skip_count": 1.0,
+ "step": 7904,
+ "text_loss": 1.026949167251587
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.117405341943055,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 0.00016986677437248155,
+ "loss": 0.0071,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 12747623.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.05076088383793831,
+ "skip_count": 3.0,
+ "step": 7906,
+ "text_loss": 0.33465588092803955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.126797769298506,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.00016963438100742014,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12751255.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005921403644606471,
+ "skip_count": 0.0,
+ "step": 7908,
+ "text_loss": 0.3498881757259369
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.00016940211423908713,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12754297.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004132566973567009,
+ "skip_count": 0.0,
+ "step": 7910,
+ "text_loss": 0.2874198853969574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.0001691699741564876,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12756969.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024724705144762993,
+ "skip_count": 1.0,
+ "step": 7912,
+ "text_loss": 0.10593545436859131
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00016893796084857806,
+ "loss": 0.0031,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12760261.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002991671208292246,
+ "skip_count": 0.0,
+ "step": 7914,
+ "text_loss": 0.1331545114517212
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 0.00016870607440426643,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12762971.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018167285015806556,
+ "skip_count": 0.0,
+ "step": 7916,
+ "text_loss": 0.496826171875
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 37.17375990607572,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 0.00016847431491241207,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12765949.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0033364067785441875,
+ "skip_count": 0.0,
+ "step": 7918,
+ "text_loss": 0.43522849678993225
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 0.0001682426824618256,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12769201.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001313596498221159,
+ "skip_count": 0.0,
+ "step": 7920,
+ "text_loss": 0.8691539168357849
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.19254476078662,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.00016801117714126908,
+ "loss": 0.0108,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 12773308.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02579287625849247,
+ "skip_count": 1.0,
+ "step": 7922,
+ "text_loss": 0.275301069021225
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.00016777979903945568,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12776166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010501758195459843,
+ "skip_count": 1.0,
+ "step": 7924,
+ "text_loss": 0.32124993205070496
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.0001675485482450499,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12779965.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0063389060087502,
+ "skip_count": 2.0,
+ "step": 7926,
+ "text_loss": 0.2527695894241333
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00016731742484666774,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12783019.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002796935848891735,
+ "skip_count": 0.0,
+ "step": 7928,
+ "text_loss": 0.18767669796943665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0001670864289328759,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12786291.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007973561994731426,
+ "skip_count": 2.0,
+ "step": 7930,
+ "text_loss": 0.29628485441207886
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 0.00016685556059219253,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 12789566.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.011405733413994312,
+ "skip_count": 6.0,
+ "step": 7932,
+ "text_loss": 0.16635073721408844
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.248899324919286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00016662481991308682,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12792533.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012368770549073815,
+ "skip_count": 1.0,
+ "step": 7934,
+ "text_loss": 0.4196353852748871
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 0.000166394206983979,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12795619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036002211272716522,
+ "skip_count": 1.0,
+ "step": 7936,
+ "text_loss": 0.17559808492660522
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 37.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00016616372189324035,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12799702.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0039332108572125435,
+ "skip_count": 0.0,
+ "step": 7938,
+ "text_loss": 0.603410542011261
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00016593336472919324,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12802704.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008303318754769862,
+ "skip_count": 0.0,
+ "step": 7940,
+ "text_loss": 0.5331749320030212
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.28646903434106,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.00016570313558011098,
+ "loss": 0.0058,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 12805630.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.05092398822307587,
+ "skip_count": 2.0,
+ "step": 7942,
+ "text_loss": 0.17398510873317719
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.295861461696504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.00016547303453421774,
+ "loss": 0.0031,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12809065.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006886976188980043,
+ "skip_count": 0.0,
+ "step": 7944,
+ "text_loss": 0.3419797718524933
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 0.00016524306167968878,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 12812641.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005634502973407507,
+ "skip_count": 3.0,
+ "step": 7946,
+ "text_loss": 0.5877651572227478
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.00016501321710465005,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12815527.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020598487462848425,
+ "skip_count": 0.0,
+ "step": 7948,
+ "text_loss": 0.3558528423309326
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 0.0001647835008971783,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12819103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005946476943790913,
+ "skip_count": 2.0,
+ "step": 7950,
+ "text_loss": 0.5800213813781738
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.00016455391314530154,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12822423.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010360358282923698,
+ "skip_count": 2.0,
+ "step": 7952,
+ "text_loss": 0.278255820274353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.00016432445393699802,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12826180.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003017681185156107,
+ "skip_count": 0.0,
+ "step": 7954,
+ "text_loss": 0.1571389138698578
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.35221602582917,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00016409512336019698,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12829196.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008854938205331564,
+ "skip_count": 0.0,
+ "step": 7956,
+ "text_loss": 0.2776578366756439
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 0.00016386592150277834,
+ "loss": 0.0092,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12831983.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023990103509277105,
+ "skip_count": 0.0,
+ "step": 7958,
+ "text_loss": 0.46686989068984985
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 37.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.0001636368484525727,
+ "loss": 0.0035,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12834889.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009835032746195793,
+ "skip_count": 5.0,
+ "step": 7960,
+ "text_loss": 0.22224856913089752
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.00016340790429736118,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12837950.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018618656322360039,
+ "skip_count": 0.0,
+ "step": 7962,
+ "text_loss": 0.5101882815361023
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 37.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.00016317908912487578,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 12840981.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001275144051760435,
+ "skip_count": 1.0,
+ "step": 7964,
+ "text_loss": 0.40567103028297424
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.3991781626064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.00016295040302279873,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12844044.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003117429558187723,
+ "skip_count": 2.0,
+ "step": 7966,
+ "text_loss": 0.6888198852539062
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.00016272184607876312,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 12847350.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006585797294974327,
+ "skip_count": 4.0,
+ "step": 7968,
+ "text_loss": 0.19813506305217743
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 37.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.0001624934183803523,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 12850285.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0043576788157224655,
+ "skip_count": 1.0,
+ "step": 7970,
+ "text_loss": 0.6108269691467285
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 37.427355444672735,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00016226512001510024,
+ "loss": 0.0039,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 12853993.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011879517696797848,
+ "skip_count": 2.0,
+ "step": 7972,
+ "text_loss": 0.42478689551353455
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.00016203695107049117,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12857022.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016375730047002435,
+ "skip_count": 0.0,
+ "step": 7974,
+ "text_loss": 0.5130020976066589
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 0.0001618089116339601,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12860764.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006649247952736914,
+ "skip_count": 0.0,
+ "step": 7976,
+ "text_loss": 1.0629136562347412
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.455532726739065,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.00016158100179289208,
+ "loss": 0.0062,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 12864066.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03140667825937271,
+ "skip_count": 1.0,
+ "step": 7978,
+ "text_loss": 0.4241345226764679
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 37.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 0.0001613532216346226,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12867555.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010257012210786343,
+ "skip_count": 4.0,
+ "step": 7980,
+ "text_loss": 0.6085613369941711
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0001611255712464374,
+ "loss": 0.0037,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12871415.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00783725269138813,
+ "skip_count": 1.0,
+ "step": 7982,
+ "text_loss": 0.15661844611167908
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.017578125,
+ "learning_rate": 0.00016089805071557256,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 12874195.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0027650597039610147,
+ "skip_count": 2.0,
+ "step": 7984,
+ "text_loss": 0.4938865005970001
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.49310243616085,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 0.00016067066012921439,
+ "loss": 0.0083,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 12878084.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04647083953022957,
+ "skip_count": 0.0,
+ "step": 7986,
+ "text_loss": 0.2973119020462036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 0.00016044339957449938,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12881182.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002192265819758177,
+ "skip_count": 0.0,
+ "step": 7988,
+ "text_loss": 0.2623208165168762
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.00016021626913851418,
+ "loss": 0.0031,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12884028.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023096329532563686,
+ "skip_count": 0.0,
+ "step": 7990,
+ "text_loss": 0.3752247989177704
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.52127971822718,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.00015998926890829562,
+ "loss": 0.0046,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 12887759.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03038526326417923,
+ "skip_count": 1.0,
+ "step": 7992,
+ "text_loss": 0.2609226405620575
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.53067214558262,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0001597623989708306,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12890976.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015199477784335613,
+ "skip_count": 0.0,
+ "step": 7994,
+ "text_loss": 0.6512867212295532
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00015953565941305615,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12894112.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024166766088455915,
+ "skip_count": 0.0,
+ "step": 7996,
+ "text_loss": 0.5539866089820862
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
+ "learning_rate": 0.0001593090503218591,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 12896857.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005081235896795988,
+ "skip_count": 2.0,
+ "step": 7998,
+ "text_loss": 0.6631022691726685
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 37.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.00015908257178407682,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12900075.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024711282458156347,
+ "skip_count": 0.0,
+ "step": 8000,
+ "text_loss": 0.3309785723686218
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.5682418550044,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.00015885622388649617,
+ "loss": 0.0059,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 12903845.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04024988412857056,
+ "skip_count": 2.0,
+ "step": 8002,
+ "text_loss": 0.2384071946144104
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 0.00015863000671585405,
+ "loss": 0.008,
+ "macro_f1": 1.0,
+ "num_tokens": 12907694.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001953886589035392,
+ "skip_count": 2.0,
+ "step": 8004,
+ "text_loss": 0.5001366138458252
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.00015840392035883726,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12910871.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002982128644362092,
+ "skip_count": 2.0,
+ "step": 8006,
+ "text_loss": 0.2589346170425415
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0001581779649020827,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12914484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009384988807141781,
+ "skip_count": 0.0,
+ "step": 8008,
+ "text_loss": 0.5727795362472534
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.00015795214043217654,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12917480.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008854437619447708,
+ "skip_count": 2.0,
+ "step": 8010,
+ "text_loss": 0.24354904890060425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.61520399178163,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.00015772644703565563,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12920383.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001689503900706768,
+ "skip_count": 0.0,
+ "step": 8012,
+ "text_loss": 0.5372336506843567
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.00015750088479900588,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12923886.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002284591319039464,
+ "skip_count": 0.0,
+ "step": 8014,
+ "text_loss": 0.1708722710609436
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 37.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.00015727545380866316,
+ "loss": 0.0042,
+ "macro_f1": 1.0,
+ "num_tokens": 12926998.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004594483878463507,
+ "skip_count": 4.0,
+ "step": 8016,
+ "text_loss": 0.26784324645996094
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 37.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.0001570501541510131,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12929726.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0021998141892254353,
+ "skip_count": 0.0,
+ "step": 8018,
+ "text_loss": 0.8051869869232178
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.65277370120341,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00015682498591239086,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12932182.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032623414881527424,
+ "skip_count": 1.0,
+ "step": 8020,
+ "text_loss": 0.8431181907653809
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00015659994917908144,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12935338.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014909361489117146,
+ "skip_count": 1.0,
+ "step": 8022,
+ "text_loss": 0.6168642640113831
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 0.0001563750440373191,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12938484.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010295510292053223,
+ "skip_count": 0.0,
+ "step": 8024,
+ "text_loss": 0.2694014608860016
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 37.68095098326974,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00015615027057328828,
+ "loss": 0.0066,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 12942045.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018341995775699615,
+ "skip_count": 2.0,
+ "step": 8026,
+ "text_loss": 0.8151478171348572
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 37.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.0001559256288731224,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 12945547.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0023289949167519808,
+ "skip_count": 1.0,
+ "step": 8028,
+ "text_loss": 0.613464891910553
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.699735837980626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.00015570111902290463,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12949544.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006635872647166252,
+ "skip_count": 2.0,
+ "step": 8030,
+ "text_loss": 0.17417465150356293
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 37.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 0.00015547674110866756,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 12952838.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006023989990353584,
+ "skip_count": 1.0,
+ "step": 8032,
+ "text_loss": 0.4801837205886841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.71852069269152,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.00015525249521639319,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12956329.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005706884432584047,
+ "skip_count": 0.0,
+ "step": 8034,
+ "text_loss": 0.2028084248304367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.72791312004696,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 0.000155028381432013,
+ "loss": 0.0034,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12959122.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003527123713865876,
+ "skip_count": 2.0,
+ "step": 8036,
+ "text_loss": 0.39474430680274963
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.73730554740241,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0179443359375,
+ "learning_rate": 0.00015480439984140776,
+ "loss": 0.0029,
+ "macro_f1": 1.0,
+ "num_tokens": 12962546.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010415437631309032,
+ "skip_count": 2.0,
+ "step": 8038,
+ "text_loss": 0.20412345230579376
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 0.0001545805505304077,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12965861.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001566931139677763,
+ "skip_count": 0.0,
+ "step": 8040,
+ "text_loss": 0.5129821300506592
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 37.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.0001543568335847923,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12968677.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.0037196793127804995,
+ "skip_count": 0.0,
+ "step": 8042,
+ "text_loss": 0.755020260810852
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.00015413324909029031,
+ "loss": 0.0086,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12972001.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010940275387838483,
+ "skip_count": 0.0,
+ "step": 8044,
+ "text_loss": 0.48672133684158325
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.00015390979713257968,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12974765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011106903664767742,
+ "skip_count": 1.0,
+ "step": 8046,
+ "text_loss": 0.1727766990661621
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 25.0,
+ "epoch": 37.78426768417963,
+ "f1_execute": 0.949999988079071,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.048828125,
+ "learning_rate": 0.00015368647779728757,
+ "loss": 0.006,
+ "macro_f1": 0.886363685131073,
+ "num_tokens": 12979127.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.05134248360991478,
+ "skip_count": 6.0,
+ "step": 8048,
+ "text_loss": 0.33233317732810974
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.00015346329116999057,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 12982812.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027500339783728123,
+ "skip_count": 0.0,
+ "step": 8050,
+ "text_loss": 0.8176849484443665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.80305253889052,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.00015324023733621412,
+ "loss": 0.005,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 12985740.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030734945088624954,
+ "skip_count": 2.0,
+ "step": 8052,
+ "text_loss": 0.38721024990081787
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 37.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.00015301731638143285,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 12988646.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002358534839004278,
+ "skip_count": 2.0,
+ "step": 8054,
+ "text_loss": 0.5656245946884155
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.060791015625,
+ "learning_rate": 0.0001527945283910705,
+ "loss": 0.0074,
+ "macro_f1": 1.0,
+ "num_tokens": 12991518.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.007991814985871315,
+ "skip_count": 3.0,
+ "step": 8056,
+ "text_loss": 0.26438817381858826
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 37.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 0.00015257187345049983,
+ "loss": 0.0079,
+ "macro_f1": 1.0,
+ "num_tokens": 12994847.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011761264875531197,
+ "skip_count": 1.0,
+ "step": 8058,
+ "text_loss": 0.1801673173904419
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 37.8406222483123,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 0.0001523493516450427,
+ "loss": 0.004,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 12997874.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.021669765934348106,
+ "skip_count": 2.0,
+ "step": 8060,
+ "text_loss": 0.3278379738330841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0001521269630599698,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13000504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002388916676864028,
+ "skip_count": 0.0,
+ "step": 8062,
+ "text_loss": 0.5396623611450195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00015190470778050086,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13003620.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007719808723777533,
+ "skip_count": 1.0,
+ "step": 8064,
+ "text_loss": 0.1989232450723648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 0.00015168258589180462,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13007410.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007461659261025488,
+ "skip_count": 0.0,
+ "step": 8066,
+ "text_loss": 0.5293997526168823
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 37.878191957734074,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.00015146059747899848,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13010240.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005515575874596834,
+ "skip_count": 0.0,
+ "step": 8068,
+ "text_loss": 0.2776186466217041
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.887584385089525,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00015123874262714892,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13012728.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026730166282504797,
+ "skip_count": 0.0,
+ "step": 8070,
+ "text_loss": 0.5902766585350037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04833984375,
+ "learning_rate": 0.00015101702142127088,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13015616.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002244985429570079,
+ "skip_count": 0.0,
+ "step": 8072,
+ "text_loss": 0.21447396278381348
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.00015079543394632878,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13019846.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001963787479326129,
+ "skip_count": 0.0,
+ "step": 8074,
+ "text_loss": 0.22974267601966858
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 37.915761667155856,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 0.00015057398028723513,
+ "loss": 0.0064,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 13023036.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02271878905594349,
+ "skip_count": 2.0,
+ "step": 8076,
+ "text_loss": 0.26458361744880676
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.00015035266052885137,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13025840.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011732397833839059,
+ "skip_count": 0.0,
+ "step": 8078,
+ "text_loss": 0.44129177927970886
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.0001501314747559877,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 13030031.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015655985102057457,
+ "skip_count": 2.0,
+ "step": 8080,
+ "text_loss": 0.28889161348342896
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.00014991042305340286,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13033603.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012988687958568335,
+ "skip_count": 0.0,
+ "step": 8082,
+ "text_loss": 0.16362667083740234
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.95333137657764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00014968950550580434,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13036931.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002425852930173278,
+ "skip_count": 0.0,
+ "step": 8084,
+ "text_loss": 0.35900676250457764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 37.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.0001494687221978482,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13040637.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004092676565051079,
+ "skip_count": 1.0,
+ "step": 8086,
+ "text_loss": 0.20662656426429749
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00014924807321413893,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13043855.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009040542645379901,
+ "skip_count": 0.0,
+ "step": 8088,
+ "text_loss": 0.30341213941574097
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 0.0001490275586392296,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13046903.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019248841563239694,
+ "skip_count": 0.0,
+ "step": 8090,
+ "text_loss": 0.4299648702144623
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 37.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.000148807178557622,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13050219.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008314658771269023,
+ "skip_count": 0.0,
+ "step": 8092,
+ "text_loss": 0.4521652162075043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00014858693305376598,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13053076.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007470731507055461,
+ "skip_count": 0.0,
+ "step": 8094,
+ "text_loss": 0.46265852451324463
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 38.00939242735544,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.00014836682221206,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13056170.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003292408073320985,
+ "skip_count": 0.0,
+ "step": 8096,
+ "text_loss": 0.6483868956565857
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.00014814684611685124,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13059181.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001357200788334012,
+ "skip_count": 0.0,
+ "step": 8098,
+ "text_loss": 0.43141183257102966
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0177001953125,
+ "learning_rate": 0.00014792700485243476,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13062124.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030062920413911343,
+ "skip_count": 0.0,
+ "step": 8100,
+ "text_loss": 0.26022693514823914
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.0001477072985030542,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13065273.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006919128354638815,
+ "skip_count": 0.0,
+ "step": 8102,
+ "text_loss": 0.5927232503890991
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.00014748772715290144,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13068346.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005062389187514782,
+ "skip_count": 0.0,
+ "step": 8104,
+ "text_loss": 0.1255214959383011
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 0.00014726829088611664,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13071384.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005492564523592591,
+ "skip_count": 0.0,
+ "step": 8106,
+ "text_loss": 0.6445038914680481
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 0.00014704898978678817,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13074667.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002470226027071476,
+ "skip_count": 0.0,
+ "step": 8108,
+ "text_loss": 0.5019628405570984
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.00014682982393895256,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13077566.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008262090268544853,
+ "skip_count": 0.0,
+ "step": 8110,
+ "text_loss": 0.6075460314750671
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 0.00014661079342659467,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13081042.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00034181721275672317,
+ "skip_count": 0.0,
+ "step": 8112,
+ "text_loss": 0.7349393963813782
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.09392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0001463918983336474,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 13084151.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01406828872859478,
+ "skip_count": 2.0,
+ "step": 8114,
+ "text_loss": 0.3122454285621643
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017822265625,
+ "learning_rate": 0.00014617313874399173,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13086998.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002714085392653942,
+ "skip_count": 0.0,
+ "step": 8116,
+ "text_loss": 0.6545852422714233
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.00014595451474145677,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13090017.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0073202489875257015,
+ "skip_count": 0.0,
+ "step": 8118,
+ "text_loss": 0.5487201809883118
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.00014573602640981947,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13093651.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000667977670673281,
+ "skip_count": 0.0,
+ "step": 8120,
+ "text_loss": 0.672166109085083
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00014551767383280535,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13097139.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020584615413099527,
+ "skip_count": 0.0,
+ "step": 8122,
+ "text_loss": 0.1996239423751831
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 38.14088641033167,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.00014529945709408726,
+ "loss": 0.0069,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 13100493.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013855135068297386,
+ "skip_count": 3.0,
+ "step": 8124,
+ "text_loss": 0.4099486768245697
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.0001450813762772863,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13103488.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014984552981331944,
+ "skip_count": 0.0,
+ "step": 8126,
+ "text_loss": 0.6307108402252197
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 38.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.00014486343146597152,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13106445.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00430954247713089,
+ "skip_count": 0.0,
+ "step": 8128,
+ "text_loss": 0.6226127743721008
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07177734375,
+ "learning_rate": 0.00014464562274365972,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13109258.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003711461555212736,
+ "skip_count": 1.0,
+ "step": 8130,
+ "text_loss": 0.17819052934646606
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.17845611975345,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00014442795019381567,
+ "loss": 0.0064,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 13114206.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015719098970294,
+ "skip_count": 1.0,
+ "step": 8132,
+ "text_loss": 0.28450697660446167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.00014421041389985184,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13117351.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013113922905176878,
+ "skip_count": 0.0,
+ "step": 8134,
+ "text_loss": 0.310830682516098
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 38.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 0.00014399301394512858,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 13120228.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001965439412742853,
+ "skip_count": 1.0,
+ "step": 8136,
+ "text_loss": 0.8635116815567017
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.00014377575041295393,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 13123380.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004898902028799057,
+ "skip_count": 2.0,
+ "step": 8138,
+ "text_loss": 0.5302467346191406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.21602582917523,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0179443359375,
+ "learning_rate": 0.0001435586233865836,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13126875.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00031845085322856903,
+ "skip_count": 0.0,
+ "step": 8140,
+ "text_loss": 0.5913560390472412
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 38.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.0001433416329492213,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 13129563.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00298812473192811,
+ "skip_count": 1.0,
+ "step": 8142,
+ "text_loss": 0.5153398513793945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00014312477918401807,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13132608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026608197949826717,
+ "skip_count": 1.0,
+ "step": 8144,
+ "text_loss": 0.4554155766963959
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 38.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.00014290806217407272,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 13136204.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0027651884593069553,
+ "skip_count": 1.0,
+ "step": 8146,
+ "text_loss": 0.6349515318870544
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00014269148200243148,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13138895.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006579195614904165,
+ "skip_count": 0.0,
+ "step": 8148,
+ "text_loss": 0.4629364013671875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.26298796595245,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.00014247503875208846,
+ "loss": 0.0059,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 13142500.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023065708577632904,
+ "skip_count": 0.0,
+ "step": 8150,
+ "text_loss": 0.4962928593158722
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 0.00014225873250598496,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13146203.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007397830951958895,
+ "skip_count": 1.0,
+ "step": 8152,
+ "text_loss": 0.3225953280925751
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00014204256334700988,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13149517.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004839105997234583,
+ "skip_count": 1.0,
+ "step": 8154,
+ "text_loss": 0.18435558676719666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 38.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.00014182653135799995,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13152643.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028303388971835375,
+ "skip_count": 4.0,
+ "step": 8156,
+ "text_loss": 0.5836900472640991
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0001416106366217389,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13155213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004012314020656049,
+ "skip_count": 0.0,
+ "step": 8158,
+ "text_loss": 0.3723861575126648
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 20.0,
+ "epoch": 38.30995010272967,
+ "f1_execute": 0.9714285731315613,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 0.0001413948792209579,
+ "loss": 0.0065,
+ "macro_f1": 0.8793651461601257,
+ "num_tokens": 13158440.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04377155378460884,
+ "skip_count": 9.0,
+ "step": 8160,
+ "text_loss": 0.32476910948753357
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.0001411792592383357,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13162651.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011163362069055438,
+ "skip_count": 0.0,
+ "step": 8162,
+ "text_loss": 0.4890389144420624
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.32873495744057,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.00014096377675649823,
+ "loss": 0.0055,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 13165406.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012117774225771427,
+ "skip_count": 1.0,
+ "step": 8164,
+ "text_loss": 0.7763246893882751
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 38.33812738479601,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00014074843185801883,
+ "loss": 0.004,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 13168402.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.009951545856893063,
+ "skip_count": 2.0,
+ "step": 8166,
+ "text_loss": 0.5038266777992249
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 38.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.00014053322462541802,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 13171423.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0021372761111706495,
+ "skip_count": 1.0,
+ "step": 8168,
+ "text_loss": 0.5634724497795105
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.00014031815514116354,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13174713.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007417177548632026,
+ "skip_count": 0.0,
+ "step": 8170,
+ "text_loss": 0.4009707272052765
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 38.36630466686234,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 0.00014010322348767057,
+ "loss": 0.0077,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 13178012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01619168184697628,
+ "skip_count": 3.0,
+ "step": 8172,
+ "text_loss": 0.29182371497154236
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.00013988842974730137,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13181096.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037969043478369713,
+ "skip_count": 0.0,
+ "step": 8174,
+ "text_loss": 0.275851845741272
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.385089521573235,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.00013967377400236515,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13184116.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007759644067846239,
+ "skip_count": 0.0,
+ "step": 8176,
+ "text_loss": 0.7569663524627686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.00013945925633511848,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13187319.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002708743792027235,
+ "skip_count": 0.0,
+ "step": 8178,
+ "text_loss": 0.4733831286430359
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.00013924487682776492,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13190796.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005060714902356267,
+ "skip_count": 0.0,
+ "step": 8180,
+ "text_loss": 0.5663171410560608
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.413266803639566,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0001390306355624551,
+ "loss": 0.0049,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 13193705.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02932601235806942,
+ "skip_count": 1.0,
+ "step": 8182,
+ "text_loss": 0.30700045824050903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0001388165326212867,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13196393.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011637522839009762,
+ "skip_count": 0.0,
+ "step": 8184,
+ "text_loss": 0.6897354125976562
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.43205165835045,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 0.00013860256808630427,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13199526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017184355529025197,
+ "skip_count": 0.0,
+ "step": 8186,
+ "text_loss": 0.6246579885482788
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.441444085705896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.00013838874203949954,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13202963.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026622721925377846,
+ "skip_count": 0.0,
+ "step": 8188,
+ "text_loss": 0.506066083908081
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.00013817505456281099,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13207408.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000543750764336437,
+ "skip_count": 0.0,
+ "step": 8190,
+ "text_loss": 0.5192428231239319
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 0.0001379615057381241,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13211073.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010060713393613696,
+ "skip_count": 0.0,
+ "step": 8192,
+ "text_loss": 0.5640166401863098
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 0.00013774809564727104,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13214203.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005152868572622538,
+ "skip_count": 2.0,
+ "step": 8194,
+ "text_loss": 0.8643819689750671
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.47901379512768,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 0.0001375348243720312,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13217748.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017722113989293575,
+ "skip_count": 2.0,
+ "step": 8196,
+ "text_loss": 0.40500834584236145
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 0.0001373216919941304,
+ "loss": 0.005,
+ "macro_f1": 1.0,
+ "num_tokens": 13221341.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00999271310865879,
+ "skip_count": 3.0,
+ "step": 8198,
+ "text_loss": 0.2317391037940979
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.00013710869859524143,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13224288.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016836341237649322,
+ "skip_count": 0.0,
+ "step": 8200,
+ "text_loss": 0.31873467564582825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 0.00013689584425698376,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13227342.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002255793660879135,
+ "skip_count": 0.0,
+ "step": 8202,
+ "text_loss": 0.13513202965259552
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 38.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 0.0001366831290609235,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 13230912.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0062925987876951694,
+ "skip_count": 4.0,
+ "step": 8204,
+ "text_loss": 0.3692396581172943
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 38.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00013647055308857353,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13233961.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0020471401512622833,
+ "skip_count": 0.0,
+ "step": 8206,
+ "text_loss": 0.5655510425567627
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.0001362581164213934,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13237170.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009666495025157928,
+ "skip_count": 0.0,
+ "step": 8208,
+ "text_loss": 0.720582902431488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 0.00013604581914078922,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13241020.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006306356517598033,
+ "skip_count": 0.0,
+ "step": 8210,
+ "text_loss": 0.5686481595039368
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 38.55415321397123,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 0.00013583366132811374,
+ "loss": 0.0058,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 13244491.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.016230134293437004,
+ "skip_count": 0.0,
+ "step": 8212,
+ "text_loss": 0.55678790807724
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.00013562164306466624,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13247551.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003904943587258458,
+ "skip_count": 2.0,
+ "step": 8214,
+ "text_loss": 0.6521575450897217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.57293806868213,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.00013540976443169244,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13250863.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002239734400063753,
+ "skip_count": 1.0,
+ "step": 8216,
+ "text_loss": 0.29757481813430786
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.00013519802551038452,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13254215.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004978829529136419,
+ "skip_count": 2.0,
+ "step": 8218,
+ "text_loss": 0.30598193407058716
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00013498642638188157,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13257269.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0040260558016598225,
+ "skip_count": 0.0,
+ "step": 8220,
+ "text_loss": 0.39327144622802734
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 0.00013477496712726862,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13260573.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002124674618244171,
+ "skip_count": 0.0,
+ "step": 8222,
+ "text_loss": 0.38342708349227905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00013456364782757718,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13263684.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00087209593039006,
+ "skip_count": 0.0,
+ "step": 8224,
+ "text_loss": 0.6338301301002502
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 38.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00013435246856378526,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13266879.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003183641703799367,
+ "skip_count": 0.0,
+ "step": 8226,
+ "text_loss": 0.6073583364486694
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.629292632814796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0169677734375,
+ "learning_rate": 0.00013414142941681718,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13270679.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001859338372014463,
+ "skip_count": 0.0,
+ "step": 8228,
+ "text_loss": 0.5427029132843018
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.0001339305304675435,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13273275.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000655558833386749,
+ "skip_count": 0.0,
+ "step": 8230,
+ "text_loss": 0.29442915320396423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 0.00013371977179678113,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13276205.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011499621905386448,
+ "skip_count": 0.0,
+ "step": 8232,
+ "text_loss": 0.5601125359535217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00013350915348529313,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13279242.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019823790062218904,
+ "skip_count": 0.0,
+ "step": 8234,
+ "text_loss": 0.43674135208129883
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 38.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 0.00013329867561378888,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13282531.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005772443953901529,
+ "skip_count": 3.0,
+ "step": 8236,
+ "text_loss": 0.4838809072971344
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 0.00013308833826292395,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13286219.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038314659614115953,
+ "skip_count": 2.0,
+ "step": 8238,
+ "text_loss": 0.5002569556236267
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 38.685647196947464,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 0.00013287814151329987,
+ "loss": 0.0075,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 13290348.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04819172993302345,
+ "skip_count": 4.0,
+ "step": 8240,
+ "text_loss": 0.3099883198738098
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.00013266808544546438,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13293644.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010334883816540241,
+ "skip_count": 2.0,
+ "step": 8242,
+ "text_loss": 0.17672912776470184
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.00013245817013991164,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13296721.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00162201386410743,
+ "skip_count": 0.0,
+ "step": 8244,
+ "text_loss": 0.7664286494255066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.00013224839567708142,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13299704.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0039452011696994305,
+ "skip_count": 0.0,
+ "step": 8246,
+ "text_loss": 0.1827820986509323
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 38.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 0.00013203876213735972,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 13302553.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006701917387545109,
+ "skip_count": 7.0,
+ "step": 8248,
+ "text_loss": 0.6020278930664062
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.73260933372468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.0001318292696010785,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13305875.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00968079548329115,
+ "skip_count": 2.0,
+ "step": 8250,
+ "text_loss": 0.2693248987197876
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 38.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.00013161991814851571,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 13309115.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.008890608325600624,
+ "skip_count": 2.0,
+ "step": 8252,
+ "text_loss": 0.6325297355651855
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 38.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 0.00013141070785989517,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 13312219.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00825794693082571,
+ "skip_count": 4.0,
+ "step": 8254,
+ "text_loss": 0.284396767616272
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00013120163881538677,
+ "loss": 0.0095,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13315214.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003378969384357333,
+ "skip_count": 1.0,
+ "step": 8256,
+ "text_loss": 0.20296992361545563
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.77017904314646,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.00013099271109510603,
+ "loss": 0.005,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 13319117.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0164186954498291,
+ "skip_count": 0.0,
+ "step": 8258,
+ "text_loss": 0.21940068900585175
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 38.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 0.0001307839247791145,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13321631.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0053979759104549885,
+ "skip_count": 3.0,
+ "step": 8260,
+ "text_loss": 0.19442199170589447
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 0.00013057527994741946,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13324759.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024567479267716408,
+ "skip_count": 0.0,
+ "step": 8262,
+ "text_loss": 0.5528824925422668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.0001303667766799741,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13327554.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002819873159751296,
+ "skip_count": 1.0,
+ "step": 8264,
+ "text_loss": 0.4418395757675171
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.00013015841505667703,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13331838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030280952341854572,
+ "skip_count": 1.0,
+ "step": 8266,
+ "text_loss": 0.5263079404830933
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 38.81714117992369,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0185546875,
+ "learning_rate": 0.0001299501951573731,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13334968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001774887670762837,
+ "skip_count": 4.0,
+ "step": 8268,
+ "text_loss": 0.47985130548477173
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00012974211706185247,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13338052.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007027842104434967,
+ "skip_count": 1.0,
+ "step": 8270,
+ "text_loss": 0.6588287949562073
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00012953418084985107,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13341653.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026854060124605894,
+ "skip_count": 1.0,
+ "step": 8272,
+ "text_loss": 0.43156498670578003
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.00012932638660105038,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13345173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033325920812785625,
+ "skip_count": 0.0,
+ "step": 8274,
+ "text_loss": 0.1679086685180664
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 0.00012911873439507766,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13348635.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016183287370949984,
+ "skip_count": 0.0,
+ "step": 8276,
+ "text_loss": 0.5907418131828308
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 0.00012891122431150549,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13351120.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0049970983527600765,
+ "skip_count": 1.0,
+ "step": 8278,
+ "text_loss": 0.5437678694725037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 0.00012870385642985222,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13353774.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027123154141008854,
+ "skip_count": 0.0,
+ "step": 8280,
+ "text_loss": 0.5742796659469604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.00012849663082958158,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13358236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0062842960469424725,
+ "skip_count": 0.0,
+ "step": 8282,
+ "text_loss": 0.2340863049030304
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.00012828954759010265,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13360994.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006564505747519433,
+ "skip_count": 0.0,
+ "step": 8284,
+ "text_loss": 0.45432794094085693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.0001280826067907705,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13363665.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001298630959354341,
+ "skip_count": 0.0,
+ "step": 8286,
+ "text_loss": 0.7439755201339722
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00012787580851088493,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13367412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00464112963527441,
+ "skip_count": 0.0,
+ "step": 8288,
+ "text_loss": 0.2854461669921875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.92045788083358,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 0.0001276691528296916,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13370745.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006090773968026042,
+ "skip_count": 0.0,
+ "step": 8290,
+ "text_loss": 0.6663011312484741
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 0.00012746263982638123,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13373396.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038922233507037163,
+ "skip_count": 0.0,
+ "step": 8292,
+ "text_loss": 0.3858443796634674
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.93924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.00012725626958009007,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13376172.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016941255889832973,
+ "skip_count": 0.0,
+ "step": 8294,
+ "text_loss": 0.4758119285106659
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 38.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.0001270500421698994,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13379002.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001703770598396659,
+ "skip_count": 0.0,
+ "step": 8296,
+ "text_loss": 0.7464606165885925
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 38.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 0.00012684395767483626,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13382221.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001474690856412053,
+ "skip_count": 1.0,
+ "step": 8298,
+ "text_loss": 0.37309199571609497
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 38.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00012663801617387245,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13385276.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004561704583466053,
+ "skip_count": 3.0,
+ "step": 8300,
+ "text_loss": 0.43284836411476135
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 38.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 0.00012643221774592518,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 13388321.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005136100109666586,
+ "skip_count": 1.0,
+ "step": 8302,
+ "text_loss": 0.669730007648468
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 38.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 0.00012622656246985675,
+ "loss": 0.0101,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13391222.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028521555941551924,
+ "skip_count": 0.0,
+ "step": 8304,
+ "text_loss": 0.16773155331611633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 38.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 0.00012602105042447471,
+ "loss": 0.0087,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13395297.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033424890134483576,
+ "skip_count": 2.0,
+ "step": 8306,
+ "text_loss": 0.1650846153497696
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.0001258156816885316,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13398482.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012481207959353924,
+ "skip_count": 0.0,
+ "step": 8308,
+ "text_loss": 0.37225499749183655
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 39.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.00012561045634072515,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13402199.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006243644282221794,
+ "skip_count": 3.0,
+ "step": 8310,
+ "text_loss": 0.16000206768512726
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00012540537445969807,
+ "loss": 0.0087,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13404950.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004267443902790546,
+ "skip_count": 2.0,
+ "step": 8312,
+ "text_loss": 0.400174081325531
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.00012520043612403815,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13407883.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005013707559555769,
+ "skip_count": 2.0,
+ "step": 8314,
+ "text_loss": 0.1331731230020523
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 39.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00012499564141227798,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13410563.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00463570561259985,
+ "skip_count": 0.0,
+ "step": 8316,
+ "text_loss": 0.5098661184310913
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 39.05165835045494,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 0.0001247909904028956,
+ "loss": 0.0078,
+ "macro_f1": 1.0,
+ "num_tokens": 13413730.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007066591177135706,
+ "skip_count": 1.0,
+ "step": 8318,
+ "text_loss": 0.8059925436973572
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 39.061050777810394,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00012458648317431348,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13416425.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004210594110190868,
+ "skip_count": 3.0,
+ "step": 8320,
+ "text_loss": 0.6559522151947021
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.0001243821198048992,
+ "loss": 0.0045,
+ "macro_f1": 1.0,
+ "num_tokens": 13419851.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005613257177174091,
+ "skip_count": 2.0,
+ "step": 8322,
+ "text_loss": 0.2783811688423157
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.00012417790037296523,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13422588.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00233642989769578,
+ "skip_count": 1.0,
+ "step": 8324,
+ "text_loss": 0.7659147381782532
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.00012397382495676874,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13425275.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013295465614646673,
+ "skip_count": 0.0,
+ "step": 8326,
+ "text_loss": 0.5693745017051697
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 39.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 0.0001237698936345119,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 13428314.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005712272133678198,
+ "skip_count": 1.0,
+ "step": 8328,
+ "text_loss": 0.8581340909004211
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.00012356610648434153,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13431453.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015835616504773498,
+ "skip_count": 0.0,
+ "step": 8330,
+ "text_loss": 0.1395341008901596
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.117405341943055,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 0.00012336246358434928,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13434566.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012973316479474306,
+ "skip_count": 0.0,
+ "step": 8332,
+ "text_loss": 0.7125005125999451
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.126797769298506,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.00012315896501257145,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13438056.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005822008824907243,
+ "skip_count": 0.0,
+ "step": 8334,
+ "text_loss": 0.7730510234832764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.00012295561084698915,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13441390.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00547185679897666,
+ "skip_count": 1.0,
+ "step": 8336,
+ "text_loss": 0.3927873373031616
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 0.000122752401165528,
+ "loss": 0.0022,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13443864.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011191967641934752,
+ "skip_count": 0.0,
+ "step": 8338,
+ "text_loss": 0.3996548354625702
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.00012254933604605828,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13447070.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005196621641516685,
+ "skip_count": 0.0,
+ "step": 8340,
+ "text_loss": 0.5597847104072571
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 0.00012234641556639508,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13450522.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003857341594994068,
+ "skip_count": 2.0,
+ "step": 8342,
+ "text_loss": 0.14400488138198853
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.17375990607572,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00012214363980429793,
+ "loss": 0.0056,
+ "macro_f1": 1.0,
+ "num_tokens": 13453578.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006664265412837267,
+ "skip_count": 3.0,
+ "step": 8344,
+ "text_loss": 0.27675092220306396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0595703125,
+ "learning_rate": 0.00012194100883747078,
+ "loss": 0.0095,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13456480.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003549816319718957,
+ "skip_count": 0.0,
+ "step": 8346,
+ "text_loss": 0.21776801347732544
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.19254476078662,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 0.00012173852274356217,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 13459859.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00446992926299572,
+ "skip_count": 3.0,
+ "step": 8348,
+ "text_loss": 0.1828736811876297
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 0.00012153618160016527,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13463104.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024826989974826574,
+ "skip_count": 1.0,
+ "step": 8350,
+ "text_loss": 0.15649555623531342
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0186767578125,
+ "learning_rate": 0.0001213339854848175,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13467051.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021385846193879843,
+ "skip_count": 1.0,
+ "step": 8352,
+ "text_loss": 0.49281737208366394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 0.00012113193447500081,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13470411.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014382716035470366,
+ "skip_count": 1.0,
+ "step": 8354,
+ "text_loss": 0.5984349846839905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 0.00012093002864814151,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13474666.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008536498062312603,
+ "skip_count": 1.0,
+ "step": 8356,
+ "text_loss": 0.2851131856441498
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 0.00012072826808161036,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13477754.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027286717668175697,
+ "skip_count": 0.0,
+ "step": 8358,
+ "text_loss": 0.5987376570701599
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.248899324919286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 0.0001205266528527223,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13481151.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002780565759167075,
+ "skip_count": 1.0,
+ "step": 8360,
+ "text_loss": 0.1847199648618698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00012032518303873674,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13484050.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006186611135490239,
+ "skip_count": 0.0,
+ "step": 8362,
+ "text_loss": 0.6229772567749023
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 39.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 0.00012012385871685716,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13488551.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00956071075052023,
+ "skip_count": 5.0,
+ "step": 8364,
+ "text_loss": 0.2810790538787842
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 0.00011992267996423162,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13491420.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008410792797803879,
+ "skip_count": 2.0,
+ "step": 8366,
+ "text_loss": 0.20509617030620575
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.28646903434106,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 0.00011972164685795212,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13494736.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00762166129425168,
+ "skip_count": 1.0,
+ "step": 8368,
+ "text_loss": 0.24739402532577515
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.295861461696504,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.00011952075947505486,
+ "loss": 0.0051,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 13498363.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010674391873180866,
+ "skip_count": 1.0,
+ "step": 8370,
+ "text_loss": 0.31931644678115845
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 39.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 0.0001193200178925204,
+ "loss": 0.0036,
+ "macro_f1": 1.0,
+ "num_tokens": 13501029.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0041843741200864315,
+ "skip_count": 1.0,
+ "step": 8372,
+ "text_loss": 0.5103049278259277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00011911942218727312,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13503854.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006344785797409713,
+ "skip_count": 0.0,
+ "step": 8374,
+ "text_loss": 0.4914432764053345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 0.00011891897243618183,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13508316.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003527739318087697,
+ "skip_count": 0.0,
+ "step": 8376,
+ "text_loss": 0.5317551493644714
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 0.00011871866871605913,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13512603.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001071247854270041,
+ "skip_count": 0.0,
+ "step": 8378,
+ "text_loss": 0.6693558096885681
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 0.00011851851110366185,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13515928.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000924977008253336,
+ "skip_count": 1.0,
+ "step": 8380,
+ "text_loss": 0.8004939556121826
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.35221602582917,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 0.0001183184996756908,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13518548.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017637151759117842,
+ "skip_count": 0.0,
+ "step": 8382,
+ "text_loss": 0.5012105107307434
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 39.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.00011811863450879063,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13522155.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0011129514314234257,
+ "skip_count": 0.0,
+ "step": 8384,
+ "text_loss": 0.3866073489189148
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 39.371000880540066,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 0.00011791891567955009,
+ "loss": 0.0046,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 13525352.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.042801812291145325,
+ "skip_count": 4.0,
+ "step": 8386,
+ "text_loss": 0.18817944824695587
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018798828125,
+ "learning_rate": 0.00011771934326450173,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13528537.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006869474309496582,
+ "skip_count": 0.0,
+ "step": 8388,
+ "text_loss": 0.6407818794250488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 0.00011751991734012229,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13531650.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008001072565093637,
+ "skip_count": 0.0,
+ "step": 8390,
+ "text_loss": 0.5149344205856323
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.3991781626064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 0.00011732063798283204,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13535071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006921148742549121,
+ "skip_count": 0.0,
+ "step": 8392,
+ "text_loss": 0.5906356573104858
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 0.00011712150526899523,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13537741.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005221226718276739,
+ "skip_count": 2.0,
+ "step": 8394,
+ "text_loss": 0.3381146192550659
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 39.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 0.00011692251927491987,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 13541189.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0023983579594641924,
+ "skip_count": 1.0,
+ "step": 8396,
+ "text_loss": 0.7345486283302307
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.427355444672735,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 0.00011672368007685774,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 13545210.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005362956319004297,
+ "skip_count": 2.0,
+ "step": 8398,
+ "text_loss": 0.6522865295410156
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 0.00011652498775100445,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13548260.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002955642296001315,
+ "skip_count": 0.0,
+ "step": 8400,
+ "text_loss": 0.3200102150440216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.00011632644237349927,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13551519.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001079231034964323,
+ "skip_count": 0.0,
+ "step": 8402,
+ "text_loss": 0.7251807451248169
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 39.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 0.00011612804402042509,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13555241.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013860360719263554,
+ "skip_count": 0.0,
+ "step": 8404,
+ "text_loss": 0.159539595246315
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 25.0,
+ "epoch": 39.46492515409451,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.054931640625,
+ "learning_rate": 0.00011592979276780857,
+ "loss": 0.0055,
+ "macro_f1": 0.9555556178092957,
+ "num_tokens": 13558389.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017025530338287354,
+ "skip_count": 5.0,
+ "step": 8406,
+ "text_loss": 0.5154430270195007
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 0.00011573168869162004,
+ "loss": 0.0037,
+ "macro_f1": 1.0,
+ "num_tokens": 13561237.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007349071092903614,
+ "skip_count": 2.0,
+ "step": 8408,
+ "text_loss": 0.20888492465019226
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 0.00011553373186777327,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 13564080.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003303215140476823,
+ "skip_count": 2.0,
+ "step": 8410,
+ "text_loss": 0.21808166801929474
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.49310243616085,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 0.00011533592237212558,
+ "loss": 0.0035,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13566649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005856195464730263,
+ "skip_count": 1.0,
+ "step": 8412,
+ "text_loss": 0.28037169575691223
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 0.0001151382602804782,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13570015.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007515792385675013,
+ "skip_count": 0.0,
+ "step": 8414,
+ "text_loss": 0.8517835736274719
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 0.00011494074566857549,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13573262.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0043421462178230286,
+ "skip_count": 0.0,
+ "step": 8416,
+ "text_loss": 0.27418580651283264
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 0.00011474337861210544,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 13576104.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0108594736084342,
+ "skip_count": 2.0,
+ "step": 8418,
+ "text_loss": 0.4724268317222595
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.53067214558262,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 0.00011454615918669948,
+ "loss": 0.008,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 13579138.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04178442806005478,
+ "skip_count": 0.0,
+ "step": 8420,
+ "text_loss": 0.4065103530883789
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.00011434908746793238,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13582818.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004756448790431023,
+ "skip_count": 2.0,
+ "step": 8422,
+ "text_loss": 0.2932167947292328
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00011415216353132252,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13586261.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033427432645112276,
+ "skip_count": 1.0,
+ "step": 8424,
+ "text_loss": 0.47670233249664307
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 0.0001139553874523313,
+ "loss": 0.003,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13589765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006597383879125118,
+ "skip_count": 1.0,
+ "step": 8426,
+ "text_loss": 0.31448885798454285
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.5682418550044,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 0.00011375875930636403,
+ "loss": 0.005,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 13592741.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011398134753108025,
+ "skip_count": 1.0,
+ "step": 8428,
+ "text_loss": 0.17429469525814056
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 39.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 0.00011356227916876877,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13595763.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038021153304725885,
+ "skip_count": 0.0,
+ "step": 8430,
+ "text_loss": 0.6043882966041565
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 0.00011336594711483712,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13598274.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00044314167462289333,
+ "skip_count": 0.0,
+ "step": 8432,
+ "text_loss": 0.3818575143814087
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.00011316976321980388,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13601510.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001956664025783539,
+ "skip_count": 0.0,
+ "step": 8434,
+ "text_loss": 0.48483794927597046
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.0001129737275588471,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13604410.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005170237272977829,
+ "skip_count": 0.0,
+ "step": 8436,
+ "text_loss": 0.21759741008281708
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.61520399178163,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.00011277784020708803,
+ "loss": 0.0045,
+ "macro_f1": 1.0,
+ "num_tokens": 13607207.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002223948948085308,
+ "skip_count": 2.0,
+ "step": 8438,
+ "text_loss": 0.6877034306526184
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 0.00011258210123959089,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13610981.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017733481945469975,
+ "skip_count": 1.0,
+ "step": 8440,
+ "text_loss": 0.7250658273696899
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 39.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00011238651073136358,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 13614194.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00155889883171767,
+ "skip_count": 1.0,
+ "step": 8442,
+ "text_loss": 0.6742649078369141
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 0.00011219106875735652,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13618011.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011234934208914638,
+ "skip_count": 0.0,
+ "step": 8444,
+ "text_loss": 0.8105526566505432
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 39.65277370120341,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 0.00011199577539246347,
+ "loss": 0.0055,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 13621852.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02346695400774479,
+ "skip_count": 1.0,
+ "step": 8446,
+ "text_loss": 0.22664032876491547
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 0.0001118006307115213,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13624711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012819754891097546,
+ "skip_count": 2.0,
+ "step": 8448,
+ "text_loss": 0.31696105003356934
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 39.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 0.00011160563478930969,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13627561.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0060531035996973515,
+ "skip_count": 2.0,
+ "step": 8450,
+ "text_loss": 0.2935826778411865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 0.00011141078770055152,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13630445.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004288572818040848,
+ "skip_count": 0.0,
+ "step": 8452,
+ "text_loss": 0.5720692873001099
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 0.00011121608951991252,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13633496.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005682424642145634,
+ "skip_count": 1.0,
+ "step": 8454,
+ "text_loss": 0.28466710448265076
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.699735837980626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00011102154032200146,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13635938.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009555552969686687,
+ "skip_count": 0.0,
+ "step": 8456,
+ "text_loss": 0.47744694352149963
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00011082714018136985,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13638863.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023627313785254955,
+ "skip_count": 0.0,
+ "step": 8458,
+ "text_loss": 0.5212090611457825
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.71852069269152,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0189208984375,
+ "learning_rate": 0.00011063288917251235,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 13641874.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00791920255869627,
+ "skip_count": 2.0,
+ "step": 8460,
+ "text_loss": 0.31359919905662537
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 39.72791312004696,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00011043878736986607,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 13644970.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0033252311404794455,
+ "skip_count": 1.0,
+ "step": 8462,
+ "text_loss": 0.33621230721473694
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.73730554740241,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 0.00011024483484781144,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 13648103.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005567418877035379,
+ "skip_count": 2.0,
+ "step": 8464,
+ "text_loss": 0.48708856105804443
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 0.00011005103168067143,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13651085.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00047958645154722035,
+ "skip_count": 0.0,
+ "step": 8466,
+ "text_loss": 0.4151248633861542
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 0.00010985737794271161,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13654175.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009806647431105375,
+ "skip_count": 0.0,
+ "step": 8468,
+ "text_loss": 0.7322396039962769
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 0.00010966387370814057,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13657058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009820344857871532,
+ "skip_count": 0.0,
+ "step": 8470,
+ "text_loss": 0.6350769400596619
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 39.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00010947051905110945,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13660203.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.002065197564661503,
+ "skip_count": 0.0,
+ "step": 8472,
+ "text_loss": 0.6025850176811218
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00010927731404571211,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13664021.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009939799783751369,
+ "skip_count": 0.0,
+ "step": 8474,
+ "text_loss": 0.3040087819099426
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 0.0001090842587659851,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13667055.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008282510680146515,
+ "skip_count": 0.0,
+ "step": 8476,
+ "text_loss": 0.7306531667709351
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0001088913532859076,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13669940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008349589770659804,
+ "skip_count": 0.0,
+ "step": 8478,
+ "text_loss": 0.32041916251182556
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.00010869859767940133,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13672955.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007435405277647078,
+ "skip_count": 0.0,
+ "step": 8480,
+ "text_loss": 0.5343614816665649
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 0.00010850599202033051,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13676173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002763360273092985,
+ "skip_count": 0.0,
+ "step": 8482,
+ "text_loss": 0.6071668267250061
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 0.00010831353638250213,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13680121.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00202178000472486,
+ "skip_count": 0.0,
+ "step": 8484,
+ "text_loss": 0.42487844824790955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.8406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.00010812123083966535,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13683504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0056348275393247604,
+ "skip_count": 1.0,
+ "step": 8486,
+ "text_loss": 0.17678795754909515
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 0.00010792907546551229,
+ "loss": 0.0079,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13686870.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003331703832373023,
+ "skip_count": 0.0,
+ "step": 8488,
+ "text_loss": 0.32238465547561646
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 0.00010773707033367708,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13690429.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011620528530329466,
+ "skip_count": 0.0,
+ "step": 8490,
+ "text_loss": 0.4141998291015625
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 39.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 0.00010754521551773655,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13693747.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005236583761870861,
+ "skip_count": 0.0,
+ "step": 8492,
+ "text_loss": 0.557283878326416
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 39.878191957734074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 0.00010735351109120972,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13696837.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005507425405085087,
+ "skip_count": 6.0,
+ "step": 8494,
+ "text_loss": 0.7394861578941345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.887584385089525,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 0.00010716195712755821,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13700080.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008621517335996032,
+ "skip_count": 0.0,
+ "step": 8496,
+ "text_loss": 0.7079368233680725
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 0.00010697055370018572,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13704088.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004489862476475537,
+ "skip_count": 0.0,
+ "step": 8498,
+ "text_loss": 0.5672308206558228
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 0.00010677930088243847,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 13707391.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009171495214104652,
+ "skip_count": 2.0,
+ "step": 8500,
+ "text_loss": 0.6851600408554077
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 0.00010658819874760495,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13711238.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016714727971702814,
+ "skip_count": 1.0,
+ "step": 8502,
+ "text_loss": 0.7102733850479126
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 0.00010639724736891576,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13714553.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012916292762383819,
+ "skip_count": 0.0,
+ "step": 8504,
+ "text_loss": 0.4234752953052521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 0.0001062064468195439,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13718046.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005265420186333358,
+ "skip_count": 0.0,
+ "step": 8506,
+ "text_loss": 0.5576326251029968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 0.0001060157971726045,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13720687.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023503501433879137,
+ "skip_count": 1.0,
+ "step": 8508,
+ "text_loss": 0.5259605646133423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.95333137657764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 0.00010582529850115469,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13723946.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007593657355755568,
+ "skip_count": 0.0,
+ "step": 8510,
+ "text_loss": 0.3795129954814911
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05419921875,
+ "learning_rate": 0.00010563495087819419,
+ "loss": 0.0077,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13727589.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005672222469002008,
+ "skip_count": 0.0,
+ "step": 8512,
+ "text_loss": 0.685897946357727
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 39.972116231288524,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 0.00010544475437666445,
+ "loss": 0.0049,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 13730579.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.01708158478140831,
+ "skip_count": 2.0,
+ "step": 8514,
+ "text_loss": 0.8044925332069397
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 39.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 0.00010525470906944917,
+ "loss": 0.0113,
+ "macro_f1": 1.0,
+ "num_tokens": 13733563.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010253295302391052,
+ "skip_count": 2.0,
+ "step": 8516,
+ "text_loss": 0.3999447524547577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 39.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 0.00010506481502937398,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13736645.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004293019883334637,
+ "skip_count": 0.0,
+ "step": 8518,
+ "text_loss": 0.3128681778907776
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 40.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 0.00010487507232920674,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 13740080.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030790462624281645,
+ "skip_count": 1.0,
+ "step": 8520,
+ "text_loss": 0.39142900705337524
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.00939242735544,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 0.00010468548104165709,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13743085.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007342757890000939,
+ "skip_count": 0.0,
+ "step": 8522,
+ "text_loss": 0.7652465105056763
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 0.00010449604123937689,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13746513.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030496022664010525,
+ "skip_count": 0.0,
+ "step": 8524,
+ "text_loss": 0.6259746551513672
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 40.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 0.00010430675299495973,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 13749391.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010060965083539486,
+ "skip_count": 1.0,
+ "step": 8526,
+ "text_loss": 0.2266668826341629
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 0.0001041176163809413,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 13752449.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002234962536022067,
+ "skip_count": 2.0,
+ "step": 8528,
+ "text_loss": 0.9742465019226074
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 0.00010392863146979903,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13755572.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003572004789020866,
+ "skip_count": 0.0,
+ "step": 8530,
+ "text_loss": 0.5757357478141785
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.00010373979833395242,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13759198.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011161680333316326,
+ "skip_count": 0.0,
+ "step": 8532,
+ "text_loss": 0.6268131136894226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 0.00010355111704576236,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13761914.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002053353004157543,
+ "skip_count": 0.0,
+ "step": 8534,
+ "text_loss": 0.22388778626918793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 0.00010336258767753232,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13765371.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003634720342233777,
+ "skip_count": 2.0,
+ "step": 8536,
+ "text_loss": 0.5802993178367615
+ },
+ {
+ "acc_repeat": 0.800000011920929,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.084531846199,
+ "f1_execute": 0.9729729890823364,
+ "f1_repeat": 0.888888955116272,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 0.00010317421030150692,
+ "loss": 0.0072,
+ "macro_f1": 0.9539539813995361,
+ "num_tokens": 13768276.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.053806692361831665,
+ "skip_count": 5.0,
+ "step": 8538,
+ "text_loss": 0.10888377577066422
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.09392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.07275390625,
+ "learning_rate": 0.00010298598498987266,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13772369.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00501362606883049,
+ "skip_count": 1.0,
+ "step": 8540,
+ "text_loss": 0.5794995427131653
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 0.00010279791181475795,
+ "loss": 0.0082,
+ "macro_f1": 1.0,
+ "num_tokens": 13776595.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002230882178992033,
+ "skip_count": 2.0,
+ "step": 8542,
+ "text_loss": 0.5503702163696289
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 0.00010260999084823264,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13779993.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012205395614728332,
+ "skip_count": 0.0,
+ "step": 8544,
+ "text_loss": 0.7248672842979431
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 0.00010242222216230856,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13782683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003966465883422643,
+ "skip_count": 0.0,
+ "step": 8546,
+ "text_loss": 0.7446619272232056
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 0.00010223460582893889,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13785534.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004968565888702869,
+ "skip_count": 1.0,
+ "step": 8548,
+ "text_loss": 0.22457796335220337
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 0.00010204714192001863,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13788608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033054195810109377,
+ "skip_count": 2.0,
+ "step": 8550,
+ "text_loss": 0.418837308883667
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 0.00010185983050738434,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13791553.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001166256028227508,
+ "skip_count": 0.0,
+ "step": 8552,
+ "text_loss": 0.4060337543487549
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 0.00010167267166281402,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13795304.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003844029037281871,
+ "skip_count": 2.0,
+ "step": 8554,
+ "text_loss": 0.17412975430488586
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 0.00010148566545802718,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13798445.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033507589250802994,
+ "skip_count": 0.0,
+ "step": 8556,
+ "text_loss": 0.24744336307048798
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 0.00010129881196468527,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13801338.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004076482728123665,
+ "skip_count": 0.0,
+ "step": 8558,
+ "text_loss": 0.6542767882347107
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01806640625,
+ "learning_rate": 0.00010111211125439069,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13804157.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005654391716234386,
+ "skip_count": 0.0,
+ "step": 8560,
+ "text_loss": 0.527079701423645
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 0.00010092556339868758,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13807411.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004915264435112476,
+ "skip_count": 1.0,
+ "step": 8562,
+ "text_loss": 0.721017599105835
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 0.00010073916846906139,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13810489.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005571382585912943,
+ "skip_count": 1.0,
+ "step": 8564,
+ "text_loss": 0.5802517533302307
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.21602582917523,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 0.00010055292653693903,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13813526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001321605988778174,
+ "skip_count": 0.0,
+ "step": 8566,
+ "text_loss": 0.5485247373580933
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 0.00010036683767368859,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13817225.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001876185997389257,
+ "skip_count": 0.0,
+ "step": 8568,
+ "text_loss": 0.08957820385694504
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 0.00010018090195061997,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13820667.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004593426361680031,
+ "skip_count": 0.0,
+ "step": 8570,
+ "text_loss": 0.24580086767673492
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 9.999511943898398e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13824505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022372701205313206,
+ "skip_count": 0.0,
+ "step": 8572,
+ "text_loss": 0.20976831018924713
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 9.980949020997276e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13827623.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030519715510308743,
+ "skip_count": 0.0,
+ "step": 8574,
+ "text_loss": 0.7638732194900513
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 9.962401433471985e-05,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13831013.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005036211106926203,
+ "skip_count": 1.0,
+ "step": 8576,
+ "text_loss": 0.3791790306568146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 9.943869188429989e-05,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13833611.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002071794355288148,
+ "skip_count": 2.0,
+ "step": 8578,
+ "text_loss": 0.5480846166610718
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 40.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 9.925352292972884e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13836678.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008119060657918453,
+ "skip_count": 0.0,
+ "step": 8580,
+ "text_loss": 0.21605457365512848
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 9.906850754196379e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13839255.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004017427563667297,
+ "skip_count": 2.0,
+ "step": 8582,
+ "text_loss": 0.4473285973072052
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 9.888364579190285e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13842034.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005163116846233606,
+ "skip_count": 1.0,
+ "step": 8584,
+ "text_loss": 0.21627424657344818
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 9.869893775038557e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13844648.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0044358340092003345,
+ "skip_count": 1.0,
+ "step": 8586,
+ "text_loss": 0.5660704970359802
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 9.851438348819247e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13847629.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00038135924842208624,
+ "skip_count": 1.0,
+ "step": 8588,
+ "text_loss": 0.6401235461235046
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 9.832998307604495e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13851409.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004005341790616512,
+ "skip_count": 1.0,
+ "step": 8590,
+ "text_loss": 0.43975043296813965
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 9.814573658460562e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13854031.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006872966885566711,
+ "skip_count": 2.0,
+ "step": 8592,
+ "text_loss": 0.6000451445579529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 9.796164408447811e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13856813.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019872859120368958,
+ "skip_count": 0.0,
+ "step": 8594,
+ "text_loss": 0.6026073098182678
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 9.777770564620698e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13859805.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013098123483359814,
+ "skip_count": 2.0,
+ "step": 8596,
+ "text_loss": 0.3294500708580017
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 40.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 9.759392134027783e-05,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 13863119.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001011171261779964,
+ "skip_count": 1.0,
+ "step": 8598,
+ "text_loss": 0.4078965187072754
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 9.741029123711708e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13866239.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003267963184043765,
+ "skip_count": 0.0,
+ "step": 8600,
+ "text_loss": 0.5064641833305359
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.385089521573235,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 9.722681540709228e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 13869647.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02431299351155758,
+ "skip_count": 2.0,
+ "step": 8602,
+ "text_loss": 0.2512950301170349
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 9.704349392051155e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13873128.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019577480852603912,
+ "skip_count": 1.0,
+ "step": 8604,
+ "text_loss": 0.425156831741333
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 9.686032684762408e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13876603.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001554530463181436,
+ "skip_count": 1.0,
+ "step": 8606,
+ "text_loss": 0.3596082329750061
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01519775390625,
+ "learning_rate": 9.667731425861975e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13879602.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027400986291468143,
+ "skip_count": 0.0,
+ "step": 8608,
+ "text_loss": 0.12101534754037857
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 9.649445622362957e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13882204.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001957559958100319,
+ "skip_count": 2.0,
+ "step": 8610,
+ "text_loss": 0.382834255695343
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.43205165835045,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 9.631175281272491e-05,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 13886397.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009613300673663616,
+ "skip_count": 3.0,
+ "step": 8612,
+ "text_loss": 0.24718235433101654
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.441444085705896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 9.612920409591813e-05,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13889625.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015159029280766845,
+ "skip_count": 0.0,
+ "step": 8614,
+ "text_loss": 0.406452476978302
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 40.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 9.59468101431622e-05,
+ "loss": 0.0034,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13892518.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008069832809269428,
+ "skip_count": 3.0,
+ "step": 8616,
+ "text_loss": 0.19740329682826996
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0157470703125,
+ "learning_rate": 9.576457102435082e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13895822.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024340536911040545,
+ "skip_count": 0.0,
+ "step": 8618,
+ "text_loss": 0.44761306047439575
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 40.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 9.558248680931841e-05,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 13898829.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0053517078049480915,
+ "skip_count": 1.0,
+ "step": 8620,
+ "text_loss": 0.37335118651390076
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.47901379512768,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 9.540055756783994e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.9255813956260681,
+ "num_tokens": 13902122.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.03885587304830551,
+ "skip_count": 4.0,
+ "step": 8622,
+ "text_loss": 0.21311092376708984
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 9.521878336963108e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13904874.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007965708151459694,
+ "skip_count": 1.0,
+ "step": 8624,
+ "text_loss": 0.27229398488998413
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020263671875,
+ "learning_rate": 9.5037164284348e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13907755.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019825168419629335,
+ "skip_count": 0.0,
+ "step": 8626,
+ "text_loss": 0.6535577178001404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.507191077194015,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 9.485570038158747e-05,
+ "loss": 0.0085,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 13910619.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017803344875574112,
+ "skip_count": 0.0,
+ "step": 8628,
+ "text_loss": 0.26617178320884705
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 9.467439173088687e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13914098.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025836096610873938,
+ "skip_count": 0.0,
+ "step": 8630,
+ "text_loss": 0.44465285539627075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 9.44932384017238e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13917192.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004438584204763174,
+ "skip_count": 2.0,
+ "step": 8632,
+ "text_loss": 0.33622798323631287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 9.431224046351688e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13920067.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017312567681074142,
+ "skip_count": 2.0,
+ "step": 8634,
+ "text_loss": 0.31870952248573303
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 9.413139798562476e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13922887.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019389945082366467,
+ "skip_count": 0.0,
+ "step": 8636,
+ "text_loss": 0.18223261833190918
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.55415321397123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 9.395071103734648e-05,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13926545.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011485094437375665,
+ "skip_count": 0.0,
+ "step": 8638,
+ "text_loss": 0.48031774163246155
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 40.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 9.377017968792179e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13931171.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003448521951213479,
+ "skip_count": 0.0,
+ "step": 8640,
+ "text_loss": 0.7585139870643616
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 40.57293806868213,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 9.35898040065305e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 13934369.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017959754914045334,
+ "skip_count": 2.0,
+ "step": 8642,
+ "text_loss": 0.49708613753318787
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 9.3409584062293e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13938166.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004092653747648001,
+ "skip_count": 1.0,
+ "step": 8644,
+ "text_loss": 0.20662656426429749
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 9.322951992426992e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13941922.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026206092443317175,
+ "skip_count": 0.0,
+ "step": 8646,
+ "text_loss": 0.4735889434814453
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 40.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 9.304961166146209e-05,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 13945569.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.005156307481229305,
+ "skip_count": 2.0,
+ "step": 8648,
+ "text_loss": 0.5630270838737488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 9.286985934281079e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13948357.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004913610871881247,
+ "skip_count": 1.0,
+ "step": 8650,
+ "text_loss": 0.4053497016429901
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0186767578125,
+ "learning_rate": 9.26902630371974e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13952543.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003946282435208559,
+ "skip_count": 2.0,
+ "step": 8652,
+ "text_loss": 0.40166863799095154
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.629292632814796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 9.251082281344358e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13955917.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009605551022104919,
+ "skip_count": 0.0,
+ "step": 8654,
+ "text_loss": 0.20477983355522156
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 40.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 9.233153874031102e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13960071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004408199340105057,
+ "skip_count": 3.0,
+ "step": 8656,
+ "text_loss": 0.3349814713001251
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 9.215241088650194e-05,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 13963125.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005541396792978048,
+ "skip_count": 2.0,
+ "step": 8658,
+ "text_loss": 0.6602919697761536
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 9.197343932065843e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13966130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001636760076507926,
+ "skip_count": 0.0,
+ "step": 8660,
+ "text_loss": 0.7704628109931946
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 9.179462411136263e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13969791.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006453761598095298,
+ "skip_count": 0.0,
+ "step": 8662,
+ "text_loss": 0.3898075520992279
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 40.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 9.161596532713695e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13972987.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005081792362034321,
+ "skip_count": 4.0,
+ "step": 8664,
+ "text_loss": 0.8477506041526794
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.685647196947464,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 9.143746303644374e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13976505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032063762191683054,
+ "skip_count": 0.0,
+ "step": 8666,
+ "text_loss": 0.23729658126831055
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 9.125911730768543e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13980061.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00043821477447636425,
+ "skip_count": 0.0,
+ "step": 8668,
+ "text_loss": 0.4233637750148773
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 9.108092820920438e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13983407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007779054809361696,
+ "skip_count": 2.0,
+ "step": 8670,
+ "text_loss": 0.5050316452980042
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 9.090289580928307e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 13986725.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018697676714509726,
+ "skip_count": 1.0,
+ "step": 8672,
+ "text_loss": 1.0568488836288452
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 9.072502017614382e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13990765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002077789744362235,
+ "skip_count": 0.0,
+ "step": 8674,
+ "text_loss": 0.48911142349243164
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 40.73260933372468,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 9.054730137794887e-05,
+ "loss": 0.0081,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 13994083.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.044373031705617905,
+ "skip_count": 3.0,
+ "step": 8676,
+ "text_loss": 0.3420281708240509
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 9.036973948280048e-05,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 13997500.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015431724023073912,
+ "skip_count": 0.0,
+ "step": 8678,
+ "text_loss": 0.21514096856117249
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 9.019233455874049e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14000460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006088062655180693,
+ "skip_count": 1.0,
+ "step": 8680,
+ "text_loss": 0.43932875990867615
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 9.001508667375107e-05,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 14003537.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01006145216524601,
+ "skip_count": 3.0,
+ "step": 8682,
+ "text_loss": 0.2192728966474533
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.77017904314646,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 8.983799589575393e-05,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14005943.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001044525415636599,
+ "skip_count": 0.0,
+ "step": 8684,
+ "text_loss": 0.8686383962631226
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 8.96610622926104e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14008954.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004876079503446817,
+ "skip_count": 2.0,
+ "step": 8686,
+ "text_loss": 0.2513524889945984
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 8.948428593212193e-05,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 14012268.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007909095846116543,
+ "skip_count": 2.0,
+ "step": 8688,
+ "text_loss": 0.17117907106876373
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 8.930766688202946e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14015192.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022194553166627884,
+ "skip_count": 0.0,
+ "step": 8690,
+ "text_loss": 0.637697160243988
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 40.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0162353515625,
+ "learning_rate": 8.913120521001383e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14018055.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0023777696769684553,
+ "skip_count": 0.0,
+ "step": 8692,
+ "text_loss": 0.39099860191345215
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.81714117992369,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 8.895490098369535e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14021035.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002676652278751135,
+ "skip_count": 1.0,
+ "step": 8694,
+ "text_loss": 0.6112156510353088
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 8.877875427063431e-05,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14023759.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001040685223415494,
+ "skip_count": 0.0,
+ "step": 8696,
+ "text_loss": 0.3562681972980499
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 40.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 8.86027651383302e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14026090.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0011444527190178633,
+ "skip_count": 0.0,
+ "step": 8698,
+ "text_loss": 0.6152632236480713
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 40.84531846199002,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 8.842693365422266e-05,
+ "loss": 0.008,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 14029570.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.024327632039785385,
+ "skip_count": 3.0,
+ "step": 8700,
+ "text_loss": 0.2170596867799759
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 8.825125988569061e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14032418.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00048010432510636747,
+ "skip_count": 0.0,
+ "step": 8702,
+ "text_loss": 0.4421340525150299
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 8.807574390005241e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14035610.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010498231276869774,
+ "skip_count": 0.0,
+ "step": 8704,
+ "text_loss": 0.3656717538833618
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.873495744056356,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 8.790038576456627e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 14039354.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019302964210510254,
+ "skip_count": 1.0,
+ "step": 8706,
+ "text_loss": 0.6150856018066406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 8.772518554642972e-05,
+ "loss": 0.0029,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14042353.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004211598541587591,
+ "skip_count": 0.0,
+ "step": 8708,
+ "text_loss": 0.17178772389888763
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 8.755014331277972e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14045704.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007902922225184739,
+ "skip_count": 0.0,
+ "step": 8710,
+ "text_loss": 0.6289885640144348
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 8.737525913069277e-05,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 14048743.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007915202528238297,
+ "skip_count": 2.0,
+ "step": 8712,
+ "text_loss": 0.2778690457344055
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 40.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 8.720053306718506e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14052762.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027877227403223515,
+ "skip_count": 3.0,
+ "step": 8714,
+ "text_loss": 0.3615926504135132
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.92045788083358,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 8.702596518921175e-05,
+ "loss": 0.0086,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 14056645.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03460995852947235,
+ "skip_count": 1.0,
+ "step": 8716,
+ "text_loss": 0.19412031769752502
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 8.685155556366763e-05,
+ "loss": 0.0064,
+ "macro_f1": 1.0,
+ "num_tokens": 14059604.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0026834046002477407,
+ "skip_count": 2.0,
+ "step": 8718,
+ "text_loss": 0.4414670169353485
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 40.93924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 8.667730425738679e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14062170.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01547359861433506,
+ "skip_count": 4.0,
+ "step": 8720,
+ "text_loss": 0.2850716710090637
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 8.650321133714267e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14065526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020194994285702705,
+ "skip_count": 0.0,
+ "step": 8722,
+ "text_loss": 0.1776508241891861
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 8.632927686964798e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14068525.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037195945624262094,
+ "skip_count": 0.0,
+ "step": 8724,
+ "text_loss": 0.2786005735397339
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 40.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 8.615550092155477e-05,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 14071830.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008169961161911488,
+ "skip_count": 4.0,
+ "step": 8726,
+ "text_loss": 0.43228310346603394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 8.598188355945424e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14074977.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006407112814486027,
+ "skip_count": 1.0,
+ "step": 8728,
+ "text_loss": 0.24443474411964417
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 40.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 8.580842484987689e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14078104.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001878641895018518,
+ "skip_count": 1.0,
+ "step": 8730,
+ "text_loss": 0.4559098184108734
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 40.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 8.563512485929253e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14081934.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0056114462204277515,
+ "skip_count": 0.0,
+ "step": 8732,
+ "text_loss": 0.3063429594039917
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 8.546198365411007e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14085097.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001542840269394219,
+ "skip_count": 0.0,
+ "step": 8734,
+ "text_loss": 0.7624274492263794
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 8.528900130067741e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14088630.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002677374053746462,
+ "skip_count": 0.0,
+ "step": 8736,
+ "text_loss": 0.18395234644412994
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 8.511617786528175e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14091513.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004059800878167152,
+ "skip_count": 0.0,
+ "step": 8738,
+ "text_loss": 0.4567817449569702
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 41.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 8.494351341414947e-05,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 14094500.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0023724427446722984,
+ "skip_count": 1.0,
+ "step": 8740,
+ "text_loss": 0.6925744414329529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0155029296875,
+ "learning_rate": 8.477100801344573e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14097518.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013842503540217876,
+ "skip_count": 2.0,
+ "step": 8742,
+ "text_loss": 0.6574832201004028
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.05165835045494,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 8.459866172927505e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14101219.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003597316099330783,
+ "skip_count": 2.0,
+ "step": 8744,
+ "text_loss": 0.785912036895752
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 24.0,
+ "epoch": 41.061050777810394,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 8.442647462768082e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6225374937057495,
+ "num_tokens": 14104460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01929798349738121,
+ "skip_count": 5.0,
+ "step": 8746,
+ "text_loss": 0.2111714482307434
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 8.425444677464545e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14107404.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00048497592797502875,
+ "skip_count": 0.0,
+ "step": 8748,
+ "text_loss": 0.4764930307865143
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 8.408257823609033e-05,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 14109917.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007886217907071114,
+ "skip_count": 2.0,
+ "step": 8750,
+ "text_loss": 0.2771969735622406
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 8.391086907787587e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14112649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006535434629768133,
+ "skip_count": 0.0,
+ "step": 8752,
+ "text_loss": 0.1550854742527008
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 8.373931936580114e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14116044.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002130605047568679,
+ "skip_count": 0.0,
+ "step": 8754,
+ "text_loss": 0.4055478870868683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 8.356792916560457e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14119097.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005611231899820268,
+ "skip_count": 0.0,
+ "step": 8756,
+ "text_loss": 0.47804903984069824
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 41.117405341943055,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 8.339669854296316e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14122079.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005650801584124565,
+ "skip_count": 0.0,
+ "step": 8758,
+ "text_loss": 0.1968296617269516
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.126797769298506,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 8.322562756349273e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14124910.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035948604345321655,
+ "skip_count": 1.0,
+ "step": 8760,
+ "text_loss": 0.4988253712654114
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 8.305471629274802e-05,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14127767.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012090947711840272,
+ "skip_count": 0.0,
+ "step": 8762,
+ "text_loss": 0.6330704689025879
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019287109375,
+ "learning_rate": 8.288396479622262e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14130766.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010853242129087448,
+ "skip_count": 1.0,
+ "step": 8764,
+ "text_loss": 0.43057000637054443
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 8.271337313934868e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14133804.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037055034190416336,
+ "skip_count": 2.0,
+ "step": 8766,
+ "text_loss": 0.31973564624786377
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 8.254294138749741e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14137164.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005338407587260008,
+ "skip_count": 0.0,
+ "step": 8768,
+ "text_loss": 0.5066531896591187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.17375990607572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 8.237266960597844e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14140119.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014707009540870786,
+ "skip_count": 1.0,
+ "step": 8770,
+ "text_loss": 0.553493857383728
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 8.220255786004033e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14143223.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002113121096044779,
+ "skip_count": 0.0,
+ "step": 8772,
+ "text_loss": 0.40016281604766846
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.19254476078662,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0179443359375,
+ "learning_rate": 8.203260621487019e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14146366.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002210963051766157,
+ "skip_count": 1.0,
+ "step": 8774,
+ "text_loss": 0.44022905826568604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 8.186281473559382e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14150009.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011857844656333327,
+ "skip_count": 0.0,
+ "step": 8776,
+ "text_loss": 0.572823703289032
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 8.169318348727544e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14153343.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020397785119712353,
+ "skip_count": 1.0,
+ "step": 8778,
+ "text_loss": 0.5724276900291443
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 8.152371253491841e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14156392.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001745635992847383,
+ "skip_count": 0.0,
+ "step": 8780,
+ "text_loss": 0.14162923395633698
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 8.135440194346416e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14159616.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002799858106300235,
+ "skip_count": 0.0,
+ "step": 8782,
+ "text_loss": 0.18205340206623077
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 8.118525177779284e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14163531.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0029223538003861904,
+ "skip_count": 0.0,
+ "step": 8784,
+ "text_loss": 0.4107058644294739
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.248899324919286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 8.101626210272311e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14166776.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001209643087349832,
+ "skip_count": 0.0,
+ "step": 8786,
+ "text_loss": 0.6441596746444702
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 8.084743298301211e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14169586.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015196573222056031,
+ "skip_count": 0.0,
+ "step": 8788,
+ "text_loss": 0.35585930943489075
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 8.067876448335549e-05,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14174180.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004388966190163046,
+ "skip_count": 0.0,
+ "step": 8790,
+ "text_loss": 0.31594613194465637
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 8.05102566683873e-05,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14177950.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0031201441306620836,
+ "skip_count": 0.0,
+ "step": 8792,
+ "text_loss": 0.3161006569862366
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.28646903434106,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 8.034190960268012e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14180642.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001848527928814292,
+ "skip_count": 0.0,
+ "step": 8794,
+ "text_loss": 0.47571417689323425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.295861461696504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 8.017372335074486e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14183743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0043064444325864315,
+ "skip_count": 1.0,
+ "step": 8796,
+ "text_loss": 0.5976942777633667
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 8.000569797703072e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14187742.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005383181851357222,
+ "skip_count": 2.0,
+ "step": 8798,
+ "text_loss": 0.2692606449127197
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 7.983783354592544e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14191211.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001401974936015904,
+ "skip_count": 0.0,
+ "step": 8800,
+ "text_loss": 0.38108205795288086
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 7.967013012175478e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14194992.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001168998540379107,
+ "skip_count": 0.0,
+ "step": 8802,
+ "text_loss": 0.5201764106750488
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05322265625,
+ "learning_rate": 7.950258776878332e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14198059.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032015808392316103,
+ "skip_count": 2.0,
+ "step": 8804,
+ "text_loss": 0.6014752984046936
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 7.933520655121351e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14202313.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009403078584000468,
+ "skip_count": 0.0,
+ "step": 8806,
+ "text_loss": 0.54194176197052
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.35221602582917,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 7.916798653318607e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14205534.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027781077660620213,
+ "skip_count": 1.0,
+ "step": 8808,
+ "text_loss": 0.7181227803230286
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 7.900092777878004e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14209357.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034586815163493156,
+ "skip_count": 1.0,
+ "step": 8810,
+ "text_loss": 0.21651209890842438
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 41.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 7.883403035201265e-05,
+ "loss": 0.0056,
+ "macro_f1": 1.0,
+ "num_tokens": 14212328.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01194343063980341,
+ "skip_count": 4.0,
+ "step": 8812,
+ "text_loss": 0.20523512363433838
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 41.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0157470703125,
+ "learning_rate": 7.866729431683938e-05,
+ "loss": 0.0038,
+ "macro_f1": 1.0,
+ "num_tokens": 14214979.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0045132869854569435,
+ "skip_count": 1.0,
+ "step": 8814,
+ "text_loss": 0.4066837728023529
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0181884765625,
+ "learning_rate": 7.850071973715368e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14219030.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005109346006065607,
+ "skip_count": 2.0,
+ "step": 8816,
+ "text_loss": 0.12459450960159302
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.3991781626064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 7.833430667678737e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14222117.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036401136312633753,
+ "skip_count": 0.0,
+ "step": 8818,
+ "text_loss": 0.3759046494960785
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 41.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 7.816805519951008e-05,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 14225546.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006177824921905994,
+ "skip_count": 1.0,
+ "step": 8820,
+ "text_loss": 0.4031941592693329
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 41.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 7.800196536902987e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14228731.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009549650363624096,
+ "skip_count": 5.0,
+ "step": 8822,
+ "text_loss": 0.2895966172218323
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.427355444672735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 7.783603724899258e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14231796.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005532847251743078,
+ "skip_count": 2.0,
+ "step": 8824,
+ "text_loss": 0.32433390617370605
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 7.767027090298206e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14235869.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011165215400978923,
+ "skip_count": 0.0,
+ "step": 8826,
+ "text_loss": 0.41239091753959656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 7.750466639452059e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14238830.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007845646468922496,
+ "skip_count": 0.0,
+ "step": 8828,
+ "text_loss": 0.5113243460655212
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 7.733922378706787e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14241672.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029602700378745794,
+ "skip_count": 1.0,
+ "step": 8830,
+ "text_loss": 0.22004501521587372
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 41.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 7.717394314402199e-05,
+ "loss": 0.0037,
+ "macro_f1": 1.0,
+ "num_tokens": 14244522.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005297200754284859,
+ "skip_count": 1.0,
+ "step": 8832,
+ "text_loss": 0.6039504408836365
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 7.700882452871872e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14246964.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018059068825095892,
+ "skip_count": 2.0,
+ "step": 8834,
+ "text_loss": 0.46563026309013367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 7.684386800443177e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14249387.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005659483838826418,
+ "skip_count": 2.0,
+ "step": 8836,
+ "text_loss": 0.31516948342323303
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.49310243616085,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 7.667907363437288e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14252438.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011170750483870506,
+ "skip_count": 1.0,
+ "step": 8838,
+ "text_loss": 0.22867503762245178
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 7.651444148169157e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14255490.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004106760956346989,
+ "skip_count": 2.0,
+ "step": 8840,
+ "text_loss": 0.5757828950881958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 7.634997160947499e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14258430.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008562540751881897,
+ "skip_count": 0.0,
+ "step": 8842,
+ "text_loss": 0.5166661143302917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 7.618566408074862e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14261275.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012901517329737544,
+ "skip_count": 0.0,
+ "step": 8844,
+ "text_loss": 0.7376981973648071
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.53067214558262,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 7.602151895847526e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14264698.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00267209205776453,
+ "skip_count": 0.0,
+ "step": 8846,
+ "text_loss": 0.5249470472335815
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 41.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 7.585753630555565e-05,
+ "loss": 0.009,
+ "macro_f1": 1.0,
+ "num_tokens": 14267887.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015334542840719223,
+ "skip_count": 7.0,
+ "step": 8848,
+ "text_loss": 1.1539889574050903
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017578125,
+ "learning_rate": 7.569371618482818e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14271392.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010222389828413725,
+ "skip_count": 0.0,
+ "step": 8850,
+ "text_loss": 0.33968010544776917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 7.553005865906914e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14274658.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006116362637840211,
+ "skip_count": 0.0,
+ "step": 8852,
+ "text_loss": 0.7514221668243408
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.5682418550044,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 7.536656379099221e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14277763.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036474792286753654,
+ "skip_count": 0.0,
+ "step": 8854,
+ "text_loss": 0.3964846134185791
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 7.520323164324921e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14281165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005498840939253569,
+ "skip_count": 1.0,
+ "step": 8856,
+ "text_loss": 0.2235594391822815
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 41.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 7.504006227842919e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14284761.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006513409782201052,
+ "skip_count": 0.0,
+ "step": 8858,
+ "text_loss": 0.45196816325187683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 7.48770557590589e-05,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14287844.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013065916718915105,
+ "skip_count": 0.0,
+ "step": 8860,
+ "text_loss": 0.2188033014535904
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 7.471421214760287e-05,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14291280.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016644994029775262,
+ "skip_count": 0.0,
+ "step": 8862,
+ "text_loss": 0.7049906253814697
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.61520399178163,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 7.455153150646299e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14294330.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002664943691343069,
+ "skip_count": 0.0,
+ "step": 8864,
+ "text_loss": 0.2160239815711975
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 7.43890138979788e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14298355.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0035776710137724876,
+ "skip_count": 0.0,
+ "step": 8866,
+ "text_loss": 0.4922088384628296
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 7.422665938442741e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14301452.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029914912302047014,
+ "skip_count": 2.0,
+ "step": 8868,
+ "text_loss": 0.5828475952148438
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 7.406446802802331e-05,
+ "loss": 0.0045,
+ "macro_f1": 1.0,
+ "num_tokens": 14304667.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0010031569981947541,
+ "skip_count": 2.0,
+ "step": 8870,
+ "text_loss": 0.657244861125946
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.65277370120341,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 7.390243989091849e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14307397.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007960405200719833,
+ "skip_count": 1.0,
+ "step": 8872,
+ "text_loss": 0.3147352635860443
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 7.37405750352026e-05,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 14310687.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007953251712024212,
+ "skip_count": 3.0,
+ "step": 8874,
+ "text_loss": 0.30315887928009033
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 7.357887352290227e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14314007.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012103051412850618,
+ "skip_count": 0.0,
+ "step": 8876,
+ "text_loss": 0.6356115341186523
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 7.341733541598217e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14316696.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017898730002343655,
+ "skip_count": 1.0,
+ "step": 8878,
+ "text_loss": 0.35877764225006104
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 7.325596077634383e-05,
+ "loss": 0.0068,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14320172.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007144945557229221,
+ "skip_count": 0.0,
+ "step": 8880,
+ "text_loss": 0.7939266562461853
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.699735837980626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 7.309474966582635e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14323262.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001255290349945426,
+ "skip_count": 0.0,
+ "step": 8882,
+ "text_loss": 0.7115976810455322
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 7.293370214620616e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14326826.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028131126891821623,
+ "skip_count": 2.0,
+ "step": 8884,
+ "text_loss": 0.24073036015033722
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.71852069269152,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 7.277281827919691e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14329658.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024797592777758837,
+ "skip_count": 1.0,
+ "step": 8886,
+ "text_loss": 0.47276070713996887
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 41.72791312004696,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 7.26120981264496e-05,
+ "loss": 0.0081,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 14333584.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.023670634254813194,
+ "skip_count": 3.0,
+ "step": 8888,
+ "text_loss": 0.47537583112716675
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.73730554740241,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 7.245154174955254e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14336850.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009583478095009923,
+ "skip_count": 0.0,
+ "step": 8890,
+ "text_loss": 0.5258943438529968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 41.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 7.229114921003116e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14339940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006664840504527092,
+ "skip_count": 3.0,
+ "step": 8892,
+ "text_loss": 0.20986922085285187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 7.213092056934833e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14342737.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005362578085623682,
+ "skip_count": 0.0,
+ "step": 8894,
+ "text_loss": 0.5174402594566345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 7.197085588890383e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14345769.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006428950000554323,
+ "skip_count": 1.0,
+ "step": 8896,
+ "text_loss": 0.657136857509613
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 7.181095523003478e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14348563.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015549053205177188,
+ "skip_count": 0.0,
+ "step": 8898,
+ "text_loss": 0.49799686670303345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.78426768417963,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 7.165121865401535e-05,
+ "loss": 0.0068,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 14353134.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.030110027641057968,
+ "skip_count": 2.0,
+ "step": 8900,
+ "text_loss": 0.3644331693649292
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 41.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 7.149164622205712e-05,
+ "loss": 0.0072,
+ "macro_f1": 1.0,
+ "num_tokens": 14356031.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0014812488807365298,
+ "skip_count": 1.0,
+ "step": 8902,
+ "text_loss": 0.46983054280281067
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 7.133223799530836e-05,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14358941.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001170543720945716,
+ "skip_count": 0.0,
+ "step": 8904,
+ "text_loss": 0.7030026316642761
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 41.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 7.117299403485466e-05,
+ "loss": 0.0085,
+ "macro_f1": 1.0,
+ "num_tokens": 14361807.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0011649372754618526,
+ "skip_count": 1.0,
+ "step": 8906,
+ "text_loss": 0.44989535212516785
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 7.101391440171856e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14365464.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028165180701762438,
+ "skip_count": 0.0,
+ "step": 8908,
+ "text_loss": 0.487165629863739
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 7.085499915685978e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14368149.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001956705003976822,
+ "skip_count": 2.0,
+ "step": 8910,
+ "text_loss": 0.3717629909515381
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.8406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 7.069624836117484e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14371440.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027164234779775143,
+ "skip_count": 1.0,
+ "step": 8912,
+ "text_loss": 0.3683965802192688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039794921875,
+ "learning_rate": 7.053766207549734e-05,
+ "loss": 0.009,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14374965.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005999395158141851,
+ "skip_count": 2.0,
+ "step": 8914,
+ "text_loss": 0.6271854639053345
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 7.037924036059789e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14378445.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000978486379608512,
+ "skip_count": 0.0,
+ "step": 8916,
+ "text_loss": 0.5927628874778748
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 7.022098327718401e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14382851.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012569266371428967,
+ "skip_count": 1.0,
+ "step": 8918,
+ "text_loss": 0.4092319905757904
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 41.878191957734074,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 7.006289088590007e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 14386959.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.011032132431864738,
+ "skip_count": 2.0,
+ "step": 8920,
+ "text_loss": 0.6553854942321777
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.887584385089525,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 6.990496324732737e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14390031.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001376329455524683,
+ "skip_count": 0.0,
+ "step": 8922,
+ "text_loss": 0.7792862057685852
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 6.974720042198396e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14392966.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005924372002482414,
+ "skip_count": 2.0,
+ "step": 8924,
+ "text_loss": 0.4466548562049866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 6.958960247032515e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14395619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010054769925773144,
+ "skip_count": 2.0,
+ "step": 8926,
+ "text_loss": 0.24784758687019348
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 6.943216945274255e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14398891.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006864808965474367,
+ "skip_count": 0.0,
+ "step": 8928,
+ "text_loss": 0.5154114961624146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 6.927490142956489e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14402991.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000996887218207121,
+ "skip_count": 0.0,
+ "step": 8930,
+ "text_loss": 0.5888006091117859
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 41.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 6.911779846105753e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14406276.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0007863475475460291,
+ "skip_count": 0.0,
+ "step": 8932,
+ "text_loss": 0.6862632632255554
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 6.896086060742262e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14409005.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020060581155121326,
+ "skip_count": 1.0,
+ "step": 8934,
+ "text_loss": 0.8998132348060608
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 41.95333137657764,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 6.880408792879905e-05,
+ "loss": 0.0047,
+ "macro_f1": 1.0,
+ "num_tokens": 14411902.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.008094016462564468,
+ "skip_count": 3.0,
+ "step": 8936,
+ "text_loss": 0.3411460518836975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 6.864748048526237e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14414683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004374993033707142,
+ "skip_count": 0.0,
+ "step": 8938,
+ "text_loss": 0.24222217500209808
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 6.84910383368249e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14417740.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003004335332661867,
+ "skip_count": 2.0,
+ "step": 8940,
+ "text_loss": 0.5524137020111084
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 41.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 6.83347615434356e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14420678.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007001105695962906,
+ "skip_count": 2.0,
+ "step": 8942,
+ "text_loss": 0.3124033212661743
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 41.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 6.817865016497993e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14424259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038414683658629656,
+ "skip_count": 0.0,
+ "step": 8944,
+ "text_loss": 0.509667694568634
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060791015625,
+ "learning_rate": 6.80227042612801e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14427084.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008573584258556366,
+ "skip_count": 0.0,
+ "step": 8946,
+ "text_loss": 0.2533438205718994
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.00939242735544,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 6.786692389209482e-05,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 14429690.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003758789971470833,
+ "skip_count": 2.0,
+ "step": 8948,
+ "text_loss": 0.14571085572242737
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06640625,
+ "learning_rate": 6.771130911711953e-05,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14432983.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005996126215904951,
+ "skip_count": 2.0,
+ "step": 8950,
+ "text_loss": 0.24994049966335297
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 6.755585999598613e-05,
+ "loss": 0.0032,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14435772.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012271527666598558,
+ "skip_count": 0.0,
+ "step": 8952,
+ "text_loss": 0.3705698549747467
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 42.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 6.740057658826293e-05,
+ "loss": 0.0081,
+ "macro_f1": 1.0,
+ "num_tokens": 14438912.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017618577694520354,
+ "skip_count": 1.0,
+ "step": 8954,
+ "text_loss": 0.6691124439239502
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 6.72454589534548e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14441959.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016956349136307836,
+ "skip_count": 1.0,
+ "step": 8956,
+ "text_loss": 0.45412346720695496
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 6.709050715100324e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14444804.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017321301624178886,
+ "skip_count": 2.0,
+ "step": 8958,
+ "text_loss": 0.2668265998363495
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 6.69357212402859e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14447390.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005267233122140169,
+ "skip_count": 2.0,
+ "step": 8960,
+ "text_loss": 0.35546016693115234
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 42.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.017578125,
+ "learning_rate": 6.67811012806172e-05,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14451286.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0045175012201070786,
+ "skip_count": 3.0,
+ "step": 8962,
+ "text_loss": 0.14669834077358246
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 6.662664733124768e-05,
+ "loss": 0.0064,
+ "macro_f1": 1.0,
+ "num_tokens": 14454335.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004905698820948601,
+ "skip_count": 3.0,
+ "step": 8964,
+ "text_loss": 0.28777357935905457
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 42.09392427355445,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 6.647235945136442e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 14457708.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.032136883586645126,
+ "skip_count": 1.0,
+ "step": 8966,
+ "text_loss": 0.2317836582660675
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 42.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 6.631823770009088e-05,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 14460721.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038611628115177155,
+ "skip_count": 1.0,
+ "step": 8968,
+ "text_loss": 0.28979742527008057
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 6.616428213648656e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14463467.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006560821202583611,
+ "skip_count": 0.0,
+ "step": 8970,
+ "text_loss": 0.3474387526512146
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 6.60104928195479e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14466586.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016879125032573938,
+ "skip_count": 0.0,
+ "step": 8972,
+ "text_loss": 0.5454491972923279
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 6.58568698082071e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14470125.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004945555119775236,
+ "skip_count": 0.0,
+ "step": 8974,
+ "text_loss": 0.4728975296020508
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 6.570341316133272e-05,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 14473887.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010141569189727306,
+ "skip_count": 3.0,
+ "step": 8976,
+ "text_loss": 0.24756617844104767
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 6.555012293772967e-05,
+ "loss": 0.0051,
+ "macro_f1": 1.0,
+ "num_tokens": 14477046.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.011950359679758549,
+ "skip_count": 2.0,
+ "step": 8978,
+ "text_loss": 0.25375646352767944
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 6.539699919613911e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14480638.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007824545609764755,
+ "skip_count": 0.0,
+ "step": 8980,
+ "text_loss": 0.6888379454612732
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 6.524404199523826e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14483723.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004318726249039173,
+ "skip_count": 1.0,
+ "step": 8982,
+ "text_loss": 0.3603152334690094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.17845611975345,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 6.509125139364058e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 14486876.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010652635246515274,
+ "skip_count": 1.0,
+ "step": 8984,
+ "text_loss": 0.43394285440444946
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 6.493862744989587e-05,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14489944.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010475299786776304,
+ "skip_count": 0.0,
+ "step": 8986,
+ "text_loss": 0.5952020287513733
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 6.478617022248984e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14493094.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004329503979533911,
+ "skip_count": 1.0,
+ "step": 8988,
+ "text_loss": 0.7284399271011353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 6.463387976984437e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14496944.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019588395953178406,
+ "skip_count": 1.0,
+ "step": 8990,
+ "text_loss": 0.8103306889533997
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.21602582917523,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 6.448175615031749e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14499997.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008046228438615799,
+ "skip_count": 1.0,
+ "step": 8992,
+ "text_loss": 0.14758773148059845
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 6.432979942220319e-05,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14503247.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0028899910394102335,
+ "skip_count": 0.0,
+ "step": 8994,
+ "text_loss": 0.2568151652812958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 6.417800964373161e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14506244.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0042211092077195644,
+ "skip_count": 2.0,
+ "step": 8996,
+ "text_loss": 0.3506850600242615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 6.402638687306872e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14510502.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003309462917968631,
+ "skip_count": 0.0,
+ "step": 8998,
+ "text_loss": 0.5852319598197937
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 42.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 6.387493116831699e-05,
+ "loss": 0.005,
+ "macro_f1": 1.0,
+ "num_tokens": 14513679.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015246274881064892,
+ "skip_count": 5.0,
+ "step": 9000,
+ "text_loss": 0.4266709089279175
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 42.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 6.372364258751434e-05,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 14516862.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005648075137287378,
+ "skip_count": 2.0,
+ "step": 9002,
+ "text_loss": 0.34153711795806885
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 42.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 6.357252118863482e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14519660.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005153972655534744,
+ "skip_count": 3.0,
+ "step": 9004,
+ "text_loss": 0.3911980092525482
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 6.342156702958851e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14522261.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001209715730510652,
+ "skip_count": 0.0,
+ "step": 9006,
+ "text_loss": 0.45400822162628174
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 6.327078016822124e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14525368.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00367624219506979,
+ "skip_count": 1.0,
+ "step": 9008,
+ "text_loss": 0.5327706336975098
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 6.31201606623149e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14528253.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018971028039231896,
+ "skip_count": 0.0,
+ "step": 9010,
+ "text_loss": 0.19216643273830414
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 6.296970856958712e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14531214.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003927265293896198,
+ "skip_count": 0.0,
+ "step": 9012,
+ "text_loss": 0.3931650221347809
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 6.281942394769142e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14535063.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00801338441669941,
+ "skip_count": 0.0,
+ "step": 9014,
+ "text_loss": 0.1605554074048996
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 6.266930685421717e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14538690.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013267790200188756,
+ "skip_count": 0.0,
+ "step": 9016,
+ "text_loss": 0.4797641932964325
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 6.251935734668957e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14542591.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013866537483409047,
+ "skip_count": 1.0,
+ "step": 9018,
+ "text_loss": 0.4539037346839905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 6.236957548256945e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14545259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001481749233789742,
+ "skip_count": 0.0,
+ "step": 9020,
+ "text_loss": 0.6693689227104187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 6.22199613192535e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14548362.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005995423533022404,
+ "skip_count": 1.0,
+ "step": 9022,
+ "text_loss": 0.6533607244491577
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 42.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 6.207051491407428e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14551694.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.015427720732986927,
+ "skip_count": 4.0,
+ "step": 9024,
+ "text_loss": 0.33537840843200684
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 6.192123632429986e-05,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14554614.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017432396998628974,
+ "skip_count": 0.0,
+ "step": 9026,
+ "text_loss": 0.9725127220153809
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.385089521573235,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 6.177212560713413e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14559474.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002909898292273283,
+ "skip_count": 2.0,
+ "step": 9028,
+ "text_loss": 0.16944198310375214
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 6.162318281971652e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14563046.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00274385092779994,
+ "skip_count": 0.0,
+ "step": 9030,
+ "text_loss": 0.43176764249801636
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 6.147440801912218e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14565829.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024230771232396364,
+ "skip_count": 0.0,
+ "step": 9032,
+ "text_loss": 0.5683854818344116
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 6.132580126236197e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14569016.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004686394706368446,
+ "skip_count": 1.0,
+ "step": 9034,
+ "text_loss": 0.5422781705856323
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 42.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 6.117736260638223e-05,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 14572558.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0010892068967223167,
+ "skip_count": 1.0,
+ "step": 9036,
+ "text_loss": 0.5740243196487427
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.43205165835045,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 6.102909210806495e-05,
+ "loss": 0.006,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 14575969.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0163960512727499,
+ "skip_count": 0.0,
+ "step": 9038,
+ "text_loss": 0.4803958535194397
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.441444085705896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 6.088098982422768e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14578746.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020733694545924664,
+ "skip_count": 0.0,
+ "step": 9040,
+ "text_loss": 0.30313390493392944
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.45083651306135,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 6.073305581162342e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 14581856.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.022739989683032036,
+ "skip_count": 2.0,
+ "step": 9042,
+ "text_loss": 0.5871608257293701
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 6.058529012694086e-05,
+ "loss": 0.0034,
+ "macro_f1": 1.0,
+ "num_tokens": 14584754.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012138293124735355,
+ "skip_count": 2.0,
+ "step": 9044,
+ "text_loss": 0.18492890894412994
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.053466796875,
+ "learning_rate": 6.0437692826803893e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14587867.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009839123813435435,
+ "skip_count": 0.0,
+ "step": 9046,
+ "text_loss": 0.5532476902008057
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 42.47901379512768,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.11376953125,
+ "learning_rate": 6.029026396777237e-05,
+ "loss": 0.0082,
+ "macro_f1": 1.0,
+ "num_tokens": 14591521.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01392262615263462,
+ "skip_count": 5.0,
+ "step": 9048,
+ "text_loss": 0.20356278121471405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.48840622248312,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 6.0143003606341174e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 14595358.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.018218200653791428,
+ "skip_count": 1.0,
+ "step": 9050,
+ "text_loss": 0.3070164620876312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 5.9995911798940764e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14598696.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002688709646463394,
+ "skip_count": 1.0,
+ "step": 9052,
+ "text_loss": 0.5637917518615723
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 5.984898860193694e-05,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14602301.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003135781968012452,
+ "skip_count": 0.0,
+ "step": 9054,
+ "text_loss": 0.345111608505249
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 5.9702234071631e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14606625.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002299862913787365,
+ "skip_count": 0.0,
+ "step": 9056,
+ "text_loss": 0.30707255005836487
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 5.9555648264259576e-05,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14610303.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0007164468406699598,
+ "skip_count": 0.0,
+ "step": 9058,
+ "text_loss": 0.56083083152771
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 5.940923123599462e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14613211.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00136603566352278,
+ "skip_count": 0.0,
+ "step": 9060,
+ "text_loss": 0.4455239474773407
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 5.926298304294336e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14615844.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001727075781673193,
+ "skip_count": 0.0,
+ "step": 9062,
+ "text_loss": 0.5928102731704712
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.55415321397123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 5.911690374114842e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14619190.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022300337441265583,
+ "skip_count": 0.0,
+ "step": 9064,
+ "text_loss": 0.9456163048744202
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 5.8970993386587676e-05,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14622304.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006507525686174631,
+ "skip_count": 2.0,
+ "step": 9066,
+ "text_loss": 0.1809750199317932
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.57293806868213,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 5.882525203517419e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14625386.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022866397630423307,
+ "skip_count": 0.0,
+ "step": 9068,
+ "text_loss": 0.1849939227104187
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 5.867967974275629e-05,
+ "loss": 0.0097,
+ "macro_f1": 1.0,
+ "num_tokens": 14628472.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0058460538275539875,
+ "skip_count": 2.0,
+ "step": 9070,
+ "text_loss": 0.2627561688423157
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 5.853427656511773e-05,
+ "loss": 0.0071,
+ "macro_f1": 1.0,
+ "num_tokens": 14631187.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0085217310115695,
+ "skip_count": 2.0,
+ "step": 9072,
+ "text_loss": 0.18039973080158234
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 42.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 5.838904255797717e-05,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 14633919.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007423012051731348,
+ "skip_count": 4.0,
+ "step": 9074,
+ "text_loss": 0.23746201395988464
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 5.8243977776988585e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14636674.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011181328445672989,
+ "skip_count": 0.0,
+ "step": 9076,
+ "text_loss": 0.38140806555747986
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 42.619900205459345,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 5.8099082277741024e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 14639506.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.03306882083415985,
+ "skip_count": 2.0,
+ "step": 9078,
+ "text_loss": 0.2627770006656647
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.629292632814796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 5.795435611575872e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14642955.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014759303303435445,
+ "skip_count": 0.0,
+ "step": 9080,
+ "text_loss": 0.47112786769866943
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 5.78097993465011e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14646018.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003744201036170125,
+ "skip_count": 0.0,
+ "step": 9082,
+ "text_loss": 0.36873605847358704
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 5.7665412025362516e-05,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14649402.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002992798574268818,
+ "skip_count": 2.0,
+ "step": 9084,
+ "text_loss": 0.6350628137588501
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 5.752119420767243e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14652248.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005798593629151583,
+ "skip_count": 2.0,
+ "step": 9086,
+ "text_loss": 0.2512637972831726
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 5.7377145948695474e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14655060.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024162146728485823,
+ "skip_count": 0.0,
+ "step": 9088,
+ "text_loss": 0.4233066439628601
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 42.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 5.723326730363115e-05,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 14658873.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004826475866138935,
+ "skip_count": 4.0,
+ "step": 9090,
+ "text_loss": 0.45946353673934937
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.685647196947464,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 5.7089558327614036e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14661865.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020765739027410746,
+ "skip_count": 2.0,
+ "step": 9092,
+ "text_loss": 0.9425542950630188
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 5.694601907571356e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14666085.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012533976696431637,
+ "skip_count": 0.0,
+ "step": 9094,
+ "text_loss": 0.6307007670402527
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 42.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 5.680264960293446e-05,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 14668992.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013796845450997353,
+ "skip_count": 5.0,
+ "step": 9096,
+ "text_loss": 0.21720129251480103
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 5.665944996421612e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14672365.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004391494672745466,
+ "skip_count": 0.0,
+ "step": 9098,
+ "text_loss": 0.28794240951538086
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 5.651642021443287e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14676232.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006779583054594696,
+ "skip_count": 0.0,
+ "step": 9100,
+ "text_loss": 0.45190441608428955
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 23.0,
+ "epoch": 42.73260933372468,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 5.637356040839398e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6289562582969666,
+ "num_tokens": 14679582.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02379363216459751,
+ "skip_count": 6.0,
+ "step": 9102,
+ "text_loss": 0.3395652770996094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 42.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 5.623087060084364e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14683438.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00344930961728096,
+ "skip_count": 4.0,
+ "step": 9104,
+ "text_loss": 0.4345538914203644
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 42.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 5.60883508464608e-05,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14686333.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005554547533392906,
+ "skip_count": 3.0,
+ "step": 9106,
+ "text_loss": 0.5202528238296509
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 5.594600119985932e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14690754.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004589532967656851,
+ "skip_count": 1.0,
+ "step": 9108,
+ "text_loss": 0.3040390610694885
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.77017904314646,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 5.580382171558784e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 14693793.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.029969461262226105,
+ "skip_count": 2.0,
+ "step": 9110,
+ "text_loss": 0.3644331693649292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 5.566181244812979e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14697290.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003387648146599531,
+ "skip_count": 0.0,
+ "step": 9112,
+ "text_loss": 0.5177932977676392
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 5.5519973451903404e-05,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14700597.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004790942650288343,
+ "skip_count": 1.0,
+ "step": 9114,
+ "text_loss": 0.2132686972618103
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 5.5378304781261715e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14703852.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007685191812925041,
+ "skip_count": 0.0,
+ "step": 9116,
+ "text_loss": 0.6690551042556763
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 42.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 5.523680649049234e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14707218.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0033531817607581615,
+ "skip_count": 0.0,
+ "step": 9118,
+ "text_loss": 0.26232191920280457
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.81714117992369,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 5.509547863381781e-05,
+ "loss": 0.0084,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 14710244.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.025616342201828957,
+ "skip_count": 0.0,
+ "step": 9120,
+ "text_loss": 0.2897983193397522
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 5.495432126539507e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14713495.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014400121290236712,
+ "skip_count": 0.0,
+ "step": 9122,
+ "text_loss": 0.4580271244049072
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 5.481333443931602e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14716703.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008548611658625305,
+ "skip_count": 0.0,
+ "step": 9124,
+ "text_loss": 0.5140601992607117
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.84531846199002,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 5.4672518209607e-05,
+ "loss": 0.0075,
+ "macro_f1": 0.9255813956260681,
+ "num_tokens": 14719443.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.02092800848186016,
+ "skip_count": 4.0,
+ "step": 9126,
+ "text_loss": 0.2842077314853668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 5.4531872630228965e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14722711.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0037711653858423233,
+ "skip_count": 0.0,
+ "step": 9128,
+ "text_loss": 0.3268158733844757
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 5.4391397755077784e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14725635.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005959369707852602,
+ "skip_count": 0.0,
+ "step": 9130,
+ "text_loss": 0.44725099205970764
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0181884765625,
+ "learning_rate": 5.425109363798358e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14728945.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011272960109636188,
+ "skip_count": 0.0,
+ "step": 9132,
+ "text_loss": 0.45580998063087463
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0167236328125,
+ "learning_rate": 5.411096033271118e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14732271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015554855344817042,
+ "skip_count": 0.0,
+ "step": 9134,
+ "text_loss": 0.16767354309558868
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 5.3970997892959894e-05,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 14735462.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.007287262007594109,
+ "skip_count": 5.0,
+ "step": 9136,
+ "text_loss": 0.8925374746322632
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.062255859375,
+ "learning_rate": 5.383120637236366e-05,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14739288.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004336730111390352,
+ "skip_count": 0.0,
+ "step": 9138,
+ "text_loss": 0.29503148794174194
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 5.369158582449074e-05,
+ "loss": 0.0032,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14742058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004528806544840336,
+ "skip_count": 0.0,
+ "step": 9140,
+ "text_loss": 0.16937516629695892
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.92045788083358,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 5.3552136302844e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14745628.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005676734144799411,
+ "skip_count": 0.0,
+ "step": 9142,
+ "text_loss": 0.48764488101005554
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 5.3412857860860917e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14748482.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017468055011704564,
+ "skip_count": 0.0,
+ "step": 9144,
+ "text_loss": 0.46164339780807495
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.93924273554447,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 5.327375055191314e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 14751091.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007167307659983635,
+ "skip_count": 1.0,
+ "step": 9146,
+ "text_loss": 0.37566086649894714
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 42.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 5.3134814429306896e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14753850.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003801940008997917,
+ "skip_count": 2.0,
+ "step": 9148,
+ "text_loss": 0.17589576542377472
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 5.299604954628268e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14756779.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00396628538146615,
+ "skip_count": 1.0,
+ "step": 9150,
+ "text_loss": 0.4118746817111969
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 42.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 5.2857455956015544e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14759574.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.003950111567974091,
+ "skip_count": 0.0,
+ "step": 9152,
+ "text_loss": 0.5839328169822693
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 5.271903371161479e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14762802.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006622051005251706,
+ "skip_count": 1.0,
+ "step": 9154,
+ "text_loss": 0.40162989497184753
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 42.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 5.2580782866124054e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14766136.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003140404587611556,
+ "skip_count": 0.0,
+ "step": 9156,
+ "text_loss": 0.2028028815984726
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 42.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 5.244270347252139e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14769306.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035792726557701826,
+ "skip_count": 1.0,
+ "step": 9158,
+ "text_loss": 0.5611430406570435
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 5.2304795583719034e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14771928.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007276696152985096,
+ "skip_count": 2.0,
+ "step": 9160,
+ "text_loss": 0.1382172554731369
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 5.2167059252563485e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14775047.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003121814923360944,
+ "skip_count": 0.0,
+ "step": 9162,
+ "text_loss": 0.6130381226539612
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 31.0,
+ "epoch": 43.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 5.2029494531835695e-05,
+ "loss": 0.0071,
+ "macro_f1": 1.0,
+ "num_tokens": 14777746.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.006029475014656782,
+ "skip_count": 1.0,
+ "step": 9164,
+ "text_loss": 0.5901363492012024
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 43.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 5.189210147425061e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14780813.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034428017679601908,
+ "skip_count": 5.0,
+ "step": 9166,
+ "text_loss": 0.5909968018531799
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 5.1754880132457494e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14785178.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025068193208426237,
+ "skip_count": 2.0,
+ "step": 9168,
+ "text_loss": 0.20257101953029633
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.05165835045494,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 5.161783055904001e-05,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14788307.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003352245781570673,
+ "skip_count": 0.0,
+ "step": 9170,
+ "text_loss": 0.20024186372756958
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 43.061050777810394,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 5.1480952806515654e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14791053.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0009423785959370434,
+ "skip_count": 0.0,
+ "step": 9172,
+ "text_loss": 0.6944412589073181
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 5.13442469273363e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14794259.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016676477389410138,
+ "skip_count": 0.0,
+ "step": 9174,
+ "text_loss": 0.10889370739459991
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 5.1207712973887875e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14797345.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005842766724526882,
+ "skip_count": 2.0,
+ "step": 9176,
+ "text_loss": 0.17763052880764008
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 5.107135099849042e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14800819.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004951528972014785,
+ "skip_count": 0.0,
+ "step": 9178,
+ "text_loss": 0.43891432881355286
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 5.093516105339818e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14803924.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031010014936327934,
+ "skip_count": 1.0,
+ "step": 9180,
+ "text_loss": 0.39177098870277405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 5.079914319079931e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14807083.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00047361713950522244,
+ "skip_count": 0.0,
+ "step": 9182,
+ "text_loss": 0.39144888520240784
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.117405341943055,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 5.066329746281617e-05,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14810263.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018734827172011137,
+ "skip_count": 0.0,
+ "step": 9184,
+ "text_loss": 0.531446099281311
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.126797769298506,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 5.052762392150506e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14813761.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00503428652882576,
+ "skip_count": 0.0,
+ "step": 9186,
+ "text_loss": 0.19398775696754456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 5.039212261885634e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14817708.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010842647170647979,
+ "skip_count": 0.0,
+ "step": 9188,
+ "text_loss": 0.5365647077560425
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 43.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0172119140625,
+ "learning_rate": 5.025679360679442e-05,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 14820912.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.004775309935212135,
+ "skip_count": 2.0,
+ "step": 9190,
+ "text_loss": 0.6473321318626404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 5.012163693717747e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14824115.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004022061824798584,
+ "skip_count": 0.0,
+ "step": 9192,
+ "text_loss": 0.24432586133480072
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 4.9986652661798025e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14827404.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00231996551156044,
+ "skip_count": 1.0,
+ "step": 9194,
+ "text_loss": 0.7459486722946167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.17375990607572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 4.98518408323822e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14830077.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000999651150777936,
+ "skip_count": 0.0,
+ "step": 9196,
+ "text_loss": 0.5136345624923706
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 4.971720150059012e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14833231.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033226648811250925,
+ "skip_count": 2.0,
+ "step": 9198,
+ "text_loss": 0.1597593128681183
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.19254476078662,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 4.958273471801583e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14836534.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00400200579315424,
+ "skip_count": 0.0,
+ "step": 9200,
+ "text_loss": 0.16248664259910583
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 4.94484405361873e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14840301.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038636941462755203,
+ "skip_count": 0.0,
+ "step": 9202,
+ "text_loss": 0.20964740216732025
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 4.9314319006566296e-05,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14844094.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00593461561948061,
+ "skip_count": 2.0,
+ "step": 9204,
+ "text_loss": 0.43311986327171326
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0166015625,
+ "learning_rate": 4.918037018054844e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14847148.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007939442875795066,
+ "skip_count": 0.0,
+ "step": 9206,
+ "text_loss": 0.8805840015411377
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 4.904659410946311e-05,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 14851556.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0058822291903197765,
+ "skip_count": 4.0,
+ "step": 9208,
+ "text_loss": 0.2123873233795166
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 4.891299084457362e-05,
+ "loss": 0.0087,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14855208.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024413811042904854,
+ "skip_count": 0.0,
+ "step": 9210,
+ "text_loss": 0.4408712685108185
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.248899324919286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0400390625,
+ "learning_rate": 4.8779560437076983e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14858433.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007487752009183168,
+ "skip_count": 1.0,
+ "step": 9212,
+ "text_loss": 0.7417129874229431
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03125,
+ "learning_rate": 4.864630293810401e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14861739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007972145453095436,
+ "skip_count": 2.0,
+ "step": 9214,
+ "text_loss": 0.3347324728965759
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 4.851321839871908e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14865220.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006238576490432024,
+ "skip_count": 1.0,
+ "step": 9216,
+ "text_loss": 0.49660998582839966
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 4.838030686992062e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14868179.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003592922119423747,
+ "skip_count": 0.0,
+ "step": 9218,
+ "text_loss": 0.316535621881485
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 43.28646903434106,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 4.824756840264055e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14870950.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012321153655648232,
+ "skip_count": 3.0,
+ "step": 9220,
+ "text_loss": 0.270915150642395
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.295861461696504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 4.8115003047744466e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14873749.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008396002231165767,
+ "skip_count": 0.0,
+ "step": 9222,
+ "text_loss": 0.4190096855163574
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0169677734375,
+ "learning_rate": 4.798261085603162e-05,
+ "loss": 0.0034,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14877349.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002983161248266697,
+ "skip_count": 1.0,
+ "step": 9224,
+ "text_loss": 0.8203139901161194
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 4.785039187823503e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14881192.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003951616585254669,
+ "skip_count": 2.0,
+ "step": 9226,
+ "text_loss": 0.36447709798812866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 4.771834616502119e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14884608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001604852732270956,
+ "skip_count": 0.0,
+ "step": 9228,
+ "text_loss": 0.733951985836029
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.333431171118285,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 4.758647376699032e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 14887963.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.041028670966625214,
+ "skip_count": 2.0,
+ "step": 9230,
+ "text_loss": 0.1800784021615982
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 4.7454774734676074e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14890769.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027380166575312614,
+ "skip_count": 0.0,
+ "step": 9232,
+ "text_loss": 0.6017972230911255
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.35221602582917,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 4.732324911854591e-05,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14894162.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018064725445583463,
+ "skip_count": 2.0,
+ "step": 9234,
+ "text_loss": 0.5853637456893921
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 43.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 4.7191896969000617e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14897248.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005479716695845127,
+ "skip_count": 0.0,
+ "step": 9236,
+ "text_loss": 0.6206526756286621
+ },
+ {
+ "acc_repeat": 0.75,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 43.371000880540066,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.8571428656578064,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 4.706071833637454e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.9446290731430054,
+ "num_tokens": 14900186.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.013435420580208302,
+ "skip_count": 3.0,
+ "step": 9238,
+ "text_loss": 0.46402135491371155
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 43.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 4.692971327093559e-05,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 14903080.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007366253528743982,
+ "skip_count": 4.0,
+ "step": 9240,
+ "text_loss": 0.6870771646499634
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 4.6798881822885276e-05,
+ "loss": 0.0067,
+ "macro_f1": 1.0,
+ "num_tokens": 14906837.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004979560151696205,
+ "skip_count": 2.0,
+ "step": 9242,
+ "text_loss": 0.46396589279174805
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.3991781626064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 4.666822404235838e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14909541.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00023516178771387786,
+ "skip_count": 0.0,
+ "step": 9244,
+ "text_loss": 0.5960518717765808
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 43.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 4.6537739979423174e-05,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 14912820.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0014796241885051131,
+ "skip_count": 1.0,
+ "step": 9246,
+ "text_loss": 0.48075684905052185
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 4.640742968408146e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14916283.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001386807532981038,
+ "skip_count": 0.0,
+ "step": 9248,
+ "text_loss": 0.3950015902519226
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 43.427355444672735,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.037109375,
+ "learning_rate": 4.627729320626833e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 14918958.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.020335515961050987,
+ "skip_count": 4.0,
+ "step": 9250,
+ "text_loss": 0.6995832324028015
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 4.6147330595852354e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14921888.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005387732293456793,
+ "skip_count": 2.0,
+ "step": 9252,
+ "text_loss": 0.2771800756454468
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 4.601754190263552e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14925135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001703745685517788,
+ "skip_count": 1.0,
+ "step": 9254,
+ "text_loss": 0.7100088596343994
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 4.5887927176352875e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14929198.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0058114733546972275,
+ "skip_count": 2.0,
+ "step": 9256,
+ "text_loss": 0.21729083359241486
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 4.5758486466673244e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14932685.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026105218566954136,
+ "skip_count": 0.0,
+ "step": 9258,
+ "text_loss": 0.20695121586322784
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 4.5629219823198564e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14937901.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006947176996618509,
+ "skip_count": 2.0,
+ "step": 9260,
+ "text_loss": 0.15886647999286652
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 4.550012729546393e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14941406.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011366386897861958,
+ "skip_count": 0.0,
+ "step": 9262,
+ "text_loss": 0.49892309308052063
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 43.49310243616085,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 4.537120893293789e-05,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 14944200.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002686526160687208,
+ "skip_count": 1.0,
+ "step": 9264,
+ "text_loss": 0.6201852560043335
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 4.5242464785022256e-05,
+ "loss": 0.0088,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14947592.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007816873257979751,
+ "skip_count": 0.0,
+ "step": 9266,
+ "text_loss": 0.49434536695480347
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 4.5113894901051944e-05,
+ "loss": 0.0072,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14950382.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013167982688173652,
+ "skip_count": 0.0,
+ "step": 9268,
+ "text_loss": 0.696306586265564
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 43.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 4.498549933029511e-05,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14953424.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006240467075258493,
+ "skip_count": 3.0,
+ "step": 9270,
+ "text_loss": 0.14193731546401978
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.53067214558262,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 4.485727812195339e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14956937.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006212725769728422,
+ "skip_count": 2.0,
+ "step": 9272,
+ "text_loss": 0.40858668088912964
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 4.472923132516132e-05,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14960398.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003120801877230406,
+ "skip_count": 2.0,
+ "step": 9274,
+ "text_loss": 0.4740981459617615
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 4.46013589889866e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14963037.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0027343074325472116,
+ "skip_count": 0.0,
+ "step": 9276,
+ "text_loss": 0.1420614868402481
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 4.4473661162430176e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14965604.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006372901843860745,
+ "skip_count": 0.0,
+ "step": 9278,
+ "text_loss": 0.4628531336784363
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.5682418550044,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 4.4346137894426155e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14968803.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0062922025099396706,
+ "skip_count": 2.0,
+ "step": 9280,
+ "text_loss": 0.29813849925994873
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 4.421878923384159e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14972557.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006071912590414286,
+ "skip_count": 2.0,
+ "step": 9282,
+ "text_loss": 0.19581027328968048
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 43.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 4.40916152294768e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14975358.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001606325968168676,
+ "skip_count": 0.0,
+ "step": 9284,
+ "text_loss": 0.6929896473884583
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 4.3964615930065124e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14978045.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002845643786713481,
+ "skip_count": 1.0,
+ "step": 9286,
+ "text_loss": 0.49997636675834656
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 4.3837791384272744e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14981606.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005257320590317249,
+ "skip_count": 1.0,
+ "step": 9288,
+ "text_loss": 0.3391074538230896
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.61520399178163,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 4.3711141640699395e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 14984404.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.02914038859307766,
+ "skip_count": 2.0,
+ "step": 9290,
+ "text_loss": 0.29165980219841003
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 4.3584666747877254e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14987280.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005831835325807333,
+ "skip_count": 1.0,
+ "step": 9292,
+ "text_loss": 0.5312305688858032
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 4.345836675427184e-05,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14990071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035566375590860844,
+ "skip_count": 0.0,
+ "step": 9294,
+ "text_loss": 0.25595441460609436
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 4.333224170828149e-05,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 14993809.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026552488561719656,
+ "skip_count": 0.0,
+ "step": 9296,
+ "text_loss": 0.18538808822631836
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 43.65277370120341,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 4.3206291658237586e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 14996794.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010047328658401966,
+ "skip_count": 4.0,
+ "step": 9298,
+ "text_loss": 0.37891554832458496
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 4.308051665240442e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15000911.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030308531131595373,
+ "skip_count": 0.0,
+ "step": 9300,
+ "text_loss": 0.20204831659793854
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 4.295491673897922e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15004106.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003695673542097211,
+ "skip_count": 1.0,
+ "step": 9302,
+ "text_loss": 0.84013831615448
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 4.282949196609215e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15007482.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000820459274109453,
+ "skip_count": 0.0,
+ "step": 9304,
+ "text_loss": 0.4521652162075043
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 4.2704242381806144e-05,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15010579.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006170184817165136,
+ "skip_count": 1.0,
+ "step": 9306,
+ "text_loss": 0.22438007593154907
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 43.699735837980626,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.051025390625,
+ "learning_rate": 4.25791680341171e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 15013835.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021745599806308746,
+ "skip_count": 4.0,
+ "step": 9308,
+ "text_loss": 0.5847432613372803
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 4.245426897095372e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15017268.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022570823784917593,
+ "skip_count": 1.0,
+ "step": 9310,
+ "text_loss": 0.345931738615036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.71852069269152,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 4.232954524017763e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15020095.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009895693510770798,
+ "skip_count": 0.0,
+ "step": 9312,
+ "text_loss": 0.5374923944473267
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.72791312004696,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 4.220499688958307e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15022763.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005146807990968227,
+ "skip_count": 0.0,
+ "step": 9314,
+ "text_loss": 0.7208939790725708
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.73730554740241,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 4.208062396689738e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15025926.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00369556387886405,
+ "skip_count": 1.0,
+ "step": 9316,
+ "text_loss": 0.36686572432518005
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 4.1956426519780435e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15029120.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00971714872866869,
+ "skip_count": 2.0,
+ "step": 9318,
+ "text_loss": 0.20697914063930511
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 43.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 4.183240459582488e-05,
+ "loss": 0.0036,
+ "macro_f1": 1.0,
+ "num_tokens": 15032000.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002361048012971878,
+ "skip_count": 1.0,
+ "step": 9320,
+ "text_loss": 0.6737313866615295
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 4.1708558242556207e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15034831.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001238204538822174,
+ "skip_count": 0.0,
+ "step": 9322,
+ "text_loss": 0.823642373085022
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 4.1584887507432556e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15037487.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005211949814110994,
+ "skip_count": 1.0,
+ "step": 9324,
+ "text_loss": 0.3821350634098053
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 4.146139243784475e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15040167.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007513152435421944,
+ "skip_count": 0.0,
+ "step": 9326,
+ "text_loss": 0.18124167621135712
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 4.133807308111637e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15043777.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029832208529114723,
+ "skip_count": 0.0,
+ "step": 9328,
+ "text_loss": 0.47313618659973145
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0615234375,
+ "learning_rate": 4.1214929484503615e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15046622.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.009155526757240295,
+ "skip_count": 1.0,
+ "step": 9330,
+ "text_loss": 0.20556017756462097
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 4.1091961695195304e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15049543.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003529169363901019,
+ "skip_count": 0.0,
+ "step": 9332,
+ "text_loss": 0.18752245604991913
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 4.0969169760313005e-05,
+ "loss": 0.0078,
+ "macro_f1": 1.0,
+ "num_tokens": 15052924.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002136822324246168,
+ "skip_count": 2.0,
+ "step": 9334,
+ "text_loss": 0.85563725233078
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053466796875,
+ "learning_rate": 4.084655372691076e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15056579.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003167972667142749,
+ "skip_count": 2.0,
+ "step": 9336,
+ "text_loss": 0.45709627866744995
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 43.8406222483123,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 4.07241136419752e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 15059739.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.03742539510130882,
+ "skip_count": 2.0,
+ "step": 9338,
+ "text_loss": 0.19531641900539398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 4.06018495524258e-05,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15062795.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002699678996577859,
+ "skip_count": 0.0,
+ "step": 9340,
+ "text_loss": 0.31032654643058777
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 4.047976150511423e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15066591.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026099481619894505,
+ "skip_count": 0.0,
+ "step": 9342,
+ "text_loss": 0.4676157832145691
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 4.035784954682486e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15069509.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006772278342396021,
+ "skip_count": 1.0,
+ "step": 9344,
+ "text_loss": 0.23385995626449585
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 43.878191957734074,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 4.0236113724274713e-05,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15072898.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0005968905170448124,
+ "skip_count": 0.0,
+ "step": 9346,
+ "text_loss": 0.6250094175338745
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.887584385089525,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 4.011455408411302e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15075547.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012884319759905338,
+ "skip_count": 2.0,
+ "step": 9348,
+ "text_loss": 0.23720405995845795
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 3.9993170672921794e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15078902.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018171088304370642,
+ "skip_count": 0.0,
+ "step": 9350,
+ "text_loss": 0.23975110054016113
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 43.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 3.9871963537215284e-05,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 15082292.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001974726328626275,
+ "skip_count": 1.0,
+ "step": 9352,
+ "text_loss": 0.354034423828125
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 3.975093272344038e-05,
+ "loss": 0.0075,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15085288.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014760299818590283,
+ "skip_count": 0.0,
+ "step": 9354,
+ "text_loss": 0.6398947834968567
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 43.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 3.963007827797627e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15089089.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004467889666557312,
+ "skip_count": 3.0,
+ "step": 9356,
+ "text_loss": 0.26422595977783203
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 3.950940024713462e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15092178.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0048953029327094555,
+ "skip_count": 1.0,
+ "step": 9358,
+ "text_loss": 0.7519236207008362
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 43.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 3.9388898677159446e-05,
+ "loss": 0.0065,
+ "macro_f1": 1.0,
+ "num_tokens": 15094825.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004229324869811535,
+ "skip_count": 1.0,
+ "step": 9360,
+ "text_loss": 0.522379457950592
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 43.95333137657764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0244140625,
+ "learning_rate": 3.9268573614227146e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15098119.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028480603359639645,
+ "skip_count": 3.0,
+ "step": 9362,
+ "text_loss": 0.47443902492523193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 3.914842510444666e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15101362.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024998984299600124,
+ "skip_count": 1.0,
+ "step": 9364,
+ "text_loss": 0.6255060434341431
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 43.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0186767578125,
+ "learning_rate": 3.9028453193859006e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15104544.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008692052215337753,
+ "skip_count": 1.0,
+ "step": 9366,
+ "text_loss": 0.26974618434906006
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 43.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 3.890865792843768e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15107619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002779777627438307,
+ "skip_count": 2.0,
+ "step": 9368,
+ "text_loss": 0.4157184064388275
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 43.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 3.878903935408845e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15111352.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010220289696007967,
+ "skip_count": 0.0,
+ "step": 9370,
+ "text_loss": 0.5674155950546265
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 3.866959751664939e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15114088.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004387985449284315,
+ "skip_count": 1.0,
+ "step": 9372,
+ "text_loss": 0.3638002276420593
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.00939242735544,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 3.8550332461890824e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15117271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005855522467754781,
+ "skip_count": 0.0,
+ "step": 9374,
+ "text_loss": 0.6257871389389038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 3.843124423551536e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15119936.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026496360078454018,
+ "skip_count": 0.0,
+ "step": 9376,
+ "text_loss": 0.7019506096839905
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 3.8312332883157774e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15123407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024072150699794292,
+ "skip_count": 0.0,
+ "step": 9378,
+ "text_loss": 0.45380696654319763
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 3.819359845038517e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15126742.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00031929166289046407,
+ "skip_count": 0.0,
+ "step": 9380,
+ "text_loss": 0.5322204828262329
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 3.807504098269682e-05,
+ "loss": 0.0103,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15130854.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00177620945032686,
+ "skip_count": 0.0,
+ "step": 9382,
+ "text_loss": 0.5220870971679688
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 44.05635456413267,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 3.7956660525524156e-05,
+ "loss": 0.0071,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 15135054.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013358182273805141,
+ "skip_count": 2.0,
+ "step": 9384,
+ "text_loss": 0.39796701073646545
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 3.783845712423067e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15139179.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030253338627517223,
+ "skip_count": 0.0,
+ "step": 9386,
+ "text_loss": 0.13592341542243958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 3.772043082411236e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15142436.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008311813580803573,
+ "skip_count": 0.0,
+ "step": 9388,
+ "text_loss": 0.7804215550422668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 3.760258167039704e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15146071.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012432600371539593,
+ "skip_count": 1.0,
+ "step": 9390,
+ "text_loss": 0.37692421674728394
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8571428656578064,
+ "avg_layers": 23.0,
+ "epoch": 44.09392427355445,
+ "f1_execute": 0.9756097793579102,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9230769276618958,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 3.748490970824464e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.9662289023399353,
+ "num_tokens": 15149020.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03158312290906906,
+ "skip_count": 7.0,
+ "step": 9392,
+ "text_loss": 0.6111845374107361
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0166015625,
+ "learning_rate": 3.7367414982747374e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15151887.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000898235070053488,
+ "skip_count": 0.0,
+ "step": 9394,
+ "text_loss": 0.42988476157188416
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 3.7250097538929384e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15155395.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024584042839705944,
+ "skip_count": 1.0,
+ "step": 9396,
+ "text_loss": 0.4083070456981659
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 3.713295742174694e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15158275.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012269694125279784,
+ "skip_count": 0.0,
+ "step": 9398,
+ "text_loss": 0.529385507106781
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 3.701599467608835e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15161533.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002610012423247099,
+ "skip_count": 1.0,
+ "step": 9400,
+ "text_loss": 0.1785552203655243
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 44.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 3.6899209346773986e-05,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15164799.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0012146600056439638,
+ "skip_count": 0.0,
+ "step": 9402,
+ "text_loss": 0.9209059476852417
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 3.678260147855628e-05,
+ "loss": 0.0028,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15168111.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001716976286843419,
+ "skip_count": 1.0,
+ "step": 9404,
+ "text_loss": 0.5762659907341003
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 3.6666171116119474e-05,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 15171285.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005656248424202204,
+ "skip_count": 2.0,
+ "step": 9406,
+ "text_loss": 0.3065127432346344
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0186767578125,
+ "learning_rate": 3.6549918304079946e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15174838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002362997969612479,
+ "skip_count": 2.0,
+ "step": 9408,
+ "text_loss": 0.5256759524345398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 3.643384308698594e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15177713.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002327109221369028,
+ "skip_count": 1.0,
+ "step": 9410,
+ "text_loss": 0.27613985538482666
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 44.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 3.6317945509317716e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15180863.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008501979522407055,
+ "skip_count": 0.0,
+ "step": 9412,
+ "text_loss": 0.3379829525947571
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 3.6202225615487525e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15184531.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004115676507353783,
+ "skip_count": 0.0,
+ "step": 9414,
+ "text_loss": 0.24313601851463318
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 3.6086683449839454e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15187699.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017425924306735396,
+ "skip_count": 0.0,
+ "step": 9416,
+ "text_loss": 0.47485142946243286
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 44.21602582917523,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 3.597131905664935e-05,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 15190528.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0031498887110501528,
+ "skip_count": 1.0,
+ "step": 9418,
+ "text_loss": 0.5356660485267639
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 3.585613248012515e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15194165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006833057850599289,
+ "skip_count": 1.0,
+ "step": 9420,
+ "text_loss": 0.21593274176120758
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 3.574112376440658e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15197612.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013788710348308086,
+ "skip_count": 1.0,
+ "step": 9422,
+ "text_loss": 0.5275097489356995
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 3.5626292953565175e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15201103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021296890918165445,
+ "skip_count": 0.0,
+ "step": 9424,
+ "text_loss": 0.3420610725879669
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 3.551164009160429e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15204007.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025281559210270643,
+ "skip_count": 0.0,
+ "step": 9426,
+ "text_loss": 0.4756413996219635
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 3.539716522245917e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15208066.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008577071712352335,
+ "skip_count": 0.0,
+ "step": 9428,
+ "text_loss": 0.7672523260116577
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 44.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 3.528286838999672e-05,
+ "loss": 0.0032,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15211118.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002977409167215228,
+ "skip_count": 0.0,
+ "step": 9430,
+ "text_loss": 0.5010796785354614
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 44.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 3.5168749638015806e-05,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15214245.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0009552660631015897,
+ "skip_count": 0.0,
+ "step": 9432,
+ "text_loss": 0.6633321642875671
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 3.505480901024677e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15217449.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005598205607384443,
+ "skip_count": 2.0,
+ "step": 9434,
+ "text_loss": 0.545702338218689
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 44.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 3.494104655035213e-05,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15220391.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0154950562864542,
+ "skip_count": 4.0,
+ "step": 9436,
+ "text_loss": 0.211164191365242
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 3.4827462301925735e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15224061.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001531782210804522,
+ "skip_count": 0.0,
+ "step": 9438,
+ "text_loss": 0.49369096755981445
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 3.471405630849328e-05,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15227586.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004152537789195776,
+ "skip_count": 1.0,
+ "step": 9440,
+ "text_loss": 0.1624782234430313
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046875,
+ "learning_rate": 3.4600828613512156e-05,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15230713.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026113570202142,
+ "skip_count": 0.0,
+ "step": 9442,
+ "text_loss": 0.1921689808368683
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 44.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 3.44877792603715e-05,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15233925.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008077848702669144,
+ "skip_count": 3.0,
+ "step": 9444,
+ "text_loss": 0.32417818903923035
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 3.437490829239193e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15236684.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005273211863823235,
+ "skip_count": 0.0,
+ "step": 9446,
+ "text_loss": 0.3497772812843323
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 3.4262215752825895e-05,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15239866.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015295564662665129,
+ "skip_count": 0.0,
+ "step": 9448,
+ "text_loss": 0.7613807320594788
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 44.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 3.414970168485737e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15243615.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0039047773461788893,
+ "skip_count": 0.0,
+ "step": 9450,
+ "text_loss": 0.3325706720352173
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.375697094217784,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 3.403736613160191e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.32098764181137085,
+ "num_tokens": 15246714.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0300968699157238,
+ "skip_count": 2.0,
+ "step": 9452,
+ "text_loss": 0.3441869020462036
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 44.385089521573235,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 3.392520913610681e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15249520.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0037529836408793926,
+ "skip_count": 0.0,
+ "step": 9454,
+ "text_loss": 0.5083104968070984
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 3.381323074135073e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15252527.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019368440844118595,
+ "skip_count": 2.0,
+ "step": 9456,
+ "text_loss": 0.49744489789009094
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 3.3701430990244085e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15255330.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033424650318920612,
+ "skip_count": 1.0,
+ "step": 9458,
+ "text_loss": 0.5603348016738892
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 3.35898099256286e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15257961.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006928095244802535,
+ "skip_count": 0.0,
+ "step": 9460,
+ "text_loss": 0.5270714163780212
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 3.347836759027789e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15261137.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0030718250200152397,
+ "skip_count": 2.0,
+ "step": 9462,
+ "text_loss": 0.11651179939508438
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.43205165835045,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 3.33671040268968e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 15264234.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03508305177092552,
+ "skip_count": 2.0,
+ "step": 9464,
+ "text_loss": 0.14562347531318665
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.441444085705896,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 3.3256019278121717e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 15267047.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008365205489099026,
+ "skip_count": 1.0,
+ "step": 9466,
+ "text_loss": 0.8550931215286255
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 3.3145113386520485e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15270442.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036910634953528643,
+ "skip_count": 0.0,
+ "step": 9468,
+ "text_loss": 0.24741731584072113
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038818359375,
+ "learning_rate": 3.30343863945925e-05,
+ "loss": 0.0095,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15273845.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014966290909796953,
+ "skip_count": 0.0,
+ "step": 9470,
+ "text_loss": 0.5137372612953186
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 3.2923838344768534e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15277940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028104602824896574,
+ "skip_count": 0.0,
+ "step": 9472,
+ "text_loss": 0.5737728476524353
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.47901379512768,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 3.281346927941087e-05,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15281640.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007870957255363464,
+ "skip_count": 2.0,
+ "step": 9474,
+ "text_loss": 0.27684518694877625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 3.270327924081301e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15284877.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006224945653229952,
+ "skip_count": 0.0,
+ "step": 9476,
+ "text_loss": 0.35599255561828613
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 3.259326827120013e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15287945.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001179040758870542,
+ "skip_count": 0.0,
+ "step": 9478,
+ "text_loss": 0.26802319288253784
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 3.2483436412728553e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15290754.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001992281526327133,
+ "skip_count": 0.0,
+ "step": 9480,
+ "text_loss": 0.40124714374542236
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 3.2373783707486057e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15294841.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012830843916162848,
+ "skip_count": 0.0,
+ "step": 9482,
+ "text_loss": 0.6739225387573242
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 3.226431019749171e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15298397.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003624147269874811,
+ "skip_count": 2.0,
+ "step": 9484,
+ "text_loss": 0.5250326991081238
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.016357421875,
+ "learning_rate": 3.2155015924696105e-05,
+ "loss": 0.0031,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15301499.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019682408310472965,
+ "skip_count": 0.0,
+ "step": 9486,
+ "text_loss": 0.5574567317962646
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 3.204590093098098e-05,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15304531.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002245094161480665,
+ "skip_count": 0.0,
+ "step": 9488,
+ "text_loss": 0.4065501093864441
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.55415321397123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 3.1936965258159366e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15307826.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002919224789366126,
+ "skip_count": 1.0,
+ "step": 9490,
+ "text_loss": 0.5183609127998352
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 3.1828208947975615e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15311420.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004961747210472822,
+ "skip_count": 1.0,
+ "step": 9492,
+ "text_loss": 0.1962234377861023
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.57293806868213,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 3.171963204210537e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15314196.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026044815313071012,
+ "skip_count": 0.0,
+ "step": 9494,
+ "text_loss": 0.223251610994339
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 3.161123458215553e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15317174.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029661289881914854,
+ "skip_count": 0.0,
+ "step": 9496,
+ "text_loss": 0.32970958948135376
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 3.150301660966415e-05,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15320343.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011696632718667388,
+ "skip_count": 0.0,
+ "step": 9498,
+ "text_loss": 0.8590811491012573
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 3.13949781661006e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15324138.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015035583637654781,
+ "skip_count": 0.0,
+ "step": 9500,
+ "text_loss": 0.6658036708831787
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 3.1287119292865375e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15328395.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001930502592585981,
+ "skip_count": 0.0,
+ "step": 9502,
+ "text_loss": 0.4104210138320923
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 3.117944003129025e-05,
+ "loss": 0.0069,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15332196.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010025398805737495,
+ "skip_count": 0.0,
+ "step": 9504,
+ "text_loss": 0.7272399663925171
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 44.629292632814796,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 3.107194042263806e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15335253.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004520092159509659,
+ "skip_count": 0.0,
+ "step": 9506,
+ "text_loss": 0.29173022508621216
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 3.096462050810284e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15338129.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009707154240459204,
+ "skip_count": 0.0,
+ "step": 9508,
+ "text_loss": 0.6530287861824036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.024658203125,
+ "learning_rate": 3.0857480328809916e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15341487.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008689566748216748,
+ "skip_count": 0.0,
+ "step": 9510,
+ "text_loss": 0.36988505721092224
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 3.0750519925815565e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15344460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022587007842957973,
+ "skip_count": 0.0,
+ "step": 9512,
+ "text_loss": 0.2447768598794937
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 3.064373934010711e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15348135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001986770424991846,
+ "skip_count": 0.0,
+ "step": 9514,
+ "text_loss": 0.43159469962120056
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 3.053713861260321e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15351073.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003514432755764574,
+ "skip_count": 0.0,
+ "step": 9516,
+ "text_loss": 0.3638324737548828
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.685647196947464,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 3.043071778415335e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15353633.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003395392093807459,
+ "skip_count": 0.0,
+ "step": 9518,
+ "text_loss": 0.5728140473365784
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 3.03244768955383e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15357322.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016641782131046057,
+ "skip_count": 0.0,
+ "step": 9520,
+ "text_loss": 0.666814386844635
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0177001953125,
+ "learning_rate": 3.021841598746966e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15360771.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024721708614379168,
+ "skip_count": 0.0,
+ "step": 9522,
+ "text_loss": 0.7148030400276184
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 3.01125351005902e-05,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15364281.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004133665468543768,
+ "skip_count": 0.0,
+ "step": 9524,
+ "text_loss": 0.2985752820968628
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 3.0006834275473737e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15367354.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003016186412423849,
+ "skip_count": 1.0,
+ "step": 9526,
+ "text_loss": 0.22689883410930634
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 44.73260933372468,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01531982421875,
+ "learning_rate": 2.9901313552624932e-05,
+ "loss": 0.003,
+ "macro_f1": 1.0,
+ "num_tokens": 15371027.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015333639457821846,
+ "skip_count": 7.0,
+ "step": 9528,
+ "text_loss": 0.8308720588684082
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 2.97959729724796e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15373948.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001420815708115697,
+ "skip_count": 0.0,
+ "step": 9530,
+ "text_loss": 0.5439777970314026
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 2.9690812575404456e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15377366.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007130459416657686,
+ "skip_count": 0.0,
+ "step": 9532,
+ "text_loss": 0.45405295491218567
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.76078661579102,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.08349609375,
+ "learning_rate": 2.95858324016971e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 15380115.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04256885498762131,
+ "skip_count": 0.0,
+ "step": 9534,
+ "text_loss": 0.39998912811279297
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 44.77017904314646,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 2.9481032491586178e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15383205.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004944019019603729,
+ "skip_count": 4.0,
+ "step": 9536,
+ "text_loss": 0.1882237195968628
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 2.937641288523124e-05,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15386619.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007820523343980312,
+ "skip_count": 1.0,
+ "step": 9538,
+ "text_loss": 0.26401394605636597
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 2.9271973622722603e-05,
+ "loss": 0.0026,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15389135.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010751578956842422,
+ "skip_count": 0.0,
+ "step": 9540,
+ "text_loss": 0.39813846349716187
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 2.9167714744081643e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15392150.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031554463785141706,
+ "skip_count": 2.0,
+ "step": 9542,
+ "text_loss": 0.669784665107727
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 2.9063636289260677e-05,
+ "loss": 0.0037,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15394974.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00287301791831851,
+ "skip_count": 1.0,
+ "step": 9544,
+ "text_loss": 0.176493301987648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.81714117992369,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 2.8959738298142635e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15398432.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011708475649356842,
+ "skip_count": 0.0,
+ "step": 9546,
+ "text_loss": 0.8762983083724976
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 2.885602081054145e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15401121.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003167103510349989,
+ "skip_count": 1.0,
+ "step": 9548,
+ "text_loss": 0.2538717985153198
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 44.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 2.8752483866201885e-05,
+ "loss": 0.0062,
+ "macro_f1": 1.0,
+ "num_tokens": 15404105.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007552143186330795,
+ "skip_count": 5.0,
+ "step": 9550,
+ "text_loss": 0.37045153975486755
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 2.8649127504799423e-05,
+ "loss": 0.0046,
+ "macro_f1": 1.0,
+ "num_tokens": 15407232.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.007718692068010569,
+ "skip_count": 2.0,
+ "step": 9552,
+ "text_loss": 0.15780900418758392
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 2.8545951765940547e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15410425.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003527951193973422,
+ "skip_count": 0.0,
+ "step": 9554,
+ "text_loss": 0.5931823253631592
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 2.8442956689162193e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15413724.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00146177364513278,
+ "skip_count": 0.0,
+ "step": 9556,
+ "text_loss": 0.691118061542511
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 2.8340142313932448e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15416776.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010256811510771513,
+ "skip_count": 0.0,
+ "step": 9558,
+ "text_loss": 0.40814271569252014
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 2.823750867964997e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15419815.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0047921910881996155,
+ "skip_count": 0.0,
+ "step": 9560,
+ "text_loss": 0.28953713178634644
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 2.8135055825644072e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15422806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002010057680308819,
+ "skip_count": 1.0,
+ "step": 9562,
+ "text_loss": 0.8377944231033325
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 44.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 2.803278379117491e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15425405.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005009239539504051,
+ "skip_count": 1.0,
+ "step": 9564,
+ "text_loss": 0.5936337113380432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 2.793069261543335e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15428233.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007967893034219742,
+ "skip_count": 2.0,
+ "step": 9566,
+ "text_loss": 0.49891290068626404
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.92045788083358,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 2.7828782337540882e-05,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 15431095.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00638923142105341,
+ "skip_count": 4.0,
+ "step": 9568,
+ "text_loss": 0.30928006768226624
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 44.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 2.7727052996549763e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15434933.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0060427505522966385,
+ "skip_count": 3.0,
+ "step": 9570,
+ "text_loss": 0.21274788677692413
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.93924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 2.762550463144281e-05,
+ "loss": 0.0031,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15437655.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012480237055569887,
+ "skip_count": 0.0,
+ "step": 9572,
+ "text_loss": 0.31049492955207825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 2.7524137281133567e-05,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15440643.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005919245071709156,
+ "skip_count": 0.0,
+ "step": 9574,
+ "text_loss": 0.16459886729717255
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 2.7422950984466233e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15443532.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0061412835493683815,
+ "skip_count": 2.0,
+ "step": 9576,
+ "text_loss": 0.7102797031402588
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 2.7321945780215573e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15447027.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001149018993601203,
+ "skip_count": 0.0,
+ "step": 9578,
+ "text_loss": 0.22778025269508362
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 2.722112170708696e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15450173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002216014079749584,
+ "skip_count": 0.0,
+ "step": 9580,
+ "text_loss": 0.21447396278381348
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 44.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06494140625,
+ "learning_rate": 2.7120478803716264e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15452838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00498749827966094,
+ "skip_count": 0.0,
+ "step": 9582,
+ "text_loss": 0.1664455235004425
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 44.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037353515625,
+ "learning_rate": 2.7020017108670246e-05,
+ "loss": 0.0064,
+ "macro_f1": 1.0,
+ "num_tokens": 15455928.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005886784754693508,
+ "skip_count": 3.0,
+ "step": 9584,
+ "text_loss": 0.3929266631603241
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 2.691973666044589e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15459447.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029895263724029064,
+ "skip_count": 1.0,
+ "step": 9586,
+ "text_loss": 0.27535343170166016
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 2.681963749747085e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15462340.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038893253076821566,
+ "skip_count": 0.0,
+ "step": 9588,
+ "text_loss": 0.6950465440750122
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 2.671971965810338e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15465432.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016947018448263407,
+ "skip_count": 0.0,
+ "step": 9590,
+ "text_loss": 0.41451266407966614
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 2.6619983180632134e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15468300.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011597154662013054,
+ "skip_count": 0.0,
+ "step": 9592,
+ "text_loss": 0.5846080780029297
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 2.6520428103276316e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15471084.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005555236246436834,
+ "skip_count": 2.0,
+ "step": 9594,
+ "text_loss": 0.4151473939418793
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.05165835045494,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 2.6421054464185633e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15474348.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015279205981642008,
+ "skip_count": 0.0,
+ "step": 9596,
+ "text_loss": 0.28742483258247375
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.061050777810394,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 2.6321862301440234e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15477493.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019169533625245094,
+ "skip_count": 0.0,
+ "step": 9598,
+ "text_loss": 0.338019460439682
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 2.6222851653050773e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15480257.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015131557593122125,
+ "skip_count": 1.0,
+ "step": 9600,
+ "text_loss": 0.5982558727264404
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 2.612402255695828e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15482838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026768618263304234,
+ "skip_count": 0.0,
+ "step": 9602,
+ "text_loss": 0.32012176513671875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 2.6025375051034306e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15485746.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002152341417968273,
+ "skip_count": 0.0,
+ "step": 9604,
+ "text_loss": 0.16942192614078522
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 45.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 2.5926909173080658e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15488669.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003325721947476268,
+ "skip_count": 3.0,
+ "step": 9606,
+ "text_loss": 0.47950080037117004
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 2.582862496082977e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15491512.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023114588111639023,
+ "skip_count": 1.0,
+ "step": 9608,
+ "text_loss": 0.3907585144042969
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.117405341943055,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 2.5730522451944292e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15494479.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003140041371807456,
+ "skip_count": 2.0,
+ "step": 9610,
+ "text_loss": 0.198005810379982
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.126797769298506,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 2.5632601684017264e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15497900.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015117402654141188,
+ "skip_count": 0.0,
+ "step": 9612,
+ "text_loss": 0.874154269695282
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 2.5534862694572114e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15501817.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00551232136785984,
+ "skip_count": 2.0,
+ "step": 9614,
+ "text_loss": 0.1933375597000122
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 2.543730552106266e-05,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15504872.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001090583624318242,
+ "skip_count": 0.0,
+ "step": 9616,
+ "text_loss": 0.4030717611312866
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 2.533993020087294e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15507727.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007001800462603569,
+ "skip_count": 0.0,
+ "step": 9618,
+ "text_loss": 0.4812186062335968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 2.5242736771317333e-05,
+ "loss": 0.0025,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15510689.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016861478798091412,
+ "skip_count": 0.0,
+ "step": 9620,
+ "text_loss": 0.4578339457511902
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.17375990607572,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05517578125,
+ "learning_rate": 2.514572526964065e-05,
+ "loss": 0.0068,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 15513419.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.050852373242378235,
+ "skip_count": 3.0,
+ "step": 9622,
+ "text_loss": 0.4038950204849243
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 2.5048895733017772e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15516289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015001936117187142,
+ "skip_count": 0.0,
+ "step": 9624,
+ "text_loss": 0.8331962823867798
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.19254476078662,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 2.4952248198554073e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15519476.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009114370332099497,
+ "skip_count": 1.0,
+ "step": 9626,
+ "text_loss": 0.4997985363006592
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017822265625,
+ "learning_rate": 2.4855782703284925e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15523363.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011186953634023666,
+ "skip_count": 0.0,
+ "step": 9628,
+ "text_loss": 0.2572024464607239
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 45.211329615497505,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 2.4759499284176145e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 15526289.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.019600817933678627,
+ "skip_count": 4.0,
+ "step": 9630,
+ "text_loss": 0.6323924660682678
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 45.22072204285295,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 2.466339797812378e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 15530260.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.02459629252552986,
+ "skip_count": 1.0,
+ "step": 9632,
+ "text_loss": 0.1824527233839035
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 45.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 2.4567478821954038e-05,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 15533916.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009077859111130238,
+ "skip_count": 2.0,
+ "step": 9634,
+ "text_loss": 0.4518069326877594
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 2.4471741852423235e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15536958.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002355317585170269,
+ "skip_count": 0.0,
+ "step": 9636,
+ "text_loss": 0.8873519897460938
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.248899324919286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 2.437618710621803e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15540544.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001198371173813939,
+ "skip_count": 0.0,
+ "step": 9638,
+ "text_loss": 0.4845949709415436
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 2.4280814619955128e-05,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15543355.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009287866414524615,
+ "skip_count": 0.0,
+ "step": 9640,
+ "text_loss": 0.5979563593864441
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 2.4185624430181464e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15547215.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028763876762241125,
+ "skip_count": 0.0,
+ "step": 9642,
+ "text_loss": 0.16279318928718567
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 2.4090616573374135e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15550412.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013361044693738222,
+ "skip_count": 0.0,
+ "step": 9644,
+ "text_loss": 0.2864333987236023
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 45.28646903434106,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 2.3995791085940244e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15553660.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0019316677935421467,
+ "skip_count": 0.0,
+ "step": 9646,
+ "text_loss": 0.6333117485046387
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.295861461696504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 2.390114800421722e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15556287.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011288017267361283,
+ "skip_count": 1.0,
+ "step": 9648,
+ "text_loss": 0.6050677299499512
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 2.380668736447239e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15559246.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014249378582462668,
+ "skip_count": 0.0,
+ "step": 9650,
+ "text_loss": 0.9484158754348755
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 45.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 2.371240920290324e-05,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 15562251.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00741320988163352,
+ "skip_count": 4.0,
+ "step": 9652,
+ "text_loss": 0.24387991428375244
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 2.361831355563726e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15565704.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.000942508690059185,
+ "skip_count": 0.0,
+ "step": 9654,
+ "text_loss": 0.6523539423942566
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 45.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 2.352440045873233e-05,
+ "loss": 0.0091,
+ "macro_f1": 1.0,
+ "num_tokens": 15568797.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0064352210611104965,
+ "skip_count": 4.0,
+ "step": 9656,
+ "text_loss": 0.3206343650817871
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 2.3430669948175943e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15571855.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013390982057899237,
+ "skip_count": 0.0,
+ "step": 9658,
+ "text_loss": 0.8397402763366699
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.35221602582917,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 2.3337122059885806e-05,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15575379.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012212366564199328,
+ "skip_count": 0.0,
+ "step": 9660,
+ "text_loss": 0.5116108655929565
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03857421875,
+ "learning_rate": 2.324375682970975e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15578108.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003829900873824954,
+ "skip_count": 0.0,
+ "step": 9662,
+ "text_loss": 0.1423535794019699
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 45.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 2.3150574293425376e-05,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 15581830.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.012756838463246822,
+ "skip_count": 1.0,
+ "step": 9664,
+ "text_loss": 0.24676625430583954
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 2.3057574486740507e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15584872.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020642473828047514,
+ "skip_count": 0.0,
+ "step": 9666,
+ "text_loss": 0.4851650893688202
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 2.2964757445292806e-05,
+ "loss": 0.0029,
+ "macro_f1": 1.0,
+ "num_tokens": 15588000.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.007441115565598011,
+ "skip_count": 3.0,
+ "step": 9668,
+ "text_loss": 0.6416954398155212
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.3991781626064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017333984375,
+ "learning_rate": 2.287212320464993e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15591065.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015504831681028008,
+ "skip_count": 0.0,
+ "step": 9670,
+ "text_loss": 0.5852687358856201
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 45.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 2.2779671800309433e-05,
+ "loss": 0.0046,
+ "macro_f1": 1.0,
+ "num_tokens": 15594631.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005648284684866667,
+ "skip_count": 2.0,
+ "step": 9672,
+ "text_loss": 0.7172279357910156
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 2.2687403267699024e-05,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 15598664.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003756999270990491,
+ "skip_count": 2.0,
+ "step": 9674,
+ "text_loss": 0.18986566364765167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.427355444672735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 2.259531764217604e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15601616.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002155672525987029,
+ "skip_count": 0.0,
+ "step": 9676,
+ "text_loss": 0.4410690367221832
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.042236328125,
+ "learning_rate": 2.250341495902797e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15604291.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0020037787035107613,
+ "skip_count": 0.0,
+ "step": 9678,
+ "text_loss": 0.5565816164016724
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 2.241169525347203e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15607203.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014305647928267717,
+ "skip_count": 0.0,
+ "step": 9680,
+ "text_loss": 0.4879189729690552
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 2.2320158560655447e-05,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 15610475.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016029199585318565,
+ "skip_count": 3.0,
+ "step": 9682,
+ "text_loss": 0.36342933773994446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 2.2228804915655153e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15613810.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023584216833114624,
+ "skip_count": 0.0,
+ "step": 9684,
+ "text_loss": 0.18480375409126282
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 2.2137634353478043e-05,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15617854.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004325680434703827,
+ "skip_count": 1.0,
+ "step": 9686,
+ "text_loss": 0.5345974564552307
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 45.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 2.2046646909060996e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15620874.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.006946994923055172,
+ "skip_count": 0.0,
+ "step": 9688,
+ "text_loss": 0.29016008973121643
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.49310243616085,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 2.195584261727046e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15623875.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034732038620859385,
+ "skip_count": 1.0,
+ "step": 9690,
+ "text_loss": 0.2831312119960785
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 2.1865221512902766e-05,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15626371.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002495788736268878,
+ "skip_count": 1.0,
+ "step": 9692,
+ "text_loss": 0.6090453267097473
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 45.511887290871734,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 2.1774783630684246e-05,
+ "loss": 0.0076,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 15630129.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.017551302909851074,
+ "skip_count": 1.0,
+ "step": 9694,
+ "text_loss": 0.5127915740013123
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 2.168452900527068e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15633179.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004413482965901494,
+ "skip_count": 0.0,
+ "step": 9696,
+ "text_loss": 0.5901434421539307
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.53067214558262,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0308837890625,
+ "learning_rate": 2.159445767124796e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15636508.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005992567166686058,
+ "skip_count": 1.0,
+ "step": 9698,
+ "text_loss": 0.8493689298629761
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 2.1504569663131523e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15639371.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0033268092665821314,
+ "skip_count": 0.0,
+ "step": 9700,
+ "text_loss": 0.2814267873764038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 2.1414865015366548e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15643025.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004418607335537672,
+ "skip_count": 0.0,
+ "step": 9702,
+ "text_loss": 0.2619725167751312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 45.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 2.1325343762328197e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15646996.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0050115580670535564,
+ "skip_count": 4.0,
+ "step": 9704,
+ "text_loss": 0.8204038143157959
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.5682418550044,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 2.123600593832109e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15650194.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018730501178652048,
+ "skip_count": 1.0,
+ "step": 9706,
+ "text_loss": 0.694500744342804
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 2.1146851577579673e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15653743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016657712403684855,
+ "skip_count": 0.0,
+ "step": 9708,
+ "text_loss": 0.8211735486984253
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.039306640625,
+ "learning_rate": 2.1057880714268064e-05,
+ "loss": 0.0078,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15657325.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029736643191426992,
+ "skip_count": 0.0,
+ "step": 9710,
+ "text_loss": 0.2846751809120178
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 45.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 2.0969093382479987e-05,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 15660522.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.01233653537929058,
+ "skip_count": 4.0,
+ "step": 9712,
+ "text_loss": 0.23991759121418
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 2.0880489616239062e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15663254.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012792183551937342,
+ "skip_count": 0.0,
+ "step": 9714,
+ "text_loss": 0.6943771243095398
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.61520399178163,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 2.0792069449498297e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15666283.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033134319819509983,
+ "skip_count": 0.0,
+ "step": 9716,
+ "text_loss": 0.4161235988140106
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 45.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 2.0703832916140476e-05,
+ "loss": 0.0034,
+ "macro_f1": 1.0,
+ "num_tokens": 15669774.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006201022770255804,
+ "skip_count": 1.0,
+ "step": 9718,
+ "text_loss": 0.42691144347190857
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 2.061578004997805e-05,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15672943.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033355073537677526,
+ "skip_count": 1.0,
+ "step": 9720,
+ "text_loss": 0.9724727869033813
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.64338127384796,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 2.0527910884753033e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15677847.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019593657925724983,
+ "skip_count": 0.0,
+ "step": 9722,
+ "text_loss": 0.417218416929245
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.65277370120341,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 2.0440225454137097e-05,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15681460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007862947881221771,
+ "skip_count": 2.0,
+ "step": 9724,
+ "text_loss": 0.24983589351177216
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 2.0352723791731364e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15685496.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004811233840882778,
+ "skip_count": 0.0,
+ "step": 9726,
+ "text_loss": 0.32930606603622437
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.8571428656578064,
+ "avg_layers": 22.0,
+ "epoch": 45.671558555914295,
+ "f1_execute": 0.9767441749572754,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.9230769276618958,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 2.0265405931066626e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.633273720741272,
+ "num_tokens": 15688661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02648334763944149,
+ "skip_count": 7.0,
+ "step": 9728,
+ "text_loss": 0.42316386103630066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 45.68095098326974,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 2.0178271905603395e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 15692778.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04439396783709526,
+ "skip_count": 3.0,
+ "step": 9730,
+ "text_loss": 0.32248371839523315
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 2.0091321748731517e-05,
+ "loss": 0.0077,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15695821.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020437403582036495,
+ "skip_count": 2.0,
+ "step": 9732,
+ "text_loss": 0.5959160923957825
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.699735837980626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 2.000455549377045e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15699324.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002844796108547598,
+ "skip_count": 0.0,
+ "step": 9734,
+ "text_loss": 0.45465928316116333
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 1.9917973173969204e-05,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15702044.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003548701060935855,
+ "skip_count": 0.0,
+ "step": 9736,
+ "text_loss": 0.7129027843475342
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.8333333134651184,
+ "avg_layers": 23.0,
+ "epoch": 45.71852069269152,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.9090909361839294,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 1.9831574822506248e-05,
+ "loss": 0.0089,
+ "macro_f1": 0.6289562582969666,
+ "num_tokens": 15705474.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.023800918832421303,
+ "skip_count": 6.0,
+ "step": 9738,
+ "text_loss": 0.28479668498039246
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.72791312004696,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 1.9745360472489648e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15708323.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01043168269097805,
+ "skip_count": 2.0,
+ "step": 9740,
+ "text_loss": 0.4760739803314209
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 45.73730554740241,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 1.9659330156956867e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15711390.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006430295296013355,
+ "skip_count": 2.0,
+ "step": 9742,
+ "text_loss": 0.13933971524238586
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 45.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 1.957348390887487e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15714077.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005738302133977413,
+ "skip_count": 3.0,
+ "step": 9744,
+ "text_loss": 0.49661460518836975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 1.948782176114017e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15716818.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011776578612625599,
+ "skip_count": 0.0,
+ "step": 9746,
+ "text_loss": 0.36066678166389465
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 1.9402343746578567e-05,
+ "loss": 0.0081,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15720756.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005322427023202181,
+ "skip_count": 0.0,
+ "step": 9748,
+ "text_loss": 0.5549091696739197
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 1.931704989794547e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15724516.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001399765140376985,
+ "skip_count": 0.0,
+ "step": 9750,
+ "text_loss": 0.21269696950912476
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 1.9231940247925572e-05,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15727142.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018337799701839685,
+ "skip_count": 1.0,
+ "step": 9752,
+ "text_loss": 0.18105024099349976
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 1.914701482913317e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15730023.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010057559702545404,
+ "skip_count": 0.0,
+ "step": 9754,
+ "text_loss": 0.477859228849411
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 45.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 1.906227367411173e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15733108.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002486895304173231,
+ "skip_count": 3.0,
+ "step": 9756,
+ "text_loss": 0.4802452027797699
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 45.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 1.8977716815334335e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15736130.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004353616386651993,
+ "skip_count": 0.0,
+ "step": 9758,
+ "text_loss": 0.5479429960250854
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 1.8893344285203228e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15738691.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031500225886702538,
+ "skip_count": 1.0,
+ "step": 9760,
+ "text_loss": 0.6871381402015686
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0673828125,
+ "learning_rate": 1.8809156116050164e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15741682.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023419202771037817,
+ "skip_count": 0.0,
+ "step": 9762,
+ "text_loss": 0.6725277900695801
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.8406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 1.8725152340136163e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15745314.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018769606249406934,
+ "skip_count": 0.0,
+ "step": 9764,
+ "text_loss": 0.4549144506454468
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 1.864133298965176e-05,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 15747982.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030958254355937243,
+ "skip_count": 2.0,
+ "step": 9766,
+ "text_loss": 0.4970727264881134
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 1.8557698096716534e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15750453.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020812496077269316,
+ "skip_count": 1.0,
+ "step": 9768,
+ "text_loss": 0.7540801167488098
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 1.847424769337963e-05,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15753857.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031040434259921312,
+ "skip_count": 0.0,
+ "step": 9770,
+ "text_loss": 0.5154248476028442
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.878191957734074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 1.8390981811619356e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15756742.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002128311200067401,
+ "skip_count": 0.0,
+ "step": 9772,
+ "text_loss": 0.7327702045440674
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.887584385089525,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 1.8307900483343354e-05,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15759833.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003279880853369832,
+ "skip_count": 1.0,
+ "step": 9774,
+ "text_loss": 0.2673797607421875
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 1.8225003740388545e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15762768.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004170822445303202,
+ "skip_count": 0.0,
+ "step": 9776,
+ "text_loss": 0.1820847988128662
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.8888888955116272,
+ "avg_layers": 21.0,
+ "epoch": 45.90636923980041,
+ "f1_execute": 0.9729729890823364,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.9411765336990356,
+ "grad_norm": 0.0194091796875,
+ "learning_rate": 1.8142291614521132e-05,
+ "loss": 0.0045,
+ "macro_f1": 0.9713832139968872,
+ "num_tokens": 15766965.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.022715313360095024,
+ "skip_count": 9.0,
+ "step": 9778,
+ "text_loss": 0.5590897798538208
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 1.8059764137436596e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15770199.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007280370220541954,
+ "skip_count": 1.0,
+ "step": 9780,
+ "text_loss": 0.28117987513542175
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 1.7977421340759582e-05,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15773367.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003529706271365285,
+ "skip_count": 0.0,
+ "step": 9782,
+ "text_loss": 0.18752245604991913
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03662109375,
+ "learning_rate": 1.7895263256044013e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15776976.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025916248559951782,
+ "skip_count": 1.0,
+ "step": 9784,
+ "text_loss": 0.6330561637878418
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 45.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.042724609375,
+ "learning_rate": 1.781328991477299e-05,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15780848.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0049234069883823395,
+ "skip_count": 1.0,
+ "step": 9786,
+ "text_loss": 0.15685316920280457
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 45.95333137657764,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 1.7731501348358882e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 15783808.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.011918511241674423,
+ "skip_count": 1.0,
+ "step": 9788,
+ "text_loss": 0.23963648080825806
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 1.7649897588143226e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15787421.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018508053617551923,
+ "skip_count": 0.0,
+ "step": 9790,
+ "text_loss": 0.49311593174934387
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 1.7568478665396736e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15790274.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006157457246445119,
+ "skip_count": 0.0,
+ "step": 9792,
+ "text_loss": 0.4567435085773468
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 45.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 1.7487244611319285e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15794462.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.0031584864482283592,
+ "skip_count": 0.0,
+ "step": 9794,
+ "text_loss": 0.4325876832008362
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 45.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 1.740619545703992e-05,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15797775.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028455168940126896,
+ "skip_count": 0.0,
+ "step": 9796,
+ "text_loss": 0.1487245261669159
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 46.0,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.06201171875,
+ "learning_rate": 1.7325331233616847e-05,
+ "loss": 0.0078,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 15801092.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.02560117095708847,
+ "skip_count": 4.0,
+ "step": 9798,
+ "text_loss": 0.5299228429794312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 46.00939242735544,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 1.7244651972037284e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 15804049.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010446238331496716,
+ "skip_count": 3.0,
+ "step": 9800,
+ "text_loss": 0.6591248512268066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 1.7164157703217886e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15807683.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017791346181184053,
+ "skip_count": 0.0,
+ "step": 9802,
+ "text_loss": 0.45421653985977173
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 1.7083848458004035e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15810743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008831496234051883,
+ "skip_count": 0.0,
+ "step": 9804,
+ "text_loss": 0.5535439848899841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 1.7003724267170394e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15813880.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002800740534439683,
+ "skip_count": 0.0,
+ "step": 9806,
+ "text_loss": 0.5228974223136902
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 46.046962136777225,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 1.6923785161420845e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15816808.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006823428440839052,
+ "skip_count": 3.0,
+ "step": 9808,
+ "text_loss": 0.48018959164619446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 1.6844031171388052e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15819803.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004808149300515652,
+ "skip_count": 0.0,
+ "step": 9810,
+ "text_loss": 0.31094294786453247
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 1.6764462327633955e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15822861.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026099751703441143,
+ "skip_count": 0.0,
+ "step": 9812,
+ "text_loss": 0.5534207224845886
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 1.668507866064939e-05,
+ "loss": 0.0072,
+ "macro_f1": 1.0,
+ "num_tokens": 15825960.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008356450125575066,
+ "skip_count": 2.0,
+ "step": 9814,
+ "text_loss": 0.40162262320518494
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 1.660588020085452e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15828906.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006548966746777296,
+ "skip_count": 2.0,
+ "step": 9816,
+ "text_loss": 0.2071811705827713
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.09392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 1.652686697859823e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15831935.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007895465241745114,
+ "skip_count": 0.0,
+ "step": 9818,
+ "text_loss": 0.6879562735557556
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 1.6448039024158534e-05,
+ "loss": 0.0037,
+ "macro_f1": 1.0,
+ "num_tokens": 15835745.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00370208453387022,
+ "skip_count": 2.0,
+ "step": 9820,
+ "text_loss": 0.6139163970947266
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 1.6369396367742483e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15838373.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002627170644700527,
+ "skip_count": 0.0,
+ "step": 9822,
+ "text_loss": 0.3881947100162506
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.018798828125,
+ "learning_rate": 1.6290939039486084e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15841156.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005191941745579243,
+ "skip_count": 2.0,
+ "step": 9824,
+ "text_loss": 0.6564247608184814
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 46.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0306396484375,
+ "learning_rate": 1.621266706945429e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15843877.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003889352548867464,
+ "skip_count": 0.0,
+ "step": 9826,
+ "text_loss": 0.7128682136535645
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 46.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 1.6134580487641047e-05,
+ "loss": 0.0031,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15846880.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00674893194809556,
+ "skip_count": 4.0,
+ "step": 9828,
+ "text_loss": 0.30893367528915405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.027587890625,
+ "learning_rate": 1.6056679323969425e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15850130.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009898045100271702,
+ "skip_count": 0.0,
+ "step": 9830,
+ "text_loss": 0.6550688743591309
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 46.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 1.5978963608291154e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15853578.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0046016750857234,
+ "skip_count": 0.0,
+ "step": 9832,
+ "text_loss": 0.43872204422950745
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 1.5901433370387132e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15857939.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004589201882481575,
+ "skip_count": 1.0,
+ "step": 9834,
+ "text_loss": 0.41940808296203613
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 1.5824088639967094e-05,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15860584.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018899316200986505,
+ "skip_count": 1.0,
+ "step": 9836,
+ "text_loss": 0.5105440616607666
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 1.5746929446669556e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15864386.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006366848247125745,
+ "skip_count": 0.0,
+ "step": 9838,
+ "text_loss": 0.5686481595039368
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.017333984375,
+ "learning_rate": 1.5669955820062254e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15869103.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0043256948702037334,
+ "skip_count": 1.0,
+ "step": 9840,
+ "text_loss": 0.16309607028961182
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 1.5593167789641483e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15872384.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00406000716611743,
+ "skip_count": 1.0,
+ "step": 9842,
+ "text_loss": 0.21662485599517822
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 46.21602582917523,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 1.551656538483259e-05,
+ "loss": 0.0076,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 15875261.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.020087692886590958,
+ "skip_count": 2.0,
+ "step": 9844,
+ "text_loss": 0.6189377903938293
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 1.5440148634989826e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15878132.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005302145145833492,
+ "skip_count": 0.0,
+ "step": 9846,
+ "text_loss": 0.34496018290519714
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 1.536391756939609e-05,
+ "loss": 0.0091,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15881381.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008405420929193497,
+ "skip_count": 2.0,
+ "step": 9848,
+ "text_loss": 0.2865080237388611
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 1.528787221726341e-05,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15884621.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016017532907426357,
+ "skip_count": 0.0,
+ "step": 9850,
+ "text_loss": 0.6104921102523804
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 1.5212012607732528e-05,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15888157.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015318389050662518,
+ "skip_count": 0.0,
+ "step": 9852,
+ "text_loss": 0.2622036933898926
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 1.5136338769872915e-05,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 15891080.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006494096480309963,
+ "skip_count": 4.0,
+ "step": 9854,
+ "text_loss": 0.23415961861610413
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 1.5060850732682928e-05,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 15895486.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.007511078380048275,
+ "skip_count": 3.0,
+ "step": 9856,
+ "text_loss": 0.7389219999313354
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 1.4985548525089709e-05,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15898747.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004874013364315033,
+ "skip_count": 2.0,
+ "step": 9858,
+ "text_loss": 0.6853085160255432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 1.4910432175949285e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15902157.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009244410903193057,
+ "skip_count": 0.0,
+ "step": 9860,
+ "text_loss": 0.8172202110290527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 46.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 1.4835501714046296e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15905012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00456853536888957,
+ "skip_count": 3.0,
+ "step": 9862,
+ "text_loss": 0.7527797818183899
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 1.4760757168094275e-05,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15908302.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009686833946034312,
+ "skip_count": 0.0,
+ "step": 9864,
+ "text_loss": 0.5548131465911865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 1.4686198566735531e-05,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15911923.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008255072170868516,
+ "skip_count": 0.0,
+ "step": 9866,
+ "text_loss": 0.5995872020721436
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 1.4611825938540935e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15914858.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002459712326526642,
+ "skip_count": 0.0,
+ "step": 9868,
+ "text_loss": 0.6777655482292175
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017578125,
+ "learning_rate": 1.4537639312010298e-05,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15918091.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014664786867797375,
+ "skip_count": 0.0,
+ "step": 9870,
+ "text_loss": 0.42750120162963867
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 46.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 1.4463638715572103e-05,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 15920943.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005549794062972069,
+ "skip_count": 1.0,
+ "step": 9872,
+ "text_loss": 0.27477580308914185
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 1.4389824177583388e-05,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15924212.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007967505604028702,
+ "skip_count": 2.0,
+ "step": 9874,
+ "text_loss": 0.3174900412559509
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.36630466686234,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 1.4316195726330139e-05,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15929143.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014913028571754694,
+ "skip_count": 2.0,
+ "step": 9876,
+ "text_loss": 0.40919792652130127
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 1.4242753390026953e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15931702.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003994424478150904,
+ "skip_count": 0.0,
+ "step": 9878,
+ "text_loss": 0.35346853733062744
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.385089521573235,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 1.4169497196816983e-05,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 15935225.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008424114435911179,
+ "skip_count": 3.0,
+ "step": 9880,
+ "text_loss": 0.230825275182724
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 1.4096427174772164e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15938630.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004314251709729433,
+ "skip_count": 1.0,
+ "step": 9882,
+ "text_loss": 0.8749642968177795
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035400390625,
+ "learning_rate": 1.4023543351893043e-05,
+ "loss": 0.0083,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15941779.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008999531855806708,
+ "skip_count": 0.0,
+ "step": 9884,
+ "text_loss": 0.6549318432807922
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0311279296875,
+ "learning_rate": 1.3950845756108943e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15944779.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010829231468960643,
+ "skip_count": 0.0,
+ "step": 9886,
+ "text_loss": 0.5681273341178894
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 1.3878334415277583e-05,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15947757.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038863453082740307,
+ "skip_count": 1.0,
+ "step": 9888,
+ "text_loss": 0.4282133877277374
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 46.43205165835045,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017822265625,
+ "learning_rate": 1.3806009357185512e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15952223.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0006428947090171278,
+ "skip_count": 0.0,
+ "step": 9890,
+ "text_loss": 0.4455379247665405
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.441444085705896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 1.3733870609547838e-05,
+ "loss": 0.0084,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15955968.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00048406270798295736,
+ "skip_count": 0.0,
+ "step": 9892,
+ "text_loss": 0.37554407119750977
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 1.3661918200008228e-05,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15959376.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004503594245761633,
+ "skip_count": 1.0,
+ "step": 9894,
+ "text_loss": 0.22027169167995453
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 1.3590152156139012e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15962882.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011738749453797936,
+ "skip_count": 0.0,
+ "step": 9896,
+ "text_loss": 0.4203954041004181
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 1.3518572505440973e-05,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 15965816.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00806320272386074,
+ "skip_count": 2.0,
+ "step": 9898,
+ "text_loss": 0.18884631991386414
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.47901379512768,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 1.3447179275343779e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15968840.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004962162580341101,
+ "skip_count": 1.0,
+ "step": 9900,
+ "text_loss": 0.22457796335220337
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 1.3375972493205268e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15972768.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025535912718623877,
+ "skip_count": 0.0,
+ "step": 9902,
+ "text_loss": 0.14859545230865479
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 1.3304952186312114e-05,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15975380.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002036662772297859,
+ "skip_count": 0.0,
+ "step": 9904,
+ "text_loss": 0.5820382833480835
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 1.3234118381879378e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15978335.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0055219330824911594,
+ "skip_count": 2.0,
+ "step": 9906,
+ "text_loss": 0.29671815037727356
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 1.316347110705074e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15982003.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005196230486035347,
+ "skip_count": 0.0,
+ "step": 9908,
+ "text_loss": 0.5204919576644897
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023193359375,
+ "learning_rate": 1.3093010388898319e-05,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 15984937.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0032779101748019457,
+ "skip_count": 2.0,
+ "step": 9910,
+ "text_loss": 0.6803483366966248
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 1.3022736254422851e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15988992.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002347869798541069,
+ "skip_count": 0.0,
+ "step": 9912,
+ "text_loss": 0.5335546731948853
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 1.2952648730553462e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15992828.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011128517799079418,
+ "skip_count": 0.0,
+ "step": 9914,
+ "text_loss": 0.686739981174469
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.55415321397123,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 1.288274784414789e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 15995984.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031158174388110638,
+ "skip_count": 0.0,
+ "step": 9916,
+ "text_loss": 0.16102474927902222
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.059814453125,
+ "learning_rate": 1.2813033621992264e-05,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 15999606.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029228583443909883,
+ "skip_count": 1.0,
+ "step": 9918,
+ "text_loss": 0.6022558212280273
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.57293806868213,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 1.274350609080116e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16002456.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031404250767081976,
+ "skip_count": 2.0,
+ "step": 9920,
+ "text_loss": 0.7529577016830444
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 1.2674165277217653e-05,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16005547.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038669302593916655,
+ "skip_count": 0.0,
+ "step": 9922,
+ "text_loss": 0.47488540410995483
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0498046875,
+ "learning_rate": 1.2605011207813378e-05,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16009520.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004838052671402693,
+ "skip_count": 0.0,
+ "step": 9924,
+ "text_loss": 0.5252779722213745
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 46.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 1.2536043909088191e-05,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16012730.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0017430823063477874,
+ "skip_count": 0.0,
+ "step": 9926,
+ "text_loss": 0.40845534205436707
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0167236328125,
+ "learning_rate": 1.2467263407470619e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16015940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010244545992463827,
+ "skip_count": 0.0,
+ "step": 9928,
+ "text_loss": 0.8465730547904968
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 1.2398669729317357e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16018851.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007380630704574287,
+ "skip_count": 0.0,
+ "step": 9930,
+ "text_loss": 0.37603214383125305
+ },
+ {
+ "acc_repeat": 0.800000011920929,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.629292632814796,
+ "f1_execute": 0.9729729890823364,
+ "f1_repeat": 0.888888955116272,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 1.2330262900913657e-05,
+ "loss": 0.0087,
+ "macro_f1": 0.9539539813995361,
+ "num_tokens": 16022351.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.053848277777433395,
+ "skip_count": 5.0,
+ "step": 9932,
+ "text_loss": 0.2047014981508255
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 46.63868506017024,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 1.2262042948473163e-05,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16024902.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0020845322869718075,
+ "skip_count": 0.0,
+ "step": 9934,
+ "text_loss": 0.6269918084144592
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 1.2194009898137903e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16028056.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0008686805376783013,
+ "skip_count": 0.0,
+ "step": 9936,
+ "text_loss": 0.4100899398326874
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 46.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 1.212616377597825e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16032111.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004883588291704655,
+ "skip_count": 3.0,
+ "step": 9938,
+ "text_loss": 0.3921346664428711
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 1.2058504607993015e-05,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16035872.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005067490856163204,
+ "skip_count": 0.0,
+ "step": 9940,
+ "text_loss": 0.44368258118629456
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06787109375,
+ "learning_rate": 1.1991032420109238e-05,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16038923.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005819452460855246,
+ "skip_count": 2.0,
+ "step": 9942,
+ "text_loss": 0.27500197291374207
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.685647196947464,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 1.1923747238182403e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 16041803.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.035794492810964584,
+ "skip_count": 3.0,
+ "step": 9944,
+ "text_loss": 0.5083543062210083
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 1.1856649087996384e-05,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 16045258.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002845201175659895,
+ "skip_count": 2.0,
+ "step": 9946,
+ "text_loss": 0.6859534382820129
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 1.1789737995263228e-05,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16048618.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007575460476800799,
+ "skip_count": 0.0,
+ "step": 9948,
+ "text_loss": 0.4512535333633423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 1.1723013985623477e-05,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16051595.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002697878750041127,
+ "skip_count": 1.0,
+ "step": 9950,
+ "text_loss": 0.3572070300579071
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.72321690636924,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.027099609375,
+ "learning_rate": 1.16564770846459e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16054494.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0062429774552583694,
+ "skip_count": 1.0,
+ "step": 9952,
+ "text_loss": 0.5479834079742432
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.73260933372468,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 1.1590127317827492e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16057555.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009302232647314668,
+ "skip_count": 0.0,
+ "step": 9954,
+ "text_loss": 0.44800761342048645
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.74200176108013,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 1.1523964710593637e-05,
+ "loss": 0.0032,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16061072.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002112898975610733,
+ "skip_count": 0.0,
+ "step": 9956,
+ "text_loss": 0.3274081349372864
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0269775390625,
+ "learning_rate": 1.1457989288297942e-05,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16064165.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00028447998920455575,
+ "skip_count": 0.0,
+ "step": 9958,
+ "text_loss": 0.5712385773658752
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 1.1392201076222352e-05,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 16067293.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009599249809980392,
+ "skip_count": 2.0,
+ "step": 9960,
+ "text_loss": 0.26818037033081055
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.77017904314646,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.055908203125,
+ "learning_rate": 1.132660009957709e-05,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16069852.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005338563583791256,
+ "skip_count": 0.0,
+ "step": 9962,
+ "text_loss": 0.6658869981765747
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 1.1261186383500487e-05,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16072633.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001175224082544446,
+ "skip_count": 1.0,
+ "step": 9964,
+ "text_loss": 0.4461731016635895
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 46.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02099609375,
+ "learning_rate": 1.1195959953059221e-05,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16076065.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0036650802940130234,
+ "skip_count": 0.0,
+ "step": 9966,
+ "text_loss": 0.6107141971588135
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 1.113092083324818e-05,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16079309.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005924097262322903,
+ "skip_count": 2.0,
+ "step": 9968,
+ "text_loss": 0.5104627013206482
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 31.0,
+ "epoch": 46.807748752568244,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 1.1066069048990545e-05,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16082180.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.010777595452964306,
+ "skip_count": 0.0,
+ "step": 9970,
+ "text_loss": 0.5205907225608826
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.81714117992369,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.056640625,
+ "learning_rate": 1.100140462513749e-05,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16084654.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0019593914039433002,
+ "skip_count": 0.0,
+ "step": 9972,
+ "text_loss": 0.36411789059638977
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 46.82653360727913,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 1.0936927586468693e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 16087736.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0233579371124506,
+ "skip_count": 4.0,
+ "step": 9974,
+ "text_loss": 0.267604261636734
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 1.0872637957691833e-05,
+ "loss": 0.0048,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16090838.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00034629934816621244,
+ "skip_count": 0.0,
+ "step": 9976,
+ "text_loss": 0.576068103313446
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 1.0808535763442761e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16094084.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004253332444932312,
+ "skip_count": 0.0,
+ "step": 9978,
+ "text_loss": 0.5883988738059998
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 1.0744621028285662e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16097432.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005800648941658437,
+ "skip_count": 0.0,
+ "step": 9980,
+ "text_loss": 0.3358926475048065
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 46.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 1.068089377671272e-05,
+ "loss": 0.0074,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16100711.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015245937975123525,
+ "skip_count": 0.0,
+ "step": 9982,
+ "text_loss": 0.6802405714988708
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0225830078125,
+ "learning_rate": 1.061735403314429e-05,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16103952.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002281307242810726,
+ "skip_count": 1.0,
+ "step": 9984,
+ "text_loss": 0.3086298406124115
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 1.055400182192906e-05,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16107101.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007910717977210879,
+ "skip_count": 0.0,
+ "step": 9986,
+ "text_loss": 0.7036139965057373
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 46.89228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 1.0490837167343559e-05,
+ "loss": 0.0077,
+ "macro_f1": 1.0,
+ "num_tokens": 16110316.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030006880406290293,
+ "skip_count": 1.0,
+ "step": 9988,
+ "text_loss": 0.4638058841228485
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 1.04278600935927e-05,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16113206.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006434856331907213,
+ "skip_count": 0.0,
+ "step": 9990,
+ "text_loss": 0.6155068874359131
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052734375,
+ "learning_rate": 1.0365070624809403e-05,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16116098.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007891099085099995,
+ "skip_count": 0.0,
+ "step": 9992,
+ "text_loss": 0.4537872076034546
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 46.92045788083358,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0419921875,
+ "learning_rate": 1.0302468785054641e-05,
+ "loss": 0.0054,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 16119344.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.011918486095964909,
+ "skip_count": 1.0,
+ "step": 9994,
+ "text_loss": 0.18828579783439636
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 1.0240054598317672e-05,
+ "loss": 0.0046,
+ "macro_f1": 1.0,
+ "num_tokens": 16122615.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016306765377521515,
+ "skip_count": 2.0,
+ "step": 9996,
+ "text_loss": 0.2876183092594147
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 46.93924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 1.0177828088515694e-05,
+ "loss": 0.0033,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16125506.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00393108231946826,
+ "skip_count": 1.0,
+ "step": 9998,
+ "text_loss": 0.6387818455696106
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 1.011578927949397e-05,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16128499.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001175055862404406,
+ "skip_count": 0.0,
+ "step": 10000,
+ "text_loss": 0.4085952639579773
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 1.0053938195025925e-05,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16130888.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029882853850722313,
+ "skip_count": 0.0,
+ "step": 10002,
+ "text_loss": 0.36795294284820557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 9.992274858812988e-06,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16133875.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0064101857133209705,
+ "skip_count": 2.0,
+ "step": 10004,
+ "text_loss": 0.30780166387557983
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 46.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 9.930799294484704e-06,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16136826.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.004496502690017223,
+ "skip_count": 0.0,
+ "step": 10006,
+ "text_loss": 0.321386456489563
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 46.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 9.869511525598617e-06,
+ "loss": 0.007,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16140429.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007862923666834831,
+ "skip_count": 2.0,
+ "step": 10008,
+ "text_loss": 0.3304281234741211
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 46.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 9.80841157564033e-06,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16143280.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007891185232438147,
+ "skip_count": 0.0,
+ "step": 10010,
+ "text_loss": 0.6880549788475037
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 9.747499468023391e-06,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16146124.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00044356059515848756,
+ "skip_count": 0.0,
+ "step": 10012,
+ "text_loss": 0.7140262126922607
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0250244140625,
+ "learning_rate": 9.686775226089462e-06,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16148732.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003097282024100423,
+ "skip_count": 0.0,
+ "step": 10014,
+ "text_loss": 0.5629494190216064
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.02348106838861,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 9.626238873108262e-06,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16151364.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006588284857571125,
+ "skip_count": 1.0,
+ "step": 10016,
+ "text_loss": 0.20520731806755066
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 9.565890432277346e-06,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16154526.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000600519881118089,
+ "skip_count": 0.0,
+ "step": 10018,
+ "text_loss": 0.428753525018692
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 47.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0185546875,
+ "learning_rate": 9.50572992672233e-06,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16158182.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003753028344362974,
+ "skip_count": 0.0,
+ "step": 10020,
+ "text_loss": 0.4269808232784271
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.05165835045494,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 9.445757379496933e-06,
+ "loss": 0.0055,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 16161691.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.02429025247693062,
+ "skip_count": 3.0,
+ "step": 10022,
+ "text_loss": 0.26357248425483704
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 47.061050777810394,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.04296875,
+ "learning_rate": 9.385972813582721e-06,
+ "loss": 0.0056,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 16164862.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.021486395969986916,
+ "skip_count": 4.0,
+ "step": 10024,
+ "text_loss": 0.4035261273384094
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 47.07044320516584,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 9.326376251889201e-06,
+ "loss": 0.0059,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 16169410.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.017894137650728226,
+ "skip_count": 1.0,
+ "step": 10026,
+ "text_loss": 0.5168870091438293
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 9.266967717253938e-06,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16172430.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033336186315864325,
+ "skip_count": 0.0,
+ "step": 10028,
+ "text_loss": 0.5204904079437256
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 9.207747232442331e-06,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16175797.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022511237766593695,
+ "skip_count": 0.0,
+ "step": 10030,
+ "text_loss": 0.19971035420894623
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.09862048723217,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0281982421875,
+ "learning_rate": 9.148714820147841e-06,
+ "loss": 0.0082,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 16178636.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.03046531230211258,
+ "skip_count": 1.0,
+ "step": 10032,
+ "text_loss": 0.7068908214569092
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02294921875,
+ "learning_rate": 9.089870502991815e-06,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16182658.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013325439067557454,
+ "skip_count": 1.0,
+ "step": 10034,
+ "text_loss": 0.5161240100860596
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.117405341943055,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 9.031214303523493e-06,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16186669.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041415193118155,
+ "skip_count": 0.0,
+ "step": 10036,
+ "text_loss": 0.17281492054462433
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 47.126797769298506,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 8.972746244219953e-06,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16189676.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00235518510453403,
+ "skip_count": 0.0,
+ "step": 10038,
+ "text_loss": 0.776432991027832
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 47.13619019665395,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 8.914466347486382e-06,
+ "loss": 0.0072,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 16193068.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.020981203764677048,
+ "skip_count": 1.0,
+ "step": 10040,
+ "text_loss": 0.6855355501174927
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 25.0,
+ "epoch": 47.14558262400939,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.046875,
+ "learning_rate": 8.856374635655695e-06,
+ "loss": 0.006,
+ "macro_f1": 0.9555556178092957,
+ "num_tokens": 16195878.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.017154231667518616,
+ "skip_count": 5.0,
+ "step": 10042,
+ "text_loss": 0.7087341547012329
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 47.154975051364836,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.018310546875,
+ "learning_rate": 8.798471130988695e-06,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 16198502.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0036271605640649796,
+ "skip_count": 1.0,
+ "step": 10044,
+ "text_loss": 0.6096780300140381
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 47.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 8.740755855674243e-06,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16201043.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00554735166952014,
+ "skip_count": 3.0,
+ "step": 10046,
+ "text_loss": 0.4441182613372803
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.17375990607572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 8.683228831828816e-06,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16204332.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031374485697597265,
+ "skip_count": 2.0,
+ "step": 10048,
+ "text_loss": 0.7983347773551941
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029052734375,
+ "learning_rate": 8.625890081497001e-06,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16207800.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00201304629445076,
+ "skip_count": 0.0,
+ "step": 10050,
+ "text_loss": 0.34401828050613403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.19254476078662,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 8.568739626651002e-06,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16210826.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0021288148127496243,
+ "skip_count": 0.0,
+ "step": 10052,
+ "text_loss": 0.27440160512924194
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 47.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 8.51177748919102e-06,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16213643.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.002644419437274337,
+ "skip_count": 0.0,
+ "step": 10054,
+ "text_loss": 0.33396100997924805
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
+ "epoch": 47.211329615497505,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.04296875,
+ "learning_rate": 8.45500369094504e-06,
+ "loss": 0.0052,
+ "macro_f1": 0.9452888369560242,
+ "num_tokens": 16216646.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.048469074070453644,
+ "skip_count": 4.0,
+ "step": 10056,
+ "text_loss": 0.3018307089805603
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.016845703125,
+ "learning_rate": 8.398418253668937e-06,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16219499.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013763440074399114,
+ "skip_count": 0.0,
+ "step": 10058,
+ "text_loss": 0.39421531558036804
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 8.342021199046312e-06,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16223062.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004151828121393919,
+ "skip_count": 1.0,
+ "step": 10060,
+ "text_loss": 0.16675396263599396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 8.285812548688654e-06,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16226201.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003218848491087556,
+ "skip_count": 1.0,
+ "step": 10062,
+ "text_loss": 0.6134784817695618
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.248899324919286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 8.229792324135177e-06,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16229230.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0058194492012262344,
+ "skip_count": 2.0,
+ "step": 10064,
+ "text_loss": 0.19825725257396698
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 47.25829175227473,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 8.173960546852987e-06,
+ "loss": 0.0073,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 16232222.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03991774469614029,
+ "skip_count": 4.0,
+ "step": 10066,
+ "text_loss": 0.2851788103580475
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 8.11831723823686e-06,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16236960.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011416112538427114,
+ "skip_count": 0.0,
+ "step": 10068,
+ "text_loss": 0.32021182775497437
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.27707660698562,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0361328125,
+ "learning_rate": 8.062862419609519e-06,
+ "loss": 0.0073,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 16240419.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015871701762080193,
+ "skip_count": 0.0,
+ "step": 10070,
+ "text_loss": 0.21992693841457367
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 47.28646903434106,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0625,
+ "learning_rate": 8.007596112221293e-06,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 16243273.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004018099047243595,
+ "skip_count": 1.0,
+ "step": 10072,
+ "text_loss": 0.4440346658229828
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.295861461696504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 7.952518337250303e-06,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16247268.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004422081634402275,
+ "skip_count": 0.0,
+ "step": 10074,
+ "text_loss": 0.3484672009944916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 7.897629115802551e-06,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16250590.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003315444104373455,
+ "skip_count": 0.0,
+ "step": 10076,
+ "text_loss": 0.32249578833580017
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.3146463164074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 7.842928468911603e-06,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16253605.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002227222314104438,
+ "skip_count": 0.0,
+ "step": 10078,
+ "text_loss": 0.4467211961746216
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.054931640625,
+ "learning_rate": 7.788416417538857e-06,
+ "loss": 0.0117,
+ "macro_f1": 1.0,
+ "num_tokens": 16256521.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010048549622297287,
+ "skip_count": 3.0,
+ "step": 10080,
+ "text_loss": 0.29726436734199524
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 47.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 7.734092982573493e-06,
+ "loss": 0.0035,
+ "macro_f1": 1.0,
+ "num_tokens": 16259721.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0012925490736961365,
+ "skip_count": 1.0,
+ "step": 10082,
+ "text_loss": 0.45976048707962036
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 7.679958184832302e-06,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16262741.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006292753387242556,
+ "skip_count": 0.0,
+ "step": 10084,
+ "text_loss": 0.32511985301971436
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.35221602582917,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 7.626012045059916e-06,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16266080.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005420933943241835,
+ "skip_count": 2.0,
+ "step": 10086,
+ "text_loss": 0.20795102417469025
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 7.572254583928406e-06,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16269733.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003182400716468692,
+ "skip_count": 2.0,
+ "step": 10088,
+ "text_loss": 0.13773657381534576
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 7.5186858220379625e-06,
+ "loss": 0.0037,
+ "macro_f1": 1.0,
+ "num_tokens": 16273812.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.008067524060606956,
+ "skip_count": 2.0,
+ "step": 10090,
+ "text_loss": 0.26591432094573975
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03076171875,
+ "learning_rate": 7.4653057799161096e-06,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16276550.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017690346576273441,
+ "skip_count": 0.0,
+ "step": 10092,
+ "text_loss": 0.6460638642311096
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.026123046875,
+ "learning_rate": 7.412114478018261e-06,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16280012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009686960838735104,
+ "skip_count": 0.0,
+ "step": 10094,
+ "text_loss": 0.5548131465911865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.3991781626064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 7.359111936727281e-06,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16282986.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003071374725550413,
+ "skip_count": 2.0,
+ "step": 10096,
+ "text_loss": 0.09838774055242538
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 7.306298176354032e-06,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16285647.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028131429571658373,
+ "skip_count": 0.0,
+ "step": 10098,
+ "text_loss": 0.15995968878269196
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 7.253673217136658e-06,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16289663.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003445233218371868,
+ "skip_count": 0.0,
+ "step": 10100,
+ "text_loss": 0.2618424892425537
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.427355444672735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 7.201237079241252e-06,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16293270.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006494173314422369,
+ "skip_count": 0.0,
+ "step": 10102,
+ "text_loss": 0.26529571413993835
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 47.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 7.1489897827614615e-06,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16296633.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0019948924891650677,
+ "skip_count": 0.0,
+ "step": 10104,
+ "text_loss": 0.774922251701355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 7.096931347718494e-06,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16299679.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020289800595492125,
+ "skip_count": 1.0,
+ "step": 10106,
+ "text_loss": 0.715824544429779
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.455532726739065,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 7.04506179406128e-06,
+ "loss": 0.0029,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16303207.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012691093143075705,
+ "skip_count": 0.0,
+ "step": 10108,
+ "text_loss": 0.4474022090435028
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 6.993381141666255e-06,
+ "loss": 0.0066,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16306842.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004444579128175974,
+ "skip_count": 0.0,
+ "step": 10110,
+ "text_loss": 0.3689751625061035
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 6.9418894103376315e-06,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16310919.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007234106888063252,
+ "skip_count": 0.0,
+ "step": 10112,
+ "text_loss": 0.7767618298530579
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 6.890586619807126e-06,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16313780.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017169835045933723,
+ "skip_count": 1.0,
+ "step": 10114,
+ "text_loss": 0.4885733127593994
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.49310243616085,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0196533203125,
+ "learning_rate": 6.839472789733958e-06,
+ "loss": 0.0043,
+ "macro_f1": 1.0,
+ "num_tokens": 16317529.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.007271626964211464,
+ "skip_count": 5.0,
+ "step": 10116,
+ "text_loss": 0.6611388921737671
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 6.788547939705181e-06,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16320105.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022054670844227076,
+ "skip_count": 0.0,
+ "step": 10118,
+ "text_loss": 0.18132901191711426
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.016845703125,
+ "learning_rate": 6.737812089235185e-06,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16323080.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006391640636138618,
+ "skip_count": 0.0,
+ "step": 10120,
+ "text_loss": 0.32267218828201294
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 6.68726525776614e-06,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16326006.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00046651664888486266,
+ "skip_count": 0.0,
+ "step": 10122,
+ "text_loss": 0.4213443100452423
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.53067214558262,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 6.6369074646676635e-06,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16329108.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025715050287544727,
+ "skip_count": 2.0,
+ "step": 10124,
+ "text_loss": 0.48734065890312195
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 6.58673872923693e-06,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16332106.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001850960310548544,
+ "skip_count": 0.0,
+ "step": 10126,
+ "text_loss": 1.0562689304351807
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 47.549457000293515,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 6.536759070698672e-06,
+ "loss": 0.0045,
+ "macro_f1": 0.8814815282821655,
+ "num_tokens": 16334960.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.014950240030884743,
+ "skip_count": 4.0,
+ "step": 10128,
+ "text_loss": 0.8084779381752014
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 6.486968508205237e-06,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16338086.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018889640923589468,
+ "skip_count": 0.0,
+ "step": 10130,
+ "text_loss": 0.5870251059532166
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.5682418550044,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028564453125,
+ "learning_rate": 6.437367060836419e-06,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16341036.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001758610364049673,
+ "skip_count": 0.0,
+ "step": 10132,
+ "text_loss": 0.46824970841407776
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 47.577634282359845,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 6.387954747599622e-06,
+ "loss": 0.0067,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 16344236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013333287090063095,
+ "skip_count": 3.0,
+ "step": 10134,
+ "text_loss": 0.28457126021385193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 6.338731587429758e-06,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16348312.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003430357202887535,
+ "skip_count": 0.0,
+ "step": 10136,
+ "text_loss": 0.2896702289581299
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 6.289697599189181e-06,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16351044.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001170355360955,
+ "skip_count": 0.0,
+ "step": 10138,
+ "text_loss": 0.6347740292549133
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 6.240852801667752e-06,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16354443.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033906467724591494,
+ "skip_count": 2.0,
+ "step": 10140,
+ "text_loss": 0.5276535749435425
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.61520399178163,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 6.192197213583051e-06,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16357435.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001492051873356104,
+ "skip_count": 2.0,
+ "step": 10142,
+ "text_loss": 0.49688321352005005
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 47.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0191650390625,
+ "learning_rate": 6.143730853579887e-06,
+ "loss": 0.0054,
+ "macro_f1": 1.0,
+ "num_tokens": 16360993.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0024281898513436317,
+ "skip_count": 1.0,
+ "step": 10144,
+ "text_loss": 0.49487611651420593
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
+ "epoch": 47.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05615234375,
+ "learning_rate": 6.095453740230683e-06,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16364196.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006147443782538176,
+ "skip_count": 5.0,
+ "step": 10146,
+ "text_loss": 0.3056519329547882
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 47.64338127384796,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 6.047365892035361e-06,
+ "loss": 0.005,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 16368152.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.015886440873146057,
+ "skip_count": 0.0,
+ "step": 10148,
+ "text_loss": 0.6246888637542725
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.65277370120341,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 5.999467327421182e-06,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16371538.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004134364426136017,
+ "skip_count": 0.0,
+ "step": 10150,
+ "text_loss": 0.38278883695602417
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 5.951758064743018e-06,
+ "loss": 0.006,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16374324.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0050895679742097855,
+ "skip_count": 2.0,
+ "step": 10152,
+ "text_loss": 0.7034569978713989
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 5.9042381222831345e-06,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16377386.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0024351398460566998,
+ "skip_count": 0.0,
+ "step": 10154,
+ "text_loss": 0.5222152471542358
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 5.856907518251298e-06,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16380449.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010605348274111748,
+ "skip_count": 2.0,
+ "step": 10156,
+ "text_loss": 0.3262309730052948
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 28.0,
+ "epoch": 47.69034341062518,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 5.8097662707846664e-06,
+ "loss": 0.0062,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 16383519.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.022603167220950127,
+ "skip_count": 3.0,
+ "step": 10158,
+ "text_loss": 0.28901928663253784
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.699735837980626,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 5.7628143979478465e-06,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16386345.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004141980782151222,
+ "skip_count": 0.0,
+ "step": 10160,
+ "text_loss": 0.2058449685573578
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 5.7160519177328344e-06,
+ "loss": 0.0078,
+ "macro_f1": 1.0,
+ "num_tokens": 16389766.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004226053133606911,
+ "skip_count": 3.0,
+ "step": 10162,
+ "text_loss": 0.5554977655410767
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.71852069269152,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 5.66947884805924e-06,
+ "loss": 0.0069,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16392323.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036407415755093098,
+ "skip_count": 1.0,
+ "step": 10164,
+ "text_loss": 0.43077412247657776
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.72791312004696,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 5.623095206773788e-06,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16395555.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020233208779245615,
+ "skip_count": 0.0,
+ "step": 10166,
+ "text_loss": 0.654839813709259
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.73730554740241,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 5.57690101165087e-06,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 16398949.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006491049658507109,
+ "skip_count": 2.0,
+ "step": 10168,
+ "text_loss": 0.2042955756187439
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0206298828125,
+ "learning_rate": 5.530896280392217e-06,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16402654.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032798724714666605,
+ "skip_count": 1.0,
+ "step": 10170,
+ "text_loss": 0.303030401468277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.756090402113294,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 5.485081030626838e-06,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16405701.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010711143258959055,
+ "skip_count": 2.0,
+ "step": 10172,
+ "text_loss": 0.3775373101234436
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 47.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 5.4394552799112985e-06,
+ "loss": 0.0066,
+ "macro_f1": 1.0,
+ "num_tokens": 16408999.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038391631096601486,
+ "skip_count": 1.0,
+ "step": 10174,
+ "text_loss": 0.20590868592262268
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.77487525682419,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 5.394019045729448e-06,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16412045.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016695939702913165,
+ "skip_count": 0.0,
+ "step": 10176,
+ "text_loss": 0.5118611454963684
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.78426768417963,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030029296875,
+ "learning_rate": 5.348772345492525e-06,
+ "loss": 0.0065,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16415107.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007850619731470942,
+ "skip_count": 0.0,
+ "step": 10178,
+ "text_loss": 0.6818836331367493
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 5.30371519653916e-06,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16418012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0039045598823577166,
+ "skip_count": 0.0,
+ "step": 10180,
+ "text_loss": 0.5973153710365295
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 5.258847616135376e-06,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16420715.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0035636175889521837,
+ "skip_count": 2.0,
+ "step": 10182,
+ "text_loss": 0.5864625573158264
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 5.214169621474419e-06,
+ "loss": 0.007,
+ "macro_f1": 1.0,
+ "num_tokens": 16423561.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0038354399148374796,
+ "skip_count": 3.0,
+ "step": 10184,
+ "text_loss": 0.6486931443214417
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.821837393601406,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 5.169681229677037e-06,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16426737.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029660905711352825,
+ "skip_count": 0.0,
+ "step": 10186,
+ "text_loss": 0.32970958948135376
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.017333984375,
+ "learning_rate": 5.125382457791316e-06,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16429810.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022391141392290592,
+ "skip_count": 0.0,
+ "step": 10188,
+ "text_loss": 0.5421582460403442
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.8406222483123,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 5.081273322792512e-06,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 16433324.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009630356915295124,
+ "skip_count": 2.0,
+ "step": 10190,
+ "text_loss": 0.29760071635246277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 5.037353841583436e-06,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16436466.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005291678477078676,
+ "skip_count": 0.0,
+ "step": 10192,
+ "text_loss": 0.31106626987457275
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 47.85940710302319,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 4.99362403099407e-06,
+ "loss": 0.0065,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 16439152.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01734933815896511,
+ "skip_count": 1.0,
+ "step": 10194,
+ "text_loss": 0.575576901435852
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 47.86879953037863,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 4.950083907781733e-06,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 16442387.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.011718297377228737,
+ "skip_count": 2.0,
+ "step": 10196,
+ "text_loss": 0.19005915522575378
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.878191957734074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 4.906733488631187e-06,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16445391.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015040510334074497,
+ "skip_count": 0.0,
+ "step": 10198,
+ "text_loss": 0.6865255236625671
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 47.887584385089525,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 4.863572790154258e-06,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16449229.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001746732392348349,
+ "skip_count": 0.0,
+ "step": 10200,
+ "text_loss": 0.4538392722606659
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 4.82060182889027e-06,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16452590.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009971166728064418,
+ "skip_count": 0.0,
+ "step": 10202,
+ "text_loss": 0.7585988640785217
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 4.777820621305828e-06,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16455458.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005328746512532234,
+ "skip_count": 2.0,
+ "step": 10204,
+ "text_loss": 0.558459460735321
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.915761667155856,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0595703125,
+ "learning_rate": 4.735229183794709e-06,
+ "loss": 0.0056,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 16458749.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04486622288823128,
+ "skip_count": 2.0,
+ "step": 10206,
+ "text_loss": 0.15466898679733276
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.9251540945113,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 4.692827532678023e-06,
+ "loss": 0.0042,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16461257.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005598196294158697,
+ "skip_count": 1.0,
+ "step": 10208,
+ "text_loss": 0.1840037852525711
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0172119140625,
+ "learning_rate": 4.650615684204163e-06,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16465271.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015303631080314517,
+ "skip_count": 0.0,
+ "step": 10210,
+ "text_loss": 0.45189639925956726
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 47.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 4.608593654548854e-06,
+ "loss": 0.0074,
+ "macro_f1": 1.0,
+ "num_tokens": 16468804.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015223458409309387,
+ "skip_count": 5.0,
+ "step": 10212,
+ "text_loss": 0.34667012095451355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.95333137657764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 4.566761459814939e-06,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16471953.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004154558759182692,
+ "skip_count": 0.0,
+ "step": 10214,
+ "text_loss": 0.19757303595542908
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 47.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 4.52511911603265e-06,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16475292.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002932488452643156,
+ "skip_count": 1.0,
+ "step": 10216,
+ "text_loss": 0.4767858684062958
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.972116231288524,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 4.483666639159389e-06,
+ "loss": 0.0052,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 16478361.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.009086701087653637,
+ "skip_count": 2.0,
+ "step": 10218,
+ "text_loss": 0.3097109794616699
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 47.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 4.442404045079784e-06,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16481058.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007684580981731415,
+ "skip_count": 2.0,
+ "step": 10220,
+ "text_loss": 0.4293085038661957
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 47.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034423828125,
+ "learning_rate": 4.401331349605797e-06,
+ "loss": 0.0089,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16484933.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004087725654244423,
+ "skip_count": 0.0,
+ "step": 10222,
+ "text_loss": 0.2643229067325592
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 48.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0294189453125,
+ "learning_rate": 4.360448568476561e-06,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 16488096.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.003739884588867426,
+ "skip_count": 1.0,
+ "step": 10224,
+ "text_loss": 0.5812314748764038
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 48.00939242735544,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0234375,
+ "learning_rate": 4.319755717358431e-06,
+ "loss": 0.0066,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 16490918.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01758190058171749,
+ "skip_count": 2.0,
+ "step": 10226,
+ "text_loss": 0.35358762741088867
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.01878485471089,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 4.2792528118449356e-06,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16493996.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003696965519338846,
+ "skip_count": 1.0,
+ "step": 10228,
+ "text_loss": 0.28963083028793335
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.02817728206633,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 4.238939867456937e-06,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16498163.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009691051091067493,
+ "skip_count": 0.0,
+ "step": 10230,
+ "text_loss": 0.6794275045394897
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 48.03756970942178,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 4.198816899642355e-06,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16501075.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00915334839373827,
+ "skip_count": 0.0,
+ "step": 10232,
+ "text_loss": 0.6993107795715332
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.046962136777225,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0299072265625,
+ "learning_rate": 4.158883923776447e-06,
+ "loss": 0.008,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 16504533.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010835417546331882,
+ "skip_count": 1.0,
+ "step": 10234,
+ "text_loss": 0.46092382073402405
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 22.0,
+ "epoch": 48.05635456413267,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 4.119140955161582e-06,
+ "loss": 0.0049,
+ "macro_f1": 1.0,
+ "num_tokens": 16507612.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006133808754384518,
+ "skip_count": 7.0,
+ "step": 10236,
+ "text_loss": 0.5992426872253418
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.06574699148811,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0205078125,
+ "learning_rate": 4.079588009027357e-06,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16510623.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001170355360955,
+ "skip_count": 0.0,
+ "step": 10238,
+ "text_loss": 0.6200118660926819
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.075139418843555,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 4.040225100530536e-06,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16513709.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013292148942127824,
+ "skip_count": 0.0,
+ "step": 10240,
+ "text_loss": 0.41305387020111084
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.084531846199,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 4.001052244754999e-06,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16516793.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003694178769364953,
+ "skip_count": 0.0,
+ "step": 10242,
+ "text_loss": 0.36737722158432007
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.09392427355445,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 3.962069456711903e-06,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16519837.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004149764310568571,
+ "skip_count": 2.0,
+ "step": 10244,
+ "text_loss": 0.20932413637638092
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.10331670090989,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 3.9232767513395215e-06,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16523034.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005588968750089407,
+ "skip_count": 1.0,
+ "step": 10246,
+ "text_loss": 0.22806818783283234
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 48.11270912826534,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 3.884674143503353e-06,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 16525916.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0011802187655121088,
+ "skip_count": 1.0,
+ "step": 10248,
+ "text_loss": 0.36658138036727905
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.12210155562078,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 3.846261647995897e-06,
+ "loss": 0.004,
+ "macro_f1": 1.0,
+ "num_tokens": 16529040.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010508419014513493,
+ "skip_count": 4.0,
+ "step": 10250,
+ "text_loss": 0.20360486209392548
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.131493982976224,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041015625,
+ "learning_rate": 3.8080392795369347e-06,
+ "loss": 0.0066,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16532088.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0004971205489709973,
+ "skip_count": 1.0,
+ "step": 10252,
+ "text_loss": 0.5355691313743591
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.14088641033167,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03955078125,
+ "learning_rate": 3.770007052773361e-06,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16534996.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003430357202887535,
+ "skip_count": 0.0,
+ "step": 10254,
+ "text_loss": 0.2113809734582901
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.15027883768712,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0228271484375,
+ "learning_rate": 3.732164982279185e-06,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16538077.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017093889182433486,
+ "skip_count": 0.0,
+ "step": 10256,
+ "text_loss": 0.8436145782470703
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.15967126504256,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 3.6945130825555284e-06,
+ "loss": 0.0052,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16541341.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028951778076589108,
+ "skip_count": 0.0,
+ "step": 10258,
+ "text_loss": 0.6505146026611328
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.169063692398005,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021728515625,
+ "learning_rate": 3.6570513680307395e-06,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16544277.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007293363101780415,
+ "skip_count": 1.0,
+ "step": 10260,
+ "text_loss": 0.35743454098701477
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.17845611975345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01904296875,
+ "learning_rate": 3.6197798530601124e-06,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16547527.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00111240369733423,
+ "skip_count": 0.0,
+ "step": 10262,
+ "text_loss": 0.7323034405708313
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 48.18784854710889,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 3.582698551926278e-06,
+ "loss": 0.0052,
+ "macro_f1": 1.0,
+ "num_tokens": 16550798.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.005441631190478802,
+ "skip_count": 3.0,
+ "step": 10264,
+ "text_loss": 0.2366604059934616
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.197240974464336,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 3.5458074788387585e-06,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16553653.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033211156260222197,
+ "skip_count": 2.0,
+ "step": 10266,
+ "text_loss": 0.17687638103961945
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.20663340181978,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 3.5091066479344125e-06,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16556646.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005611624801531434,
+ "skip_count": 0.0,
+ "step": 10268,
+ "text_loss": 0.5710030198097229
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 48.21602582917523,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 3.4725960732769345e-06,
+ "loss": 0.0045,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 16560057.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.025627652183175087,
+ "skip_count": 1.0,
+ "step": 10270,
+ "text_loss": 0.45811519026756287
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.22541825653067,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0303955078125,
+ "learning_rate": 3.4362757688573555e-06,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16562955.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016429986571893096,
+ "skip_count": 0.0,
+ "step": 10272,
+ "text_loss": 0.6733152866363525
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.23481068388612,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 3.4001457485935416e-06,
+ "loss": 0.0056,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16565900.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002614749362692237,
+ "skip_count": 0.0,
+ "step": 10274,
+ "text_loss": 0.659094512462616
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 48.24420311124156,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025390625,
+ "learning_rate": 3.3642060263307515e-06,
+ "loss": 0.0045,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16568986.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0007164402049966156,
+ "skip_count": 0.0,
+ "step": 10276,
+ "text_loss": 0.687470018863678
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 48.253595538597004,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 3.3284566158410244e-06,
+ "loss": 0.0058,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16571680.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013704646844416857,
+ "skip_count": 0.0,
+ "step": 10278,
+ "text_loss": 0.6212679743766785
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.26298796595245,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.032958984375,
+ "learning_rate": 3.29289753082368e-06,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16575388.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004897230304777622,
+ "skip_count": 2.0,
+ "step": 10280,
+ "text_loss": 0.2466924786567688
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.2723803933079,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 3.2575287849050394e-06,
+ "loss": 0.0085,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16578717.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002897132420912385,
+ "skip_count": 0.0,
+ "step": 10282,
+ "text_loss": 0.3043138384819031
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.28177282066334,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 3.2223503916383736e-06,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16581703.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013213366037234664,
+ "skip_count": 0.0,
+ "step": 10284,
+ "text_loss": 0.4567781686782837
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.291165248018785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 3.187362364504176e-06,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16584502.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033954589162021875,
+ "skip_count": 0.0,
+ "step": 10286,
+ "text_loss": 0.7037429809570312
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.30055767537423,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.05126953125,
+ "learning_rate": 3.152564716909889e-06,
+ "loss": 0.0082,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16587621.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013146435376256704,
+ "skip_count": 0.0,
+ "step": 10288,
+ "text_loss": 0.681390643119812
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.30995010272967,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 3.1179574621901243e-06,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16590566.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.013315175659954548,
+ "skip_count": 1.0,
+ "step": 10290,
+ "text_loss": 0.28952887654304504
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 48.319342530085116,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 3.0835406136063837e-06,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 16593642.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010560612194240093,
+ "skip_count": 1.0,
+ "step": 10292,
+ "text_loss": 0.19317017495632172
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 48.32873495744057,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.032470703125,
+ "learning_rate": 3.0493141843472293e-06,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16596938.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.00572188850492239,
+ "skip_count": 0.0,
+ "step": 10294,
+ "text_loss": 0.2277865707874298
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.33812738479601,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 3.0152781875283918e-06,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16599989.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002278512343764305,
+ "skip_count": 2.0,
+ "step": 10296,
+ "text_loss": 0.6504809260368347
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.347519812151454,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 2.981432636192438e-06,
+ "loss": 0.0067,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16603008.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00433303089812398,
+ "skip_count": 1.0,
+ "step": 10298,
+ "text_loss": 0.4959591031074524
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.3569122395069,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 2.9477775433091047e-06,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16606208.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00256242579780519,
+ "skip_count": 0.0,
+ "step": 10300,
+ "text_loss": 0.68474280834198
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
+ "epoch": 48.36630466686234,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.03125,
+ "learning_rate": 2.91431292177502e-06,
+ "loss": 0.0055,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 16610278.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.019528929144144058,
+ "skip_count": 2.0,
+ "step": 10302,
+ "text_loss": 0.5476719737052917
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.375697094217784,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.043212890625,
+ "learning_rate": 2.8810387844139807e-06,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16613922.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003266195533797145,
+ "skip_count": 2.0,
+ "step": 10304,
+ "text_loss": 0.24820174276828766
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.385089521573235,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0380859375,
+ "learning_rate": 2.8479551439766215e-06,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16617359.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007189559284597635,
+ "skip_count": 2.0,
+ "step": 10306,
+ "text_loss": 0.5665034055709839
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.39448194892868,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.052490234375,
+ "learning_rate": 2.8150620131407456e-06,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16620207.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014458110090345144,
+ "skip_count": 1.0,
+ "step": 10308,
+ "text_loss": 0.6184256076812744
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 48.40387437628412,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 2.782359404510937e-06,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16624196.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008068135008215904,
+ "skip_count": 3.0,
+ "step": 10310,
+ "text_loss": 0.22482043504714966
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.413266803639566,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01495361328125,
+ "learning_rate": 2.7498473306190043e-06,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16627754.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026512884069234133,
+ "skip_count": 0.0,
+ "step": 10312,
+ "text_loss": 0.597885012626648
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.42265923099501,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 2.717525803923593e-06,
+ "loss": 0.0074,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16630642.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003541568876244128,
+ "skip_count": 0.0,
+ "step": 10314,
+ "text_loss": 0.5806127190589905
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 48.43205165835045,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01495361328125,
+ "learning_rate": 2.685394836810351e-06,
+ "loss": 0.0027,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16634219.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0009424841264262795,
+ "skip_count": 0.0,
+ "step": 10316,
+ "text_loss": 0.5818291902542114
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.441444085705896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 2.653454441591985e-06,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16637963.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026975939981639385,
+ "skip_count": 1.0,
+ "step": 10318,
+ "text_loss": 0.4503914713859558
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.45083651306135,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0223388671875,
+ "learning_rate": 2.6217046305080926e-06,
+ "loss": 0.0042,
+ "macro_f1": 1.0,
+ "num_tokens": 16640931.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.009299893863499165,
+ "skip_count": 3.0,
+ "step": 10320,
+ "text_loss": 0.4027388393878937
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.46022894041679,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0274658203125,
+ "learning_rate": 2.5901454157252204e-06,
+ "loss": 0.0057,
+ "macro_f1": 1.0,
+ "num_tokens": 16644763.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005888781510293484,
+ "skip_count": 3.0,
+ "step": 10322,
+ "text_loss": 0.3544044494628906
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 48.469621367772234,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0201416015625,
+ "learning_rate": 2.5587768093369713e-06,
+ "loss": 0.0037,
+ "macro_f1": 1.0,
+ "num_tokens": 16647642.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005154009442776442,
+ "skip_count": 1.0,
+ "step": 10324,
+ "text_loss": 0.5421624183654785
+ },
+ {
+ "acc_repeat": 0.800000011920929,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.47901379512768,
+ "f1_execute": 0.9729729890823364,
+ "f1_repeat": 0.888888955116272,
+ "f1_skip": 1.0,
+ "grad_norm": 0.035888671875,
+ "learning_rate": 2.527598823363786e-06,
+ "loss": 0.0084,
+ "macro_f1": 0.9539539813995361,
+ "num_tokens": 16650930.0,
+ "repeat_count": 5.0,
+ "routers_loss": 0.05385079234838486,
+ "skip_count": 5.0,
+ "step": 10326,
+ "text_loss": 0.11125081777572632
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 48.48840622248312,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03369140625,
+ "learning_rate": 2.4966114697532185e-06,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16655012.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010021892376244068,
+ "skip_count": 4.0,
+ "step": 10328,
+ "text_loss": 0.6925008296966553
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 48.497798649838565,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0272216796875,
+ "learning_rate": 2.4658147603796587e-06,
+ "loss": 0.0059,
+ "macro_f1": 1.0,
+ "num_tokens": 16657811.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00567783834412694,
+ "skip_count": 2.0,
+ "step": 10330,
+ "text_loss": 0.25555673241615295
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.507191077194015,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.029541015625,
+ "learning_rate": 2.4352087070443895e-06,
+ "loss": 0.0035,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16661346.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0003201630897819996,
+ "skip_count": 0.0,
+ "step": 10332,
+ "text_loss": 0.41918623447418213
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.51658350454946,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0164794921875,
+ "learning_rate": 2.404793321475751e-06,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16664219.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0063372207805514336,
+ "skip_count": 1.0,
+ "step": 10334,
+ "text_loss": 0.2512246072292328
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.5259759319049,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 2.3745686153290315e-06,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16667608.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004200387746095657,
+ "skip_count": 1.0,
+ "step": 10336,
+ "text_loss": 0.27055928111076355
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.535368359260346,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 2.344534600186299e-06,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16670193.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005420933943241835,
+ "skip_count": 2.0,
+ "step": 10338,
+ "text_loss": 0.19804859161376953
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.54476078661579,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02978515625,
+ "learning_rate": 2.314691287556736e-06,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16673922.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022731551434844732,
+ "skip_count": 2.0,
+ "step": 10340,
+ "text_loss": 0.7323333024978638
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 48.55415321397123,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 2.2850386888763063e-06,
+ "loss": 0.0065,
+ "macro_f1": 1.0,
+ "num_tokens": 16677264.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004557183478027582,
+ "skip_count": 4.0,
+ "step": 10342,
+ "text_loss": 0.34720420837402344
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.563545641326684,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0284423828125,
+ "learning_rate": 2.2555768155079203e-06,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16680472.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001710049225948751,
+ "skip_count": 0.0,
+ "step": 10344,
+ "text_loss": 0.5197516679763794
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.57293806868213,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0203857421875,
+ "learning_rate": 2.2263056787414916e-06,
+ "loss": 0.0038,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 16684337.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.01958944834768772,
+ "skip_count": 2.0,
+ "step": 10346,
+ "text_loss": 0.3011045753955841
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.58233049603757,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.021240234375,
+ "learning_rate": 2.197225289793714e-06,
+ "loss": 0.0028,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16687209.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017791363643482327,
+ "skip_count": 0.0,
+ "step": 10348,
+ "text_loss": 0.33468589186668396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.591722923393014,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 2.168335659808285e-06,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16690418.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003162781707942486,
+ "skip_count": 1.0,
+ "step": 10350,
+ "text_loss": 0.6261059641838074
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.60111535074846,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.038330078125,
+ "learning_rate": 2.1396367998557375e-06,
+ "loss": 0.0064,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16693215.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005255573894828558,
+ "skip_count": 0.0,
+ "step": 10352,
+ "text_loss": 0.40527454018592834
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.6105077781039,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 2.1111287209335527e-06,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16696668.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033432829659432173,
+ "skip_count": 1.0,
+ "step": 10354,
+ "text_loss": 0.28645285964012146
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.619900205459345,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 2.082811433966103e-06,
+ "loss": 0.0051,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16699985.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011671687243506312,
+ "skip_count": 0.0,
+ "step": 10356,
+ "text_loss": 0.746609628200531
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
+ "epoch": 48.629292632814796,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 2.054684949804542e-06,
+ "loss": 0.0038,
+ "macro_f1": 0.5934640765190125,
+ "num_tokens": 16703300.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01967054046690464,
+ "skip_count": 3.0,
+ "step": 10358,
+ "text_loss": 0.33314839005470276
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.63868506017024,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 2.026749279227025e-06,
+ "loss": 0.0071,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 16706630.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.053273484110832214,
+ "skip_count": 2.0,
+ "step": 10360,
+ "text_loss": 0.28726521134376526
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.64807748752568,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0184326171875,
+ "learning_rate": 1.9990044329386004e-06,
+ "loss": 0.003,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16709809.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005693111452274024,
+ "skip_count": 0.0,
+ "step": 10362,
+ "text_loss": 0.4472726285457611
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.657469914881126,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.029296875,
+ "learning_rate": 1.9714504215711528e-06,
+ "loss": 0.0054,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16713089.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008012447506189346,
+ "skip_count": 1.0,
+ "step": 10364,
+ "text_loss": 0.3002646863460541
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.66686234223657,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.080078125,
+ "learning_rate": 1.9440872556833466e-06,
+ "loss": 0.0064,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16715826.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015855961246415973,
+ "skip_count": 1.0,
+ "step": 10366,
+ "text_loss": 0.4461057484149933
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.67625476959201,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 1.9169149457608504e-06,
+ "loss": 0.0048,
+ "macro_f1": 1.0,
+ "num_tokens": 16719155.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.006313335616141558,
+ "skip_count": 2.0,
+ "step": 10368,
+ "text_loss": 0.4553263485431671
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.685647196947464,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 1.889933502216168e-06,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16722191.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0025730058550834656,
+ "skip_count": 0.0,
+ "step": 10370,
+ "text_loss": 0.31290385127067566
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 48.69503962430291,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 1.8631429353885842e-06,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16725460.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.005240040831267834,
+ "skip_count": 0.0,
+ "step": 10372,
+ "text_loss": 0.4621378779411316
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.70443205165835,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 1.8365432555443318e-06,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16728690.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001968455035239458,
+ "skip_count": 1.0,
+ "step": 10374,
+ "text_loss": 0.5022224187850952
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 48.713824479013795,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 1.8101344728764234e-06,
+ "loss": 0.0063,
+ "macro_f1": 1.0,
+ "num_tokens": 16733192.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.003168600145727396,
+ "skip_count": 2.0,
+ "step": 10376,
+ "text_loss": 0.4973319470882416
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.72321690636924,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02880859375,
+ "learning_rate": 1.78391659750482e-06,
+ "loss": 0.0072,
+ "macro_f1": 0.8820862174034119,
+ "num_tokens": 16736219.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.04101128876209259,
+ "skip_count": 2.0,
+ "step": 10378,
+ "text_loss": 0.13770700991153717
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 48.73260933372468,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 1.7578896394762067e-06,
+ "loss": 0.0039,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16738840.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013390687527135015,
+ "skip_count": 0.0,
+ "step": 10380,
+ "text_loss": 0.8668286800384521
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 48.74200176108013,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025146484375,
+ "learning_rate": 1.7320536087641613e-06,
+ "loss": 0.0074,
+ "macro_f1": 0.6595745086669922,
+ "num_tokens": 16742145.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04137809947133064,
+ "skip_count": 4.0,
+ "step": 10382,
+ "text_loss": 0.19390869140625
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.751394188435576,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.037109375,
+ "learning_rate": 1.7064085152691534e-06,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16745293.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013169923331588507,
+ "skip_count": 0.0,
+ "step": 10384,
+ "text_loss": 0.6248905658721924
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 48.76078661579102,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 1.6809543688183771e-06,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 16748438.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.005269711371511221,
+ "skip_count": 1.0,
+ "step": 10386,
+ "text_loss": 0.8555964827537537
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.77017904314646,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.037841796875,
+ "learning_rate": 1.655691179165919e-06,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16752155.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005495068617165089,
+ "skip_count": 1.0,
+ "step": 10388,
+ "text_loss": 0.17478284239768982
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.77957147050191,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02392578125,
+ "learning_rate": 1.630618955992702e-06,
+ "loss": 0.004,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16755072.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012614501174539328,
+ "skip_count": 0.0,
+ "step": 10390,
+ "text_loss": 0.4476284384727478
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.78896389785735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 1.605737708906374e-06,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16759622.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0016320135910063982,
+ "skip_count": 1.0,
+ "step": 10392,
+ "text_loss": 0.6159437894821167
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.798356325212794,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 1.5810474474415858e-06,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16763114.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00686453003436327,
+ "skip_count": 2.0,
+ "step": 10394,
+ "text_loss": 0.2532145082950592
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 48.807748752568244,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 1.5565481810596582e-06,
+ "loss": 0.0052,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 16765899.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.010446256957948208,
+ "skip_count": 3.0,
+ "step": 10396,
+ "text_loss": 0.3457704186439514
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.81714117992369,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 1.5322399191487479e-06,
+ "loss": 0.006,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 16769020.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.028018560260534286,
+ "skip_count": 3.0,
+ "step": 10398,
+ "text_loss": 0.2568260133266449
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.82653360727913,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 1.5081226710237927e-06,
+ "loss": 0.0067,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16771946.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017342215869575739,
+ "skip_count": 0.0,
+ "step": 10400,
+ "text_loss": 0.21244384348392487
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 48.835926034634575,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0208740234375,
+ "learning_rate": 1.4841964459266221e-06,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16775398.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007773366756737232,
+ "skip_count": 3.0,
+ "step": 10402,
+ "text_loss": 0.2011307328939438
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.84531846199002,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 1.4604612530257356e-06,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16778556.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032797956373542547,
+ "skip_count": 1.0,
+ "step": 10404,
+ "text_loss": 0.2331003099679947
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.85471088934546,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0296630859375,
+ "learning_rate": 1.4369171014165793e-06,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16782298.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000855600053910166,
+ "skip_count": 0.0,
+ "step": 10406,
+ "text_loss": 0.37070924043655396
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 48.86410331670091,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 1.41356400012127e-06,
+ "loss": 0.0063,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16785157.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0023640329018235207,
+ "skip_count": 0.0,
+ "step": 10408,
+ "text_loss": 0.2712402939796448
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.873495744056356,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 1.390401958088816e-06,
+ "loss": 0.0059,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16787842.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0042669083923101425,
+ "skip_count": 2.0,
+ "step": 10410,
+ "text_loss": 0.1989891678094864
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.8828881714118,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 1.367430984194895e-06,
+ "loss": 0.0049,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16791063.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010778319090604782,
+ "skip_count": 2.0,
+ "step": 10412,
+ "text_loss": 0.2656673491001129
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.89228059876724,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0341796875,
+ "learning_rate": 1.3446510872420214e-06,
+ "loss": 0.0076,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 16794468.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014451594091951847,
+ "skip_count": 1.0,
+ "step": 10414,
+ "text_loss": 0.615280032157898
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.90167302612269,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 1.3220622759596014e-06,
+ "loss": 0.0085,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16797197.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002590922173112631,
+ "skip_count": 1.0,
+ "step": 10416,
+ "text_loss": 0.6224665641784668
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.91106545347813,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 1.2996645590035439e-06,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16800435.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0002690292603801936,
+ "skip_count": 0.0,
+ "step": 10418,
+ "text_loss": 0.5916928052902222
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.92045788083358,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0262451171875,
+ "learning_rate": 1.2774579449568723e-06,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16803488.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005886071361601353,
+ "skip_count": 2.0,
+ "step": 10420,
+ "text_loss": 0.33671438694000244
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 48.929850308189025,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 1.2554424423290578e-06,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16807339.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038914172910153866,
+ "skip_count": 2.0,
+ "step": 10422,
+ "text_loss": 0.11040981113910675
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.93924273554447,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 1.2336180595565738e-06,
+ "loss": 0.0038,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16810409.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001565443933941424,
+ "skip_count": 1.0,
+ "step": 10424,
+ "text_loss": 0.5290043950080872
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.94863516289991,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 1.2119848050025083e-06,
+ "loss": 0.007,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16813888.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023584216833114624,
+ "skip_count": 0.0,
+ "step": 10426,
+ "text_loss": 0.21560436487197876
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 48.958027590255355,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.036865234375,
+ "learning_rate": 1.1905426869567859e-06,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16816678.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004424029495567083,
+ "skip_count": 0.0,
+ "step": 10428,
+ "text_loss": 0.36319077014923096
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 48.9674200176108,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0286865234375,
+ "learning_rate": 1.1692917136361115e-06,
+ "loss": 0.0039,
+ "macro_f1": 1.0,
+ "num_tokens": 16819763.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.004053499549627304,
+ "skip_count": 1.0,
+ "step": 10430,
+ "text_loss": 0.6534333825111389
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.97681244496625,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 1.1482318931838043e-06,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16823415.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0022409996017813683,
+ "skip_count": 1.0,
+ "step": 10432,
+ "text_loss": 0.33003750443458557
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 48.98620487232169,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 1.1273632336700756e-06,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16826676.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0061734220944345,
+ "skip_count": 0.0,
+ "step": 10434,
+ "text_loss": 0.23123329877853394
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 48.99559729967714,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 1.106685743091862e-06,
+ "loss": 0.0044,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16829866.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0038321982137858868,
+ "skip_count": 1.0,
+ "step": 10436,
+ "text_loss": 0.2427562028169632
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.004696213677725,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031494140625,
+ "learning_rate": 1.086199429372825e-06,
+ "loss": 0.0073,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16833765.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00676750298589468,
+ "skip_count": 2.0,
+ "step": 10438,
+ "text_loss": 0.42610102891921997
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.01408864103317,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03271484375,
+ "learning_rate": 1.0659043003632962e-06,
+ "loss": 0.0058,
+ "macro_f1": 1.0,
+ "num_tokens": 16837285.0,
+ "repeat_count": 4.0,
+ "routers_loss": 0.007271626964211464,
+ "skip_count": 5.0,
+ "step": 10440,
+ "text_loss": 0.8925374746322632
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.02348106838861,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03564453125,
+ "learning_rate": 1.0458003638404434e-06,
+ "loss": 0.0043,
+ "macro_f1": 0.6603773832321167,
+ "num_tokens": 16840273.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.02480674348771572,
+ "skip_count": 1.0,
+ "step": 10442,
+ "text_loss": 0.445250540971756
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.032873495744056,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 1.0258876275081043e-06,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16844115.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003030754392966628,
+ "skip_count": 1.0,
+ "step": 10444,
+ "text_loss": 0.5095187425613403
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.0422659230995,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 1.0061660989969523e-06,
+ "loss": 0.009,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16847375.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006397911347448826,
+ "skip_count": 2.0,
+ "step": 10446,
+ "text_loss": 0.2943403720855713
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.05165835045494,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.041259765625,
+ "learning_rate": 9.866357858642206e-07,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16850247.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007977386936545372,
+ "skip_count": 2.0,
+ "step": 10448,
+ "text_loss": 0.3035532832145691
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.061050777810394,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.024169921875,
+ "learning_rate": 9.672966955940331e-07,
+ "loss": 0.0051,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16853383.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003959330730140209,
+ "skip_count": 1.0,
+ "step": 10450,
+ "text_loss": 0.5030179619789124
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.07044320516584,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0277099609375,
+ "learning_rate": 9.481488355971291e-07,
+ "loss": 0.0061,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16856733.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003481166437268257,
+ "skip_count": 1.0,
+ "step": 10452,
+ "text_loss": 0.6293197870254517
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.07983563252128,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0267333984375,
+ "learning_rate": 9.29192213210972e-07,
+ "loss": 0.0045,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16859753.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002803726587444544,
+ "skip_count": 0.0,
+ "step": 10454,
+ "text_loss": 0.6037408113479614
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.089228059876724,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02734375,
+ "learning_rate": 9.104268356998624e-07,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16862632.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033954589162021875,
+ "skip_count": 0.0,
+ "step": 10456,
+ "text_loss": 0.631564199924469
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.09862048723217,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0390625,
+ "learning_rate": 8.918527102546592e-07,
+ "loss": 0.008,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16866025.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002237692940980196,
+ "skip_count": 0.0,
+ "step": 10458,
+ "text_loss": 0.18825361132621765
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.10801291458761,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0218505859375,
+ "learning_rate": 8.734698439930577e-07,
+ "loss": 0.005,
+ "macro_f1": 1.0,
+ "num_tokens": 16869194.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.015361418016254902,
+ "skip_count": 2.0,
+ "step": 10460,
+ "text_loss": 0.15555702149868011
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 49.117405341943055,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 8.552782439593121e-07,
+ "loss": 0.0068,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16872400.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.010845578275620937,
+ "skip_count": 4.0,
+ "step": 10462,
+ "text_loss": 0.2473229318857193
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.126797769298506,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.020751953125,
+ "learning_rate": 8.372779171245681e-07,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16874842.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0031175457406789064,
+ "skip_count": 2.0,
+ "step": 10464,
+ "text_loss": 0.21604472398757935
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.13619019665395,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0263671875,
+ "learning_rate": 8.19468870386586e-07,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16877754.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00562032638117671,
+ "skip_count": 2.0,
+ "step": 10466,
+ "text_loss": 0.7601249814033508
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.14558262400939,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0322265625,
+ "learning_rate": 8.018511105697957e-07,
+ "loss": 0.0049,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16880755.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003651288105174899,
+ "skip_count": 0.0,
+ "step": 10468,
+ "text_loss": 0.15034520626068115
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.154975051364836,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 7.844246444253522e-07,
+ "loss": 0.0066,
+ "macro_f1": 0.8817967176437378,
+ "num_tokens": 16884024.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.03286674618721008,
+ "skip_count": 3.0,
+ "step": 10470,
+ "text_loss": 0.2850193977355957
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.16436747872028,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0238037109375,
+ "learning_rate": 7.671894786310807e-07,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16887512.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005333275999873877,
+ "skip_count": 2.0,
+ "step": 10472,
+ "text_loss": 0.27574512362480164
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.17375990607572,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.023681640625,
+ "learning_rate": 7.501456197915868e-07,
+ "loss": 0.0086,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16890505.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008608506061136723,
+ "skip_count": 2.0,
+ "step": 10474,
+ "text_loss": 0.110866978764534
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.183152333431174,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0216064453125,
+ "learning_rate": 7.332930744380905e-07,
+ "loss": 0.0033,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16893806.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.001401562592945993,
+ "skip_count": 0.0,
+ "step": 10476,
+ "text_loss": 0.35479840636253357
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 49.19254476078662,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 7.166318490284818e-07,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16897076.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00925722997635603,
+ "skip_count": 4.0,
+ "step": 10478,
+ "text_loss": 0.20996634662151337
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.20193718814206,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0252685546875,
+ "learning_rate": 7.001619499474309e-07,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16900449.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017279108287766576,
+ "skip_count": 0.0,
+ "step": 10480,
+ "text_loss": 0.6246579885482788
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 49.211329615497505,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03759765625,
+ "learning_rate": 6.83883383506223e-07,
+ "loss": 0.0062,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16903275.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0030049486085772514,
+ "skip_count": 0.0,
+ "step": 10482,
+ "text_loss": 0.4425566494464874
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.22072204285295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0233154296875,
+ "learning_rate": 6.677961559428125e-07,
+ "loss": 0.004,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16906789.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028844536282122135,
+ "skip_count": 2.0,
+ "step": 10484,
+ "text_loss": 0.6716867685317993
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.23011447020839,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0198974609375,
+ "learning_rate": 6.519002734218793e-07,
+ "loss": 0.005,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16910099.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009280550293624401,
+ "skip_count": 0.0,
+ "step": 10486,
+ "text_loss": 0.7250060439109802
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.23950689756384,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0302734375,
+ "learning_rate": 6.361957420347175e-07,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16912752.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0034732224885374308,
+ "skip_count": 0.0,
+ "step": 10488,
+ "text_loss": 0.23244275152683258
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.248899324919286,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.01953125,
+ "learning_rate": 6.206825677993466e-07,
+ "loss": 0.0035,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16916677.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004304944537580013,
+ "skip_count": 2.0,
+ "step": 10490,
+ "text_loss": 0.5831108093261719
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.25829175227473,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.041748046875,
+ "learning_rate": 6.053607566604557e-07,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16919460.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002612794516608119,
+ "skip_count": 0.0,
+ "step": 10492,
+ "text_loss": 0.2705974280834198
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.26768417963017,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.040283203125,
+ "learning_rate": 5.902303144894039e-07,
+ "loss": 0.0033,
+ "macro_f1": 1.0,
+ "num_tokens": 16922572.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.006487889215350151,
+ "skip_count": 4.0,
+ "step": 10494,
+ "text_loss": 0.23415961861610413
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.27707660698562,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.01806640625,
+ "learning_rate": 5.752912470842198e-07,
+ "loss": 0.0059,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16925184.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0015968878287822008,
+ "skip_count": 0.0,
+ "step": 10496,
+ "text_loss": 0.4943143427371979
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.28646903434106,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03466796875,
+ "learning_rate": 5.605435601695464e-07,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16928598.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0010248057078570127,
+ "skip_count": 0.0,
+ "step": 10498,
+ "text_loss": 0.36662834882736206
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.295861461696504,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02490234375,
+ "learning_rate": 5.459872593966963e-07,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16931370.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012296726927161217,
+ "skip_count": 1.0,
+ "step": 10500,
+ "text_loss": 0.19061364233493805
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.305253889051954,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 5.316223503437079e-07,
+ "loss": 0.0061,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16934631.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0026445689145475626,
+ "skip_count": 0.0,
+ "step": 10502,
+ "text_loss": 0.1848333775997162
+ },
+ {
+ "acc_repeat": 0.5,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 49.3146463164074,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 5.174488385152887e-07,
+ "loss": 0.0042,
+ "macro_f1": 0.8823530077934265,
+ "num_tokens": 16938097.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.011918487958610058,
+ "skip_count": 1.0,
+ "step": 10504,
+ "text_loss": 0.18828579783439636
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.32403874376284,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02587890625,
+ "learning_rate": 5.034667293427053e-07,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16941223.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004649503156542778,
+ "skip_count": 2.0,
+ "step": 10506,
+ "text_loss": 0.4231431484222412
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 49.333431171118285,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0234375,
+ "learning_rate": 4.896760281838942e-07,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16944572.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0019313854863867164,
+ "skip_count": 0.0,
+ "step": 10508,
+ "text_loss": 0.7520577311515808
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.34282359847373,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0242919921875,
+ "learning_rate": 4.7607674032351666e-07,
+ "loss": 0.0053,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16947661.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0014383324887603521,
+ "skip_count": 0.0,
+ "step": 10510,
+ "text_loss": 0.6348366737365723
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.35221602582917,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0301513671875,
+ "learning_rate": 4.626688709728488e-07,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16951242.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007596072391606867,
+ "skip_count": 0.0,
+ "step": 10512,
+ "text_loss": 0.40759870409965515
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.36160845318462,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0213623046875,
+ "learning_rate": 4.494524252698362e-07,
+ "loss": 0.0046,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16955082.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018172853160649538,
+ "skip_count": 0.0,
+ "step": 10514,
+ "text_loss": 0.18837586045265198
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.371000880540066,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0291748046875,
+ "learning_rate": 4.364274082789832e-07,
+ "loss": 0.0062,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16957903.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003865955863147974,
+ "skip_count": 0.0,
+ "step": 10516,
+ "text_loss": 0.7716887593269348
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.38039330789551,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033447265625,
+ "learning_rate": 4.2359382499151945e-07,
+ "loss": 0.0076,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16960653.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002676841337233782,
+ "skip_count": 0.0,
+ "step": 10518,
+ "text_loss": 0.5054554343223572
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 49.38978573525095,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.058349609375,
+ "learning_rate": 4.1095168032534437e-07,
+ "loss": 0.0078,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16964472.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015017857076600194,
+ "skip_count": 0.0,
+ "step": 10520,
+ "text_loss": 0.9150356650352478
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.3991781626064,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 3.985009791249161e-07,
+ "loss": 0.0071,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16967992.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006224984303116798,
+ "skip_count": 0.0,
+ "step": 10522,
+ "text_loss": 0.26261746883392334
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.40857058996184,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02001953125,
+ "learning_rate": 3.8624172616136265e-07,
+ "loss": 0.0075,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16971407.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004404739011079073,
+ "skip_count": 2.0,
+ "step": 10524,
+ "text_loss": 0.37001657485961914
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 49.41796301731729,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 3.741739261324817e-07,
+ "loss": 0.0055,
+ "macro_f1": 1.0,
+ "num_tokens": 16974847.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0010904704686254263,
+ "skip_count": 1.0,
+ "step": 10526,
+ "text_loss": 0.3782288432121277
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 49.427355444672735,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02685546875,
+ "learning_rate": 3.6229758366262967e-07,
+ "loss": 0.0071,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16977743.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.005372707732021809,
+ "skip_count": 3.0,
+ "step": 10528,
+ "text_loss": 0.2069653421640396
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.43674787202818,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06298828125,
+ "learning_rate": 3.506127033028883e-07,
+ "loss": 0.008,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16982217.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017930102767422795,
+ "skip_count": 1.0,
+ "step": 10530,
+ "text_loss": 0.23420299589633942
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 49.44614029938362,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 3.391192895308981e-07,
+ "loss": 0.0083,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16985440.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.01549626886844635,
+ "skip_count": 4.0,
+ "step": 10532,
+ "text_loss": 0.2651829421520233
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
+ "epoch": 49.455532726739065,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 3.278173467509693e-07,
+ "loss": 0.0091,
+ "macro_f1": 0.6122449040412903,
+ "num_tokens": 16988716.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.014724464155733585,
+ "skip_count": 4.0,
+ "step": 10534,
+ "text_loss": 0.9998418688774109
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.46492515409451,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033935546875,
+ "learning_rate": 3.167068792940264e-07,
+ "loss": 0.0034,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16992076.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00042533327359706163,
+ "skip_count": 0.0,
+ "step": 10536,
+ "text_loss": 0.7315229177474976
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.47431758144996,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03515625,
+ "learning_rate": 3.057878914176082e-07,
+ "loss": 0.0037,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 16994960.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0006869849166832864,
+ "skip_count": 0.0,
+ "step": 10538,
+ "text_loss": 1.1293457746505737
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
+ "epoch": 49.4837100088054,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0279541015625,
+ "learning_rate": 2.9506038730592323e-07,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 16997778.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.0009595098090358078,
+ "skip_count": 0.0,
+ "step": 10540,
+ "text_loss": 0.6721776723861694
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.49310243616085,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022705078125,
+ "learning_rate": 2.845243710697387e-07,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17001173.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003649777267128229,
+ "skip_count": 0.0,
+ "step": 10542,
+ "text_loss": 0.44033801555633545
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.50249486351629,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 2.741798467464918e-07,
+ "loss": 0.0043,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17005213.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0005361626390367746,
+ "skip_count": 0.0,
+ "step": 10544,
+ "text_loss": 0.3993811309337616
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.511887290871734,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.019775390625,
+ "learning_rate": 2.6402681830023365e-07,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17008027.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0047687748447060585,
+ "skip_count": 0.0,
+ "step": 10546,
+ "text_loss": 0.3197088837623596
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.52127971822718,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0224609375,
+ "learning_rate": 2.540652896215745e-07,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17010934.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.003123556962236762,
+ "skip_count": 1.0,
+ "step": 10548,
+ "text_loss": 0.33580848574638367
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.53067214558262,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0172119140625,
+ "learning_rate": 2.4429526452784955e-07,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17014097.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012946722563356161,
+ "skip_count": 0.0,
+ "step": 10550,
+ "text_loss": 0.48477989435195923
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 49.54006457293807,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 2.3471674676295296e-07,
+ "loss": 0.0053,
+ "macro_f1": 1.0,
+ "num_tokens": 17017643.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.016718504950404167,
+ "skip_count": 1.0,
+ "step": 10552,
+ "text_loss": 0.23426192998886108
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.549457000293515,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 2.2532973999733751e-07,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17020900.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0041206348687410355,
+ "skip_count": 1.0,
+ "step": 10554,
+ "text_loss": 0.15234927833080292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.55884942764896,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02197265625,
+ "learning_rate": 2.1613424782812584e-07,
+ "loss": 0.0073,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17024504.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002981720957905054,
+ "skip_count": 0.0,
+ "step": 10556,
+ "text_loss": 0.3161900043487549
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 49.5682418550044,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 2.0713027377911032e-07,
+ "loss": 0.0082,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17027482.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.004473469685763121,
+ "skip_count": 0.0,
+ "step": 10558,
+ "text_loss": 0.5996923446655273
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.577634282359845,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.060302734375,
+ "learning_rate": 1.983178213005865e-07,
+ "loss": 0.0054,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17029843.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0032688030041754246,
+ "skip_count": 0.0,
+ "step": 10560,
+ "text_loss": 0.5574228763580322
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.58702670971529,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031005859375,
+ "learning_rate": 1.8969689376951981e-07,
+ "loss": 0.0055,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17032786.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.004503661300987005,
+ "skip_count": 1.0,
+ "step": 10562,
+ "text_loss": 0.2402963787317276
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.59641913707074,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 1.8126749448943435e-07,
+ "loss": 0.0072,
+ "macro_f1": 1.0,
+ "num_tokens": 17035399.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.00628487067297101,
+ "skip_count": 4.0,
+ "step": 10564,
+ "text_loss": 0.29870063066482544
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.60581156442618,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 1.7302962669052402e-07,
+ "loss": 0.0063,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17038486.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000694084505084902,
+ "skip_count": 0.0,
+ "step": 10566,
+ "text_loss": 0.5111265778541565
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 28.0,
+ "epoch": 49.61520399178163,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.021484375,
+ "learning_rate": 1.6498329352954143e-07,
+ "loss": 0.0056,
+ "macro_f1": 1.0,
+ "num_tokens": 17042070.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.002611940260976553,
+ "skip_count": 2.0,
+ "step": 10568,
+ "text_loss": 0.4722840189933777
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.62459641913707,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 1.5712849808985353e-07,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17045164.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0020359482150524855,
+ "skip_count": 1.0,
+ "step": 10570,
+ "text_loss": 0.5299108028411865
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.633988846492514,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 1.494652433814414e-07,
+ "loss": 0.0041,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17048468.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0017503987764939666,
+ "skip_count": 0.0,
+ "step": 10572,
+ "text_loss": 0.5245226621627808
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 28.0,
+ "epoch": 49.64338127384796,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.029296875,
+ "learning_rate": 1.4199353234090052e-07,
+ "loss": 0.0069,
+ "macro_f1": 0.9262410998344421,
+ "num_tokens": 17051716.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.02260318584740162,
+ "skip_count": 3.0,
+ "step": 10574,
+ "text_loss": 0.34682315587997437
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 24.0,
+ "epoch": 49.65277370120341,
+ "f1_execute": 0.978723406791687,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 1.347133678313295e-07,
+ "loss": 0.0055,
+ "macro_f1": 0.6595745086669922,
+ "num_tokens": 17054642.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04137809947133064,
+ "skip_count": 4.0,
+ "step": 10576,
+ "text_loss": 0.2545051574707031
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.66216612855885,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0255126953125,
+ "learning_rate": 1.2762475264260775e-07,
+ "loss": 0.0036,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17058378.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006063911598175764,
+ "skip_count": 0.0,
+ "step": 10578,
+ "text_loss": 0.5370165109634399
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.671558555914295,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.033203125,
+ "learning_rate": 1.2072768949100698e-07,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17061416.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0011335996678099036,
+ "skip_count": 0.0,
+ "step": 10580,
+ "text_loss": 0.4543360471725464
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.68095098326974,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0257568359375,
+ "learning_rate": 1.140221810195241e-07,
+ "loss": 0.0039,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17064236.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0033164035994559526,
+ "skip_count": 0.0,
+ "step": 10582,
+ "text_loss": 0.2804311215877533
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.69034341062518,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.018798828125,
+ "learning_rate": 1.075082297977703e-07,
+ "loss": 0.0053,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17067446.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0023050394374877214,
+ "skip_count": 2.0,
+ "step": 10584,
+ "text_loss": 0.23257072269916534
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 49.699735837980626,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 1.0118583832186001e-07,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17070365.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.003029540413990617,
+ "skip_count": 0.0,
+ "step": 10586,
+ "text_loss": 0.5026201605796814
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.70912826533607,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0240478515625,
+ "learning_rate": 9.505500901457742e-08,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17074316.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0036497078835964203,
+ "skip_count": 0.0,
+ "step": 10588,
+ "text_loss": 0.6814579367637634
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.71852069269152,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 8.911574422520997e-08,
+ "loss": 0.0057,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17077668.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007934805005788803,
+ "skip_count": 1.0,
+ "step": 10590,
+ "text_loss": 0.14940814673900604
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.72791312004696,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.034912109375,
+ "learning_rate": 8.336804622977034e-08,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17080434.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012133397394791245,
+ "skip_count": 0.0,
+ "step": 10592,
+ "text_loss": 0.6377768516540527
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.73730554740241,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.030517578125,
+ "learning_rate": 7.781191723071902e-08,
+ "loss": 0.0056,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17083362.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0009114379645325243,
+ "skip_count": 1.0,
+ "step": 10594,
+ "text_loss": 0.41287705302238464
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.74669797475785,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.025634765625,
+ "learning_rate": 7.244735935724167e-08,
+ "loss": 0.0042,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17086649.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0018709978321567178,
+ "skip_count": 0.0,
+ "step": 10596,
+ "text_loss": 0.48996540904045105
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 49.756090402113294,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 6.727437466497177e-08,
+ "loss": 0.0062,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 17090390.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.017759598791599274,
+ "skip_count": 2.0,
+ "step": 10598,
+ "text_loss": 0.16886916756629944
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.76548282946874,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0245361328125,
+ "learning_rate": 6.229296513621253e-08,
+ "loss": 0.0044,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17092940.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0013719784328714013,
+ "skip_count": 0.0,
+ "step": 10600,
+ "text_loss": 0.6593959927558899
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
+ "epoch": 49.77487525682419,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0220947265625,
+ "learning_rate": 5.7503132679936896e-08,
+ "loss": 0.0028,
+ "macro_f1": 0.5492662787437439,
+ "num_tokens": 17096497.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.012247482314705849,
+ "skip_count": 2.0,
+ "step": 10602,
+ "text_loss": 0.4913390874862671
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 49.78426768417963,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
+ "learning_rate": 5.290487913156561e-08,
+ "loss": 0.0062,
+ "macro_f1": 0.6598639488220215,
+ "num_tokens": 17099651.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013324257917702198,
+ "skip_count": 3.0,
+ "step": 10604,
+ "text_loss": 0.18341897428035736
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 49.793660111535075,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0264892578125,
+ "learning_rate": 4.8498206253133614e-08,
+ "loss": 0.0048,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17102154.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.007073273416608572,
+ "skip_count": 3.0,
+ "step": 10606,
+ "text_loss": 0.5444790720939636
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 49.80305253889052,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0235595703125,
+ "learning_rate": 4.4283115733290134e-08,
+ "loss": 0.0046,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17105299.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.001649016048759222,
+ "skip_count": 0.0,
+ "step": 10608,
+ "text_loss": 0.5396550297737122
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.81244496624596,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 4.0259609187298654e-08,
+ "loss": 0.0038,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17108786.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.00029506805003620684,
+ "skip_count": 0.0,
+ "step": 10610,
+ "text_loss": 0.5690585374832153
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.821837393601406,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 3.6427688156981384e-08,
+ "loss": 0.0077,
+ "macro_f1": 0.6601307392120361,
+ "num_tokens": 17111796.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.04010998085141182,
+ "skip_count": 2.0,
+ "step": 10612,
+ "text_loss": 0.3106518089771271
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.83122982095686,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.026611328125,
+ "learning_rate": 3.2787354110663804e-08,
+ "loss": 0.006,
+ "macro_f1": 1.0,
+ "num_tokens": 17114975.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013439279049634933,
+ "skip_count": 2.0,
+ "step": 10614,
+ "text_loss": 0.19681362807750702
+ },
+ {
+ "acc_repeat": 0.6666666865348816,
+ "acc_skip": 1.0,
+ "avg_layers": 29.0,
+ "epoch": 49.8406222483123,
+ "f1_execute": 0.9795917868614197,
+ "f1_repeat": 0.800000011920929,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0230712890625,
+ "learning_rate": 2.933860844345215e-08,
+ "loss": 0.0061,
+ "macro_f1": 0.9265305995941162,
+ "num_tokens": 17118217.0,
+ "repeat_count": 3.0,
+ "routers_loss": 0.020981203764677048,
+ "skip_count": 1.0,
+ "step": 10616,
+ "text_loss": 0.6071886420249939
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.850014675667744,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.04296875,
+ "learning_rate": 2.6081452476789392e-08,
+ "loss": 0.0069,
+ "macro_f1": 1.0,
+ "num_tokens": 17121656.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.013027530163526535,
+ "skip_count": 3.0,
+ "step": 10618,
+ "text_loss": 0.21379177272319794
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.85940710302319,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.022216796875,
+ "learning_rate": 2.3015887458899266e-08,
+ "loss": 0.0081,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17124584.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0056997365318238735,
+ "skip_count": 2.0,
+ "step": 10620,
+ "text_loss": 0.22514000535011292
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.86879953037863,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.02783203125,
+ "learning_rate": 2.0141914564453245e-08,
+ "loss": 0.0064,
+ "macro_f1": 0.3272727429866791,
+ "num_tokens": 17127853.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.016820410266518593,
+ "skip_count": 1.0,
+ "step": 10622,
+ "text_loss": 0.22637426853179932
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.878191957734074,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0211181640625,
+ "learning_rate": 1.745953489479257e-08,
+ "loss": 0.0058,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17131958.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0029321140609681606,
+ "skip_count": 0.0,
+ "step": 10624,
+ "text_loss": 0.3751795291900635
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 49.887584385089525,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 1.4968749477872744e-08,
+ "loss": 0.0072,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17135482.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0027504474855959415,
+ "skip_count": 0.0,
+ "step": 10626,
+ "text_loss": 0.3414074778556824
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.89697681244497,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 1.2669559268041475e-08,
+ "loss": 0.0031,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17138415.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0012815104564651847,
+ "skip_count": 1.0,
+ "step": 10628,
+ "text_loss": 0.4166540801525116
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.90636923980041,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.017578125,
+ "learning_rate": 1.0561965146482777e-08,
+ "loss": 0.0044,
+ "macro_f1": 1.0,
+ "num_tokens": 17142241.0,
+ "repeat_count": 2.0,
+ "routers_loss": 0.010521184653043747,
+ "skip_count": 4.0,
+ "step": 10630,
+ "text_loss": 0.3614460825920105
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.915761667155856,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0289306640625,
+ "learning_rate": 8.645967920717369e-09,
+ "loss": 0.0047,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17145305.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.002076479373499751,
+ "skip_count": 0.0,
+ "step": 10632,
+ "text_loss": 0.4676922857761383
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 25.0,
+ "epoch": 49.9251540945113,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.888888955116272,
+ "grad_norm": 0.03173828125,
+ "learning_rate": 6.921568325046756e-09,
+ "loss": 0.0064,
+ "macro_f1": 0.9555556178092957,
+ "num_tokens": 17149574.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.020939050242304802,
+ "skip_count": 5.0,
+ "step": 10634,
+ "text_loss": 0.4579739570617676
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.93454652186674,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.018798828125,
+ "learning_rate": 5.388767020220176e-09,
+ "loss": 0.0057,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17152561.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0007589405868202448,
+ "skip_count": 0.0,
+ "step": 10636,
+ "text_loss": 0.531318187713623
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
+ "epoch": 49.943938949222186,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0247802734375,
+ "learning_rate": 4.047564593601116e-09,
+ "loss": 0.0041,
+ "macro_f1": 1.0,
+ "num_tokens": 17155284.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0013623902341350913,
+ "skip_count": 2.0,
+ "step": 10638,
+ "text_loss": 0.533105194568634
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
+ "epoch": 49.95333137657764,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0283203125,
+ "learning_rate": 2.8979615591673283e-09,
+ "loss": 0.0047,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17158345.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.008068135008215904,
+ "skip_count": 3.0,
+ "step": 10640,
+ "text_loss": 0.2997605800628662
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.96272380393308,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.031982421875,
+ "learning_rate": 1.939958357455307e-09,
+ "loss": 0.0065,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17161982.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.006473845802247524,
+ "skip_count": 2.0,
+ "step": 10642,
+ "text_loss": 0.24127982556819916
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
+ "epoch": 49.972116231288524,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.040771484375,
+ "learning_rate": 1.1735553555602963e-09,
+ "loss": 0.006,
+ "macro_f1": 0.3333333432674408,
+ "num_tokens": 17166156.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.000686702027451247,
+ "skip_count": 0.0,
+ "step": 10644,
+ "text_loss": 0.5044453144073486
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 49.98150865864397,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.04150390625,
+ "learning_rate": 5.987528471362857e-10,
+ "loss": 0.005,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17169311.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0015337419463321567,
+ "skip_count": 0.0,
+ "step": 10646,
+ "text_loss": 0.7889845371246338
+ },
+ {
+ "acc_repeat": 0.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
+ "epoch": 49.99090108599941,
+ "f1_execute": 1.0,
+ "f1_repeat": 0.0,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0260009765625,
+ "learning_rate": 2.1555105250703476e-10,
+ "loss": 0.0041,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17171990.0,
+ "repeat_count": 0.0,
+ "routers_loss": 0.0028676397632807493,
+ "skip_count": 2.0,
+ "step": 10648,
+ "text_loss": 0.4312690794467926
+ },
+ {
+ "acc_repeat": 1.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
+ "epoch": 50.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.0,
+ "grad_norm": 0.028076171875,
+ "learning_rate": 2.395011849953832e-11,
+ "loss": 0.0043,
+ "macro_f1": 0.6666666865348816,
+ "num_tokens": 17175100.0,
+ "repeat_count": 1.0,
+ "routers_loss": 0.0016953344456851482,
+ "skip_count": 0.0,
+ "step": 10650,
+ "text_loss": 0.2874845862388611
+ }
+ ],
+ "logging_steps": 2,
+ "max_steps": 10650,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 50,
+ "save_steps": 1000,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.916096880361167e+17,
+ "train_batch_size": 1,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-10650/training_args.bin b/checkpoint-10650/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a3d3ae372faf14539639f54454aa52b6ee730c4a
--- /dev/null
+++ b/checkpoint-10650/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8
+size 5880
diff --git a/checkpoint-2000/model-00002-of-00002.safetensors b/checkpoint-2000/model-00002-of-00002.safetensors
index 892caec7a1b07ac8579989849dcbdbaa51cbb1be..8de1a284813a836fcdcc8636624510f406abe203 100644
--- a/checkpoint-2000/model-00002-of-00002.safetensors
+++ b/checkpoint-2000/model-00002-of-00002.safetensors
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:0abc54f09e3d7a9e90771cb6b93f11508f30c7eaa03a0fd91cbba011629d9925
+oid sha256:a56bbbae6071c88ab17f5e1938dd6e10a779f5f8d5c5d7800a83096e7dc5cab2
size 1481790520
diff --git a/checkpoint-2000/optimizer.pt b/checkpoint-2000/optimizer.pt
index c77d4f5a912cdc04592164edac03612621ef90ec..c94c2b2394bf4185a8aad72c35646c7836d48eab 100644
--- a/checkpoint-2000/optimizer.pt
+++ b/checkpoint-2000/optimizer.pt
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:c4842a0c1ea33f6a8e2147db0772c3310a9d361f955e89265e54b367b8904402
+oid sha256:24cc1cbc00725be45237fa31c2687929be78debb22d0f2fffda8a79fcca60778
size 44191162
diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json
index ddc4fe0f098b52cf925457a0c5b2f81a1f624b41..51f9545fe8fe57be9c7cd88438c3f257b3e4de47 100644
--- a/checkpoint-2000/trainer_state.json
+++ b/checkpoint-2000/trainer_state.json
@@ -12,18 +12,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 31.0,
+ "avg_layers": 25.0,
"epoch": 0.009392427355444672,
- "f1_execute": 0.4864864945411682,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.40625,
+ "grad_norm": 2.25,
"learning_rate": 2e-06,
- "loss": 0.5484,
- "macro_f1": 0.1621621698141098,
+ "loss": 0.4974,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 3175.0,
"repeat_count": 0.0,
- "routers_loss": 0.503563642501831,
+ "routers_loss": 0.4339469373226166,
"skip_count": 0.0,
"step": 2,
"text_loss": 0.3330848515033722
@@ -31,18 +31,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 23.0,
"epoch": 0.018784854710889344,
- "f1_execute": 0.4864864945411682,
+ "f1_execute": 0.7272726893424988,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.9140625,
+ "grad_norm": 1.8359375,
"learning_rate": 6e-06,
- "loss": 0.536,
- "macro_f1": 0.1621621698141098,
+ "loss": 0.4988,
+ "macro_f1": 0.24242423474788666,
"num_tokens": 5816.0,
"repeat_count": 0.0,
- "routers_loss": 0.4589468538761139,
+ "routers_loss": 0.4511934816837311,
"skip_count": 1.0,
"step": 4,
"text_loss": 0.4571273922920227
@@ -50,37 +50,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 32.0,
+ "avg_layers": 28.0,
"epoch": 0.02817728206633402,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.6666666865348816,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.375,
+ "grad_norm": 2.234375,
"learning_rate": 1e-05,
- "loss": 0.5469,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.5113,
+ "macro_f1": 0.222222238779068,
"num_tokens": 9739.0,
"repeat_count": 0.0,
- "routers_loss": 0.5736724138259888,
+ "routers_loss": 0.49306994676589966,
"skip_count": 0.0,
"step": 6,
"text_loss": 0.41060560941696167
},
{
- "acc_repeat": 1.0,
- "acc_skip": 0.5,
- "avg_layers": 33.0,
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 0.03756970942177869,
- "f1_execute": 0.47058823704719543,
- "f1_repeat": 0.1538461595773697,
- "f1_skip": 0.222222238779068,
- "grad_norm": 1.8515625,
+ "f1_execute": 0.5641025900840759,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.7265625,
"learning_rate": 1.4e-05,
- "loss": 0.5291,
- "macro_f1": 0.28221890330314636,
+ "loss": 0.4766,
+ "macro_f1": 0.18803420662879944,
"num_tokens": 12869.0,
"repeat_count": 1.0,
- "routers_loss": 0.49970296025276184,
+ "routers_loss": 0.48872503638267517,
"skip_count": 2.0,
"step": 8,
"text_loss": 0.36678561568260193
@@ -88,37 +88,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 27.0,
"epoch": 0.046962136777223364,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.953125,
+ "grad_norm": 1.78125,
"learning_rate": 1.8e-05,
- "loss": 0.5316,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4806,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 15845.0,
"repeat_count": 0.0,
- "routers_loss": 0.5153562426567078,
+ "routers_loss": 0.45077216625213623,
"skip_count": 0.0,
"step": 10,
"text_loss": 0.5597779154777527
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 0.5,
"acc_skip": 0.3333333432674408,
- "avg_layers": 34.0,
+ "avg_layers": 26.0,
"epoch": 0.05635456413266804,
- "f1_execute": 0.5714285373687744,
- "f1_repeat": 0.0,
- "f1_skip": 0.25,
- "grad_norm": 1.6328125,
+ "f1_execute": 0.7179487347602844,
+ "f1_repeat": 0.2857142984867096,
+ "f1_skip": 0.20000000298023224,
+ "grad_norm": 1.5390625,
"learning_rate": 2.2e-05,
- "loss": 0.5051,
- "macro_f1": 0.2738095223903656,
+ "loss": 0.4557,
+ "macro_f1": 0.40122103691101074,
"num_tokens": 19353.0,
"repeat_count": 2.0,
- "routers_loss": 0.46214747428894043,
+ "routers_loss": 0.4130440056324005,
"skip_count": 3.0,
"step": 12,
"text_loss": 0.2056603729724884
@@ -126,37 +126,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 27.0,
"epoch": 0.06574699148811271,
- "f1_execute": 0.5263157486915588,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.671875,
+ "grad_norm": 2.4375,
"learning_rate": 2.6e-05,
- "loss": 0.5653,
- "macro_f1": 0.17543858289718628,
+ "loss": 0.5129,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 22675.0,
"repeat_count": 0.0,
- "routers_loss": 0.5300976634025574,
+ "routers_loss": 0.4582902193069458,
"skip_count": 0.0,
"step": 14,
"text_loss": 0.32989829778671265
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 34.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 0.07513941884355738,
- "f1_execute": 0.6153846383094788,
+ "f1_execute": 0.6829268336296082,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 1.8828125,
+ "f1_skip": 0.2222222238779068,
+ "grad_norm": 1.7421875,
"learning_rate": 3e-05,
- "loss": 0.5225,
- "macro_f1": 0.20512822270393372,
+ "loss": 0.4729,
+ "macro_f1": 0.3017163574695587,
"num_tokens": 26022.0,
"repeat_count": 0.0,
- "routers_loss": 0.473240464925766,
+ "routers_loss": 0.42910993099212646,
"skip_count": 1.0,
"step": 16,
"text_loss": 0.1353905349969864
@@ -164,18 +164,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 38.0,
+ "avg_layers": 27.0,
"epoch": 0.08453184619900206,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.7555555105209351,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.6015625,
+ "grad_norm": 1.4765625,
"learning_rate": 3.4000000000000007e-05,
- "loss": 0.4867,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4274,
+ "macro_f1": 0.2518518567085266,
"num_tokens": 29251.0,
"repeat_count": 0.0,
- "routers_loss": 0.4795944094657898,
+ "routers_loss": 0.3990713059902191,
"skip_count": 0.0,
"step": 18,
"text_loss": 0.3806765377521515
@@ -183,18 +183,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 36.0,
+ "avg_layers": 26.0,
"epoch": 0.09392427355444673,
- "f1_execute": 0.6153846383094788,
- "f1_repeat": 0.1538461595773697,
+ "f1_execute": 0.6829268336296082,
+ "f1_repeat": 0.2857142984867096,
"f1_skip": 0.0,
- "grad_norm": 1.3984375,
+ "grad_norm": 1.3125,
"learning_rate": 3.8e-05,
- "loss": 0.4718,
- "macro_f1": 0.25641027092933655,
+ "loss": 0.4261,
+ "macro_f1": 0.3228803873062134,
"num_tokens": 32545.0,
"repeat_count": 1.0,
- "routers_loss": 0.41872408986091614,
+ "routers_loss": 0.40146592259407043,
"skip_count": 0.0,
"step": 20,
"text_loss": 0.25648367404937744
@@ -202,18 +202,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 26.0,
"epoch": 0.1033167009098914,
- "f1_execute": 0.6341463327407837,
+ "f1_execute": 0.7272727489471436,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.7734375,
+ "grad_norm": 1.625,
"learning_rate": 4.2000000000000004e-05,
- "loss": 0.4472,
- "macro_f1": 0.21138212084770203,
+ "loss": 0.404,
+ "macro_f1": 0.24242424964904785,
"num_tokens": 36560.0,
"repeat_count": 0.0,
- "routers_loss": 0.4152105450630188,
+ "routers_loss": 0.372715026140213,
"skip_count": 0.0,
"step": 22,
"text_loss": 0.2799522578716278
@@ -221,18 +221,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 32.0,
+ "avg_layers": 27.0,
"epoch": 0.11270912826533608,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.7555555105209351,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.8046875,
+ "grad_norm": 1.6328125,
"learning_rate": 4.6e-05,
- "loss": 0.4554,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4218,
+ "macro_f1": 0.2518518567085266,
"num_tokens": 39597.0,
"repeat_count": 0.0,
- "routers_loss": 0.47541096806526184,
+ "routers_loss": 0.4504941403865814,
"skip_count": 0.0,
"step": 24,
"text_loss": 0.6635695695877075
@@ -240,18 +240,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 34.0,
+ "avg_layers": 27.0,
"epoch": 0.12210155562078075,
- "f1_execute": 0.7826087474822998,
+ "f1_execute": 0.8085106015205383,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.875,
+ "grad_norm": 1.7109375,
"learning_rate": 5e-05,
- "loss": 0.4182,
- "macro_f1": 0.2608695924282074,
+ "loss": 0.3886,
+ "macro_f1": 0.26950353384017944,
"num_tokens": 43080.0,
"repeat_count": 0.0,
- "routers_loss": 0.37319275736808777,
+ "routers_loss": 0.3498791456222534,
"skip_count": 0.0,
"step": 26,
"text_loss": 0.7035041451454163
@@ -259,18 +259,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 0.13149398297622542,
- "f1_execute": 0.7826087474822998,
+ "f1_execute": 0.8085106015205383,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.4375,
+ "grad_norm": 1.34375,
"learning_rate": 5.4e-05,
- "loss": 0.3991,
- "macro_f1": 0.2608695924282074,
+ "loss": 0.3724,
+ "macro_f1": 0.26950353384017944,
"num_tokens": 46406.0,
"repeat_count": 0.0,
- "routers_loss": 0.3604123294353485,
+ "routers_loss": 0.31265875697135925,
"skip_count": 0.0,
"step": 28,
"text_loss": 0.6388277411460876
@@ -280,16 +280,16 @@
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.1408864103316701,
- "f1_execute": 0.8979591727256775,
+ "f1_execute": 0.8571428060531616,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.421875,
+ "grad_norm": 1.2578125,
"learning_rate": 5.800000000000001e-05,
- "loss": 0.3827,
- "macro_f1": 0.2993197441101074,
+ "loss": 0.341,
+ "macro_f1": 0.2857142686843872,
"num_tokens": 49966.0,
"repeat_count": 0.0,
- "routers_loss": 0.35880225896835327,
+ "routers_loss": 0.3200918138027191,
"skip_count": 2.0,
"step": 30,
"text_loss": 0.17372547090053558
@@ -297,18 +297,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 24.0,
+ "avg_layers": 25.0,
"epoch": 0.15027883768711475,
- "f1_execute": 0.9200000166893005,
+ "f1_execute": 0.8571428060531616,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.4609375,
+ "grad_norm": 1.4140625,
"learning_rate": 6.2e-05,
- "loss": 0.3452,
- "macro_f1": 0.30666667222976685,
+ "loss": 0.3207,
+ "macro_f1": 0.2857142686843872,
"num_tokens": 53378.0,
"repeat_count": 1.0,
- "routers_loss": 0.31086465716362,
+ "routers_loss": 0.32304447889328003,
"skip_count": 1.0,
"step": 32,
"text_loss": 0.18196581304073334
@@ -316,18 +316,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 25.0,
"epoch": 0.15967126504255943,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.3671875,
+ "grad_norm": 1.46875,
"learning_rate": 6.6e-05,
- "loss": 0.3283,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.3304,
+ "macro_f1": 0.3006536364555359,
"num_tokens": 56933.0,
"repeat_count": 0.0,
- "routers_loss": 0.2674171030521393,
+ "routers_loss": 0.24814388155937195,
"skip_count": 0.0,
"step": 34,
"text_loss": 0.28823015093803406
@@ -335,18 +335,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.16906369239800412,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.1015625,
+ "grad_norm": 1.1171875,
"learning_rate": 7.000000000000001e-05,
- "loss": 0.2849,
- "macro_f1": 0.3205128312110901,
+ "loss": 0.2778,
+ "macro_f1": 0.3006536066532135,
"num_tokens": 60744.0,
"repeat_count": 1.0,
- "routers_loss": 0.24587315320968628,
+ "routers_loss": 0.22411039471626282,
"skip_count": 0.0,
"step": 36,
"text_loss": 0.5260357856750488
@@ -354,18 +354,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 31.0,
+ "avg_layers": 27.0,
"epoch": 0.17845611975344877,
- "f1_execute": 0.8085106015205383,
+ "f1_execute": 0.8571428656578064,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.3046875,
+ "grad_norm": 1.484375,
"learning_rate": 7.4e-05,
- "loss": 0.2616,
- "macro_f1": 0.26950353384017944,
+ "loss": 0.2738,
+ "macro_f1": 0.2857142984867096,
"num_tokens": 64900.0,
"repeat_count": 0.0,
- "routers_loss": 0.32050269842147827,
+ "routers_loss": 0.44355395436286926,
"skip_count": 0.0,
"step": 38,
"text_loss": 0.5382097363471985
@@ -373,18 +373,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.18784854710889345,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.1796875,
+ "grad_norm": 1.3828125,
"learning_rate": 7.8e-05,
- "loss": 0.2084,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.2137,
+ "macro_f1": 0.3076923191547394,
"num_tokens": 68000.0,
"repeat_count": 0.0,
- "routers_loss": 0.15196125209331512,
+ "routers_loss": 0.202330082654953,
"skip_count": 0.0,
"step": 40,
"text_loss": 0.5946118831634521
@@ -392,18 +392,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 25.0,
"epoch": 0.19724097446433814,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.61328125,
+ "grad_norm": 0.78125,
"learning_rate": 8.2e-05,
- "loss": 0.1947,
+ "loss": 0.21,
"macro_f1": 0.3144654333591461,
"num_tokens": 70529.0,
"repeat_count": 0.0,
- "routers_loss": 0.14121046662330627,
+ "routers_loss": 0.18023855984210968,
"skip_count": 0.0,
"step": 42,
"text_loss": 0.5550904273986816
@@ -416,13 +416,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.50390625,
+ "grad_norm": 0.609375,
"learning_rate": 8.599999999999999e-05,
- "loss": 0.1884,
+ "loss": 0.1918,
"macro_f1": 0.32098764181137085,
"num_tokens": 73427.0,
"repeat_count": 2.0,
- "routers_loss": 0.21312278509140015,
+ "routers_loss": 0.2101590931415558,
"skip_count": 0.0,
"step": 44,
"text_loss": 0.4636923372745514
@@ -435,13 +435,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.45703125,
+ "grad_norm": 0.53125,
"learning_rate": 8.999999999999999e-05,
- "loss": 0.166,
+ "loss": 0.1881,
"macro_f1": 0.3333333432674408,
"num_tokens": 76472.0,
"repeat_count": 0.0,
- "routers_loss": 0.1184137836098671,
+ "routers_loss": 0.11800424009561539,
"skip_count": 0.0,
"step": 46,
"text_loss": 0.4187001883983612
@@ -454,13 +454,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.62890625,
+ "grad_norm": 0.953125,
"learning_rate": 9.400000000000001e-05,
- "loss": 0.1313,
+ "loss": 0.1446,
"macro_f1": 0.3272727429866791,
"num_tokens": 79124.0,
"repeat_count": 1.0,
- "routers_loss": 0.10897563397884369,
+ "routers_loss": 0.11632519960403442,
"skip_count": 0.0,
"step": 48,
"text_loss": 0.2253919243812561
@@ -468,18 +468,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 0.2348106838861168,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.4375,
+ "grad_norm": 0.58984375,
"learning_rate": 9.800000000000001e-05,
- "loss": 0.1531,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.1543,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 81980.0,
"repeat_count": 1.0,
- "routers_loss": 0.09979952871799469,
+ "routers_loss": 0.09669367223978043,
"skip_count": 0.0,
"step": 50,
"text_loss": 0.6053179502487183
@@ -487,18 +487,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.2442031112415615,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.515625,
+ "grad_norm": 0.8515625,
"learning_rate": 0.000102,
- "loss": 0.1265,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.1393,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 85236.0,
"repeat_count": 0.0,
- "routers_loss": 0.05543195456266403,
+ "routers_loss": 0.12471720576286316,
"skip_count": 0.0,
"step": 52,
"text_loss": 0.6027331948280334
@@ -511,13 +511,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.328125,
+ "grad_norm": 0.421875,
"learning_rate": 0.000106,
- "loss": 0.1436,
+ "loss": 0.1473,
"macro_f1": 0.32098764181137085,
"num_tokens": 88238.0,
"repeat_count": 0.0,
- "routers_loss": 0.15049344301223755,
+ "routers_loss": 0.1376056969165802,
"skip_count": 2.0,
"step": 54,
"text_loss": 0.2861751616001129
@@ -530,13 +530,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.263671875,
+ "grad_norm": 0.35546875,
"learning_rate": 0.00011,
- "loss": 0.1021,
+ "loss": 0.1082,
"macro_f1": 0.3333333432674408,
"num_tokens": 91056.0,
"repeat_count": 0.0,
- "routers_loss": 0.07367338240146637,
+ "routers_loss": 0.07449393719434738,
"skip_count": 0.0,
"step": 56,
"text_loss": 0.48106974363327026
@@ -544,18 +544,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 26.0,
"epoch": 0.2723803933078955,
- "f1_execute": 1.0,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000114,
- "loss": 0.114,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.1123,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 94987.0,
"repeat_count": 0.0,
- "routers_loss": 0.03782692551612854,
+ "routers_loss": 0.07064720243215561,
"skip_count": 0.0,
"step": 58,
"text_loss": 0.3554874658584595
@@ -568,13 +568,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.333984375,
+ "grad_norm": 0.5390625,
"learning_rate": 0.000118,
- "loss": 0.1197,
+ "loss": 0.1234,
"macro_f1": 0.32098764181137085,
"num_tokens": 97909.0,
"repeat_count": 0.0,
- "routers_loss": 0.14074955880641937,
+ "routers_loss": 0.16835889220237732,
"skip_count": 2.0,
"step": 60,
"text_loss": 0.5475804805755615
@@ -587,13 +587,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000122,
- "loss": 0.1174,
+ "loss": 0.1224,
"macro_f1": 0.3333333432674408,
"num_tokens": 101043.0,
"repeat_count": 0.0,
- "routers_loss": 0.058013737201690674,
+ "routers_loss": 0.06127442046999931,
"skip_count": 0.0,
"step": 62,
"text_loss": 0.5966938734054565
@@ -606,13 +606,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000126,
- "loss": 0.0911,
+ "loss": 0.0931,
"macro_f1": 0.3333333432674408,
"num_tokens": 104103.0,
"repeat_count": 0.0,
- "routers_loss": 0.04936821386218071,
+ "routers_loss": 0.047825805842876434,
"skip_count": 0.0,
"step": 64,
"text_loss": 0.5480486750602722
@@ -625,13 +625,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.220703125,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.00013000000000000002,
- "loss": 0.1107,
+ "loss": 0.1088,
"macro_f1": 0.3006536364555359,
"num_tokens": 107009.0,
"repeat_count": 1.0,
- "routers_loss": 0.2628525495529175,
+ "routers_loss": 0.275174081325531,
"skip_count": 4.0,
"step": 66,
"text_loss": 0.41714492440223694
@@ -644,13 +644,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.000134,
- "loss": 0.1109,
+ "loss": 0.1123,
"macro_f1": 0.3333333432674408,
"num_tokens": 110486.0,
"repeat_count": 0.0,
- "routers_loss": 0.02859785594046116,
+ "routers_loss": 0.029025178402662277,
"skip_count": 0.0,
"step": 68,
"text_loss": 0.6775627732276917
@@ -663,13 +663,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.298828125,
+ "grad_norm": 0.314453125,
"learning_rate": 0.00013800000000000002,
- "loss": 0.1067,
+ "loss": 0.1049,
"macro_f1": 0.3272727429866791,
"num_tokens": 113878.0,
"repeat_count": 0.0,
- "routers_loss": 0.10459086298942566,
+ "routers_loss": 0.10141710191965103,
"skip_count": 1.0,
"step": 70,
"text_loss": 0.6678873896598816
@@ -682,13 +682,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2109375,
+ "grad_norm": 0.248046875,
"learning_rate": 0.00014199999999999998,
- "loss": 0.1166,
+ "loss": 0.1119,
"macro_f1": 0.3272727429866791,
"num_tokens": 116989.0,
"repeat_count": 0.0,
- "routers_loss": 0.0718551054596901,
+ "routers_loss": 0.08002066612243652,
"skip_count": 1.0,
"step": 72,
"text_loss": 0.405692994594574
@@ -701,13 +701,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1787109375,
"learning_rate": 0.000146,
- "loss": 0.1007,
+ "loss": 0.0944,
"macro_f1": 0.3144654333591461,
"num_tokens": 119883.0,
"repeat_count": 0.0,
- "routers_loss": 0.1850946843624115,
+ "routers_loss": 0.1867009848356247,
"skip_count": 3.0,
"step": 74,
"text_loss": 0.44616150856018066
@@ -720,13 +720,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.34375,
+ "grad_norm": 0.333984375,
"learning_rate": 0.00015,
- "loss": 0.1019,
+ "loss": 0.1003,
"macro_f1": 0.32098764181137085,
"num_tokens": 123325.0,
"repeat_count": 0.0,
- "routers_loss": 0.09809529036283493,
+ "routers_loss": 0.07042168825864792,
"skip_count": 2.0,
"step": 76,
"text_loss": 0.11340200901031494
@@ -739,13 +739,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.259765625,
+ "grad_norm": 0.26171875,
"learning_rate": 0.000154,
- "loss": 0.1088,
+ "loss": 0.1066,
"macro_f1": 0.32098764181137085,
"num_tokens": 126131.0,
"repeat_count": 0.0,
- "routers_loss": 0.11277207732200623,
+ "routers_loss": 0.11535373330116272,
"skip_count": 2.0,
"step": 78,
"text_loss": 0.3269135355949402
@@ -758,13 +758,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "grad_norm": 0.255859375,
"learning_rate": 0.000158,
- "loss": 0.0866,
+ "loss": 0.0891,
"macro_f1": 0.3272727429866791,
"num_tokens": 130349.0,
"repeat_count": 0.0,
- "routers_loss": 0.09079254418611526,
+ "routers_loss": 0.09497501701116562,
"skip_count": 1.0,
"step": 80,
"text_loss": 0.15273472666740417
@@ -777,13 +777,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000162,
- "loss": 0.0928,
+ "loss": 0.0929,
"macro_f1": 0.3333333432674408,
"num_tokens": 133607.0,
"repeat_count": 0.0,
- "routers_loss": 0.02900076098740101,
+ "routers_loss": 0.030639523640275,
"skip_count": 0.0,
"step": 82,
"text_loss": 0.282884806394577
@@ -796,13 +796,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.00016600000000000002,
- "loss": 0.1251,
+ "loss": 0.1254,
"macro_f1": 0.3272727429866791,
"num_tokens": 136694.0,
"repeat_count": 0.0,
- "routers_loss": 0.0763339251279831,
+ "routers_loss": 0.07906441390514374,
"skip_count": 1.0,
"step": 84,
"text_loss": 0.459094375371933
@@ -817,11 +817,11 @@
"f1_skip": 0.0,
"grad_norm": 0.212890625,
"learning_rate": 0.00017,
- "loss": 0.1064,
+ "loss": 0.1071,
"macro_f1": 0.3144654333591461,
"num_tokens": 139966.0,
"repeat_count": 1.0,
- "routers_loss": 0.13191410899162292,
+ "routers_loss": 0.1124570444226265,
"skip_count": 2.0,
"step": 86,
"text_loss": 0.29985448718070984
@@ -834,13 +834,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.25390625,
"learning_rate": 0.000174,
- "loss": 0.1055,
+ "loss": 0.1031,
"macro_f1": 0.32098764181137085,
"num_tokens": 142788.0,
"repeat_count": 2.0,
- "routers_loss": 0.21200031042099,
+ "routers_loss": 0.1966402679681778,
"skip_count": 0.0,
"step": 88,
"text_loss": 0.6435291767120361
@@ -853,13 +853,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.318359375,
+ "grad_norm": 0.349609375,
"learning_rate": 0.000178,
- "loss": 0.0971,
+ "loss": 0.0963,
"macro_f1": 0.3333333432674408,
"num_tokens": 146192.0,
"repeat_count": 0.0,
- "routers_loss": 0.031911369413137436,
+ "routers_loss": 0.0325632207095623,
"skip_count": 0.0,
"step": 90,
"text_loss": 0.35170626640319824
@@ -872,13 +872,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.2265625,
"learning_rate": 0.000182,
- "loss": 0.1056,
+ "loss": 0.1073,
"macro_f1": 0.32098764181137085,
"num_tokens": 149792.0,
"repeat_count": 1.0,
- "routers_loss": 0.14131835103034973,
+ "routers_loss": 0.15115146338939667,
"skip_count": 1.0,
"step": 92,
"text_loss": 0.83159339427948
@@ -891,13 +891,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.205078125,
"learning_rate": 0.000186,
- "loss": 0.1059,
+ "loss": 0.1073,
"macro_f1": 0.3333333432674408,
"num_tokens": 152766.0,
"repeat_count": 0.0,
- "routers_loss": 0.04137955233454704,
+ "routers_loss": 0.043313540518283844,
"skip_count": 0.0,
"step": 94,
"text_loss": 0.49707934260368347
@@ -910,13 +910,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.00019,
- "loss": 0.0934,
+ "loss": 0.0947,
"macro_f1": 0.3333333432674408,
"num_tokens": 156112.0,
"repeat_count": 0.0,
- "routers_loss": 0.03163003921508789,
+ "routers_loss": 0.032021280378103256,
"skip_count": 0.0,
"step": 96,
"text_loss": 0.27608928084373474
@@ -929,13 +929,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.2099609375,
"learning_rate": 0.000194,
- "loss": 0.0847,
+ "loss": 0.0846,
"macro_f1": 0.3076923191547394,
"num_tokens": 159454.0,
"repeat_count": 2.0,
- "routers_loss": 0.2567490339279175,
+ "routers_loss": 0.24473154544830322,
"skip_count": 2.0,
"step": 98,
"text_loss": 0.6026689410209656
@@ -948,13 +948,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.30859375,
+ "grad_norm": 0.271484375,
"learning_rate": 0.00019800000000000002,
- "loss": 0.1077,
+ "loss": 0.1028,
"macro_f1": 0.32098764181137085,
"num_tokens": 163661.0,
"repeat_count": 0.0,
- "routers_loss": 0.11468870937824249,
+ "routers_loss": 0.11468276381492615,
"skip_count": 2.0,
"step": 100,
"text_loss": 0.46733155846595764
@@ -967,13 +967,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.17578125,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.000202,
- "loss": 0.1131,
+ "loss": 0.1089,
"macro_f1": 0.3333333432674408,
"num_tokens": 167134.0,
"repeat_count": 0.0,
- "routers_loss": 0.02124219387769699,
+ "routers_loss": 0.021144939586520195,
"skip_count": 0.0,
"step": 102,
"text_loss": 0.6362994909286499
@@ -986,13 +986,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000206,
- "loss": 0.0624,
+ "loss": 0.0621,
"macro_f1": 0.3272727429866791,
"num_tokens": 170433.0,
"repeat_count": 0.0,
- "routers_loss": 0.06983796507120132,
+ "routers_loss": 0.06594710797071457,
"skip_count": 1.0,
"step": 104,
"text_loss": 0.4515477120876312
@@ -1005,13 +1005,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.00021,
- "loss": 0.0951,
+ "loss": 0.0929,
"macro_f1": 0.3333333432674408,
"num_tokens": 173387.0,
"repeat_count": 0.0,
- "routers_loss": 0.03467355668544769,
+ "routers_loss": 0.032923027873039246,
"skip_count": 0.0,
"step": 106,
"text_loss": 0.6638453006744385
@@ -1024,13 +1024,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.240234375,
"learning_rate": 0.000214,
- "loss": 0.0881,
+ "loss": 0.0883,
"macro_f1": 0.3272727429866791,
"num_tokens": 176170.0,
"repeat_count": 1.0,
- "routers_loss": 0.08142061531543732,
+ "routers_loss": 0.08034781366586685,
"skip_count": 0.0,
"step": 108,
"text_loss": 1.186936855316162
@@ -1043,13 +1043,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.267578125,
"learning_rate": 0.000218,
- "loss": 0.0795,
+ "loss": 0.0794,
"macro_f1": 0.3272727429866791,
"num_tokens": 179877.0,
"repeat_count": 0.0,
- "routers_loss": 0.08327355235815048,
+ "routers_loss": 0.07814185321331024,
"skip_count": 1.0,
"step": 110,
"text_loss": 0.5488709211349487
@@ -1062,13 +1062,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000222,
- "loss": 0.0943,
+ "loss": 0.0946,
"macro_f1": 0.3333333432674408,
"num_tokens": 182726.0,
"repeat_count": 0.0,
- "routers_loss": 0.019890006631612778,
+ "routers_loss": 0.01884695515036583,
"skip_count": 0.0,
"step": 112,
"text_loss": 0.5195863842964172
@@ -1081,13 +1081,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2001953125,
+ "grad_norm": 0.19921875,
"learning_rate": 0.00022600000000000002,
- "loss": 0.0933,
+ "loss": 0.0974,
"macro_f1": 0.32098764181137085,
"num_tokens": 185624.0,
"repeat_count": 0.0,
- "routers_loss": 0.09992363303899765,
+ "routers_loss": 0.09657823294401169,
"skip_count": 2.0,
"step": 114,
"text_loss": 0.43858134746551514
@@ -1100,13 +1100,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.3046875,
"learning_rate": 0.00023,
- "loss": 0.0762,
+ "loss": 0.0753,
"macro_f1": 0.3333333432674408,
"num_tokens": 188155.0,
"repeat_count": 0.0,
- "routers_loss": 0.014119029976427555,
+ "routers_loss": 0.01463601179420948,
"skip_count": 0.0,
"step": 116,
"text_loss": 0.392981618642807
@@ -1119,13 +1119,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.423828125,
+ "grad_norm": 0.439453125,
"learning_rate": 0.00023400000000000002,
- "loss": 0.0842,
+ "loss": 0.0843,
"macro_f1": 0.3333333432674408,
"num_tokens": 190970.0,
"repeat_count": 0.0,
- "routers_loss": 0.03976766765117645,
+ "routers_loss": 0.03859659656882286,
"skip_count": 0.0,
"step": 118,
"text_loss": 0.309179425239563
@@ -1138,13 +1138,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.2255859375,
"learning_rate": 0.00023799999999999998,
- "loss": 0.0517,
+ "loss": 0.053,
"macro_f1": 0.3333333432674408,
"num_tokens": 193988.0,
"repeat_count": 0.0,
- "routers_loss": 0.017428619787096977,
+ "routers_loss": 0.019092386588454247,
"skip_count": 0.0,
"step": 120,
"text_loss": 0.48543134331703186
@@ -1157,13 +1157,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.296875,
+ "grad_norm": 0.35546875,
"learning_rate": 0.000242,
- "loss": 0.1134,
+ "loss": 0.1203,
"macro_f1": 0.3272727429866791,
"num_tokens": 196475.0,
"repeat_count": 0.0,
- "routers_loss": 0.06965513527393341,
+ "routers_loss": 0.0619138665497303,
"skip_count": 1.0,
"step": 122,
"text_loss": 0.4615364074707031
@@ -1176,13 +1176,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1875,
"learning_rate": 0.000246,
- "loss": 0.0984,
+ "loss": 0.1002,
"macro_f1": 0.3272727429866791,
"num_tokens": 200045.0,
"repeat_count": 1.0,
- "routers_loss": 0.10476501286029816,
+ "routers_loss": 0.09752107411623001,
"skip_count": 0.0,
"step": 124,
"text_loss": 0.15802054107189178
@@ -1195,13 +1195,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.00025,
- "loss": 0.0771,
+ "loss": 0.0773,
"macro_f1": 0.3333333432674408,
"num_tokens": 203214.0,
"repeat_count": 0.0,
- "routers_loss": 0.028317544609308243,
+ "routers_loss": 0.02896115928888321,
"skip_count": 0.0,
"step": 126,
"text_loss": 0.4543360471725464
@@ -1214,13 +1214,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.390625,
+ "grad_norm": 0.4296875,
"learning_rate": 0.000254,
- "loss": 0.0933,
+ "loss": 0.0973,
"macro_f1": 0.3333333432674408,
"num_tokens": 206168.0,
"repeat_count": 0.0,
- "routers_loss": 0.012766432017087936,
+ "routers_loss": 0.011423567309975624,
"skip_count": 0.0,
"step": 128,
"text_loss": 0.4730179011821747
@@ -1233,13 +1233,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.365234375,
"learning_rate": 0.00025800000000000004,
- "loss": 0.0989,
+ "loss": 0.099,
"macro_f1": 0.3333333432674408,
"num_tokens": 209907.0,
"repeat_count": 0.0,
- "routers_loss": 0.021400077268481255,
+ "routers_loss": 0.01957600563764572,
"skip_count": 0.0,
"step": 130,
"text_loss": 0.45122358202934265
@@ -1252,13 +1252,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.2060546875,
"learning_rate": 0.000262,
- "loss": 0.0873,
+ "loss": 0.0868,
"macro_f1": 0.3272727429866791,
"num_tokens": 213521.0,
"repeat_count": 0.0,
- "routers_loss": 0.05025051161646843,
+ "routers_loss": 0.04882373288273811,
"skip_count": 1.0,
"step": 132,
"text_loss": 0.4341491758823395
@@ -1271,13 +1271,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.000266,
- "loss": 0.085,
+ "loss": 0.0834,
"macro_f1": 0.3333333432674408,
"num_tokens": 216484.0,
"repeat_count": 0.0,
- "routers_loss": 0.017420046031475067,
+ "routers_loss": 0.016083380207419395,
"skip_count": 0.0,
"step": 134,
"text_loss": 0.46990111470222473
@@ -1290,13 +1290,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2041015625,
+ "grad_norm": 0.220703125,
"learning_rate": 0.00027,
- "loss": 0.086,
+ "loss": 0.0863,
"macro_f1": 0.3333333432674408,
"num_tokens": 219398.0,
"repeat_count": 0.0,
- "routers_loss": 0.018217921257019043,
+ "routers_loss": 0.01733536459505558,
"skip_count": 0.0,
"step": 136,
"text_loss": 0.4455361068248749
@@ -1309,13 +1309,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.00027400000000000005,
- "loss": 0.0985,
+ "loss": 0.0997,
"macro_f1": 0.3333333432674408,
"num_tokens": 222430.0,
"repeat_count": 0.0,
- "routers_loss": 0.012350660748779774,
+ "routers_loss": 0.01332803163677454,
"skip_count": 0.0,
"step": 138,
"text_loss": 0.47699397802352905
@@ -1328,13 +1328,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.302734375,
+ "grad_norm": 0.333984375,
"learning_rate": 0.00027800000000000004,
"loss": 0.0922,
"macro_f1": 0.3144654333591461,
"num_tokens": 225458.0,
"repeat_count": 1.0,
- "routers_loss": 0.14993029832839966,
+ "routers_loss": 0.14924728870391846,
"skip_count": 2.0,
"step": 140,
"text_loss": 0.5858222842216492
@@ -1347,13 +1347,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.251953125,
+ "grad_norm": 0.25,
"learning_rate": 0.00028199999999999997,
- "loss": 0.0791,
+ "loss": 0.0798,
"macro_f1": 0.3144654333591461,
"num_tokens": 229365.0,
"repeat_count": 1.0,
- "routers_loss": 0.17921413481235504,
+ "routers_loss": 0.1860177218914032,
"skip_count": 2.0,
"step": 142,
"text_loss": 0.5003137588500977
@@ -1366,13 +1366,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.00028599999999999996,
- "loss": 0.0535,
+ "loss": 0.054,
"macro_f1": 0.32098764181137085,
"num_tokens": 231787.0,
"repeat_count": 1.0,
- "routers_loss": 0.1420905590057373,
+ "routers_loss": 0.16498211026191711,
"skip_count": 1.0,
"step": 144,
"text_loss": 0.5026470422744751
@@ -1385,13 +1385,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.29296875,
+ "grad_norm": 0.306640625,
"learning_rate": 0.00029,
- "loss": 0.0956,
+ "loss": 0.0936,
"macro_f1": 0.32098764181137085,
"num_tokens": 235014.0,
"repeat_count": 1.0,
- "routers_loss": 0.12468750029802322,
+ "routers_loss": 0.11801310628652573,
"skip_count": 1.0,
"step": 146,
"text_loss": 0.611888587474823
@@ -1404,13 +1404,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.000294,
- "loss": 0.0879,
+ "loss": 0.0878,
"macro_f1": 0.3333333432674408,
"num_tokens": 238210.0,
"repeat_count": 0.0,
- "routers_loss": 0.024295611307024956,
+ "routers_loss": 0.02422776259481907,
"skip_count": 0.0,
"step": 148,
"text_loss": 0.2876914143562317
@@ -1423,13 +1423,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.000298,
- "loss": 0.087,
+ "loss": 0.0858,
"macro_f1": 0.32098764181137085,
"num_tokens": 241582.0,
"repeat_count": 0.0,
- "routers_loss": 0.07016433775424957,
+ "routers_loss": 0.07282499223947525,
"skip_count": 2.0,
"step": 150,
"text_loss": 0.3919292390346527
@@ -1442,13 +1442,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3828125,
+ "grad_norm": 0.37890625,
"learning_rate": 0.000302,
- "loss": 0.0782,
+ "loss": 0.0797,
"macro_f1": 0.32098764181137085,
"num_tokens": 244621.0,
"repeat_count": 1.0,
- "routers_loss": 0.18942493200302124,
+ "routers_loss": 0.20659038424491882,
"skip_count": 1.0,
"step": 152,
"text_loss": 0.4294498860836029
@@ -1461,13 +1461,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1787109375,
"learning_rate": 0.000306,
- "loss": 0.0713,
+ "loss": 0.072,
"macro_f1": 0.3333333432674408,
"num_tokens": 247833.0,
"repeat_count": 0.0,
- "routers_loss": 0.02319060079753399,
+ "routers_loss": 0.02428400330245495,
"skip_count": 0.0,
"step": 154,
"text_loss": 0.5930765867233276
@@ -1480,13 +1480,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15234375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.00031,
- "loss": 0.0778,
+ "loss": 0.0772,
"macro_f1": 0.3333333432674408,
"num_tokens": 251349.0,
"repeat_count": 0.0,
- "routers_loss": 0.01764747127890587,
+ "routers_loss": 0.0167869683355093,
"skip_count": 0.0,
"step": 156,
"text_loss": 0.41063904762268066
@@ -1499,13 +1499,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.000314,
- "loss": 0.0829,
+ "loss": 0.0821,
"macro_f1": 0.3333333432674408,
"num_tokens": 254886.0,
"repeat_count": 0.0,
- "routers_loss": 0.02268100716173649,
+ "routers_loss": 0.02531604655086994,
"skip_count": 0.0,
"step": 158,
"text_loss": 0.6739020347595215
@@ -1518,13 +1518,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.201171875,
"learning_rate": 0.00031800000000000003,
- "loss": 0.0889,
+ "loss": 0.09,
"macro_f1": 0.3333333432674408,
"num_tokens": 258260.0,
"repeat_count": 0.0,
- "routers_loss": 0.016952091827988625,
+ "routers_loss": 0.017772775143384933,
"skip_count": 0.0,
"step": 160,
"text_loss": 0.46873849630355835
@@ -1537,13 +1537,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2216796875,
+ "grad_norm": 0.224609375,
"learning_rate": 0.000322,
- "loss": 0.0923,
+ "loss": 0.0893,
"macro_f1": 0.3272727429866791,
"num_tokens": 261846.0,
"repeat_count": 0.0,
- "routers_loss": 0.03669808804988861,
+ "routers_loss": 0.034902360290288925,
"skip_count": 1.0,
"step": 162,
"text_loss": 0.3727971017360687
@@ -1556,13 +1556,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000326,
- "loss": 0.0769,
+ "loss": 0.076,
"macro_f1": 0.3333333432674408,
"num_tokens": 264348.0,
"repeat_count": 0.0,
- "routers_loss": 0.012101447209715843,
+ "routers_loss": 0.013553355820477009,
"skip_count": 0.0,
"step": 164,
"text_loss": 0.5798237323760986
@@ -1575,13 +1575,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.37109375,
+ "grad_norm": 0.408203125,
"learning_rate": 0.00033,
- "loss": 0.0897,
+ "loss": 0.0926,
"macro_f1": 0.32098764181137085,
"num_tokens": 267479.0,
"repeat_count": 1.0,
- "routers_loss": 0.1562056541442871,
+ "routers_loss": 0.13571743667125702,
"skip_count": 1.0,
"step": 166,
"text_loss": 0.8084776997566223
@@ -1594,13 +1594,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2431640625,
"learning_rate": 0.00033400000000000004,
- "loss": 0.0829,
+ "loss": 0.0817,
"macro_f1": 0.32098764181137085,
"num_tokens": 270268.0,
"repeat_count": 2.0,
- "routers_loss": 0.20807914435863495,
+ "routers_loss": 0.19884146749973297,
"skip_count": 0.0,
"step": 168,
"text_loss": 0.7366134524345398
@@ -1613,13 +1613,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.267578125,
"learning_rate": 0.00033800000000000003,
- "loss": 0.0987,
+ "loss": 0.1022,
"macro_f1": 0.32098764181137085,
"num_tokens": 273518.0,
"repeat_count": 1.0,
- "routers_loss": 0.1530539095401764,
+ "routers_loss": 0.15469175577163696,
"skip_count": 1.0,
"step": 170,
"text_loss": 0.27204006910324097
@@ -1632,13 +1632,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000342,
- "loss": 0.087,
+ "loss": 0.0865,
"macro_f1": 0.32098764181137085,
"num_tokens": 277210.0,
"repeat_count": 0.0,
- "routers_loss": 0.08004544675350189,
+ "routers_loss": 0.08603330701589584,
"skip_count": 2.0,
"step": 172,
"text_loss": 0.7137667536735535
@@ -1651,13 +1651,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1767578125,
+ "grad_norm": 0.189453125,
"learning_rate": 0.000346,
- "loss": 0.0916,
+ "loss": 0.0902,
"macro_f1": 0.3076923191547394,
"num_tokens": 280389.0,
"repeat_count": 0.0,
- "routers_loss": 0.19228078424930573,
+ "routers_loss": 0.17851492762565613,
"skip_count": 4.0,
"step": 174,
"text_loss": 0.5148105621337891
@@ -1670,13 +1670,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.00035,
- "loss": 0.0863,
+ "loss": 0.0853,
"macro_f1": 0.3333333432674408,
"num_tokens": 283501.0,
"repeat_count": 0.0,
- "routers_loss": 0.024507170543074608,
+ "routers_loss": 0.021331604570150375,
"skip_count": 0.0,
"step": 176,
"text_loss": 0.301013320684433
@@ -1689,13 +1689,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000354,
- "loss": 0.0898,
+ "loss": 0.0911,
"macro_f1": 0.32098764181137085,
"num_tokens": 287154.0,
"repeat_count": 0.0,
- "routers_loss": 0.05055495724081993,
+ "routers_loss": 0.057273946702480316,
"skip_count": 2.0,
"step": 178,
"text_loss": 0.4740981459617615
@@ -1708,13 +1708,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.240234375,
"learning_rate": 0.000358,
- "loss": 0.0865,
+ "loss": 0.0904,
"macro_f1": 0.3272727429866791,
"num_tokens": 289929.0,
"repeat_count": 0.0,
- "routers_loss": 0.03999815881252289,
+ "routers_loss": 0.04116598889231682,
"skip_count": 1.0,
"step": 180,
"text_loss": 0.4838573932647705
@@ -1727,13 +1727,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.14453125,
"learning_rate": 0.000362,
- "loss": 0.0983,
+ "loss": 0.0991,
"macro_f1": 0.3333333432674408,
"num_tokens": 294293.0,
"repeat_count": 0.0,
- "routers_loss": 0.025158070027828217,
+ "routers_loss": 0.027111956849694252,
"skip_count": 0.0,
"step": 182,
"text_loss": 0.7495553493499756
@@ -1746,32 +1746,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.158203125,
"learning_rate": 0.000366,
- "loss": 0.1015,
+ "loss": 0.1038,
"macro_f1": 0.3333333432674408,
"num_tokens": 297730.0,
"repeat_count": 0.0,
- "routers_loss": 0.01825365424156189,
+ "routers_loss": 0.019166452810168266,
"skip_count": 0.0,
"step": 184,
"text_loss": 0.534831166267395
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 0.8734957440563546,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2236328125,
"learning_rate": 0.00037,
- "loss": 0.0736,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0784,
+ "macro_f1": 0.5427350401878357,
"num_tokens": 300593.0,
"repeat_count": 1.0,
- "routers_loss": 0.22729666531085968,
+ "routers_loss": 0.2349659502506256,
"skip_count": 2.0,
"step": 186,
"text_loss": 0.3549048602581024
@@ -1784,13 +1784,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.2041015625,
"learning_rate": 0.000374,
- "loss": 0.0838,
+ "loss": 0.0827,
"macro_f1": 0.3076923191547394,
"num_tokens": 303456.0,
"repeat_count": 2.0,
- "routers_loss": 0.24516475200653076,
+ "routers_loss": 0.22502389550209045,
"skip_count": 2.0,
"step": 188,
"text_loss": 0.8837642073631287
@@ -1803,13 +1803,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2470703125,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000378,
- "loss": 0.1056,
+ "loss": 0.1085,
"macro_f1": 0.3272727429866791,
"num_tokens": 306241.0,
"repeat_count": 1.0,
- "routers_loss": 0.1307530701160431,
+ "routers_loss": 0.12291611731052399,
"skip_count": 0.0,
"step": 190,
"text_loss": 0.73353511095047
@@ -1822,13 +1822,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.15625,
"learning_rate": 0.000382,
- "loss": 0.0961,
+ "loss": 0.0969,
"macro_f1": 0.3272727429866791,
"num_tokens": 310606.0,
"repeat_count": 0.0,
- "routers_loss": 0.06541688740253448,
+ "routers_loss": 0.055988848209381104,
"skip_count": 1.0,
"step": 192,
"text_loss": 0.6261917352676392
@@ -1841,13 +1841,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.333984375,
+ "grad_norm": 0.34375,
"learning_rate": 0.000386,
- "loss": 0.1058,
+ "loss": 0.1055,
"macro_f1": 0.3144654333591461,
"num_tokens": 313564.0,
"repeat_count": 0.0,
- "routers_loss": 0.12492545694112778,
+ "routers_loss": 0.12363404780626297,
"skip_count": 3.0,
"step": 194,
"text_loss": 0.2790874242782593
@@ -1860,13 +1860,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28515625,
+ "grad_norm": 0.27734375,
"learning_rate": 0.00039000000000000005,
- "loss": 0.0966,
+ "loss": 0.0964,
"macro_f1": 0.3076923191547394,
"num_tokens": 316958.0,
"repeat_count": 2.0,
- "routers_loss": 0.2838033139705658,
+ "routers_loss": 0.2718356251716614,
"skip_count": 2.0,
"step": 196,
"text_loss": 0.14428086578845978
@@ -1881,11 +1881,11 @@
"f1_skip": 0.0,
"grad_norm": 0.2021484375,
"learning_rate": 0.00039400000000000004,
- "loss": 0.0929,
+ "loss": 0.0917,
"macro_f1": 0.32098764181137085,
"num_tokens": 320103.0,
"repeat_count": 0.0,
- "routers_loss": 0.07692629098892212,
+ "routers_loss": 0.07188102602958679,
"skip_count": 2.0,
"step": 198,
"text_loss": 0.27155816555023193
@@ -1898,13 +1898,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.201171875,
"learning_rate": 0.000398,
"loss": 0.0809,
"macro_f1": 0.32098764181137085,
"num_tokens": 323566.0,
"repeat_count": 1.0,
- "routers_loss": 0.18504399061203003,
+ "routers_loss": 0.18038256466388702,
"skip_count": 1.0,
"step": 200,
"text_loss": 0.8453494310379028
@@ -1917,13 +1917,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.000402,
- "loss": 0.078,
+ "loss": 0.0801,
"macro_f1": 0.3333333432674408,
"num_tokens": 326385.0,
"repeat_count": 0.0,
- "routers_loss": 0.014647359028458595,
+ "routers_loss": 0.014639763161540031,
"skip_count": 0.0,
"step": 202,
"text_loss": 0.5733131766319275
@@ -1936,13 +1936,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2041015625,
+ "grad_norm": 0.21875,
"learning_rate": 0.00040600000000000006,
- "loss": 0.1028,
+ "loss": 0.104,
"macro_f1": 0.3333333432674408,
"num_tokens": 329266.0,
"repeat_count": 0.0,
- "routers_loss": 0.017848484218120575,
+ "routers_loss": 0.015269627794623375,
"skip_count": 0.0,
"step": 204,
"text_loss": 0.7355639934539795
@@ -1955,13 +1955,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.27734375,
"learning_rate": 0.00041,
- "loss": 0.0832,
+ "loss": 0.0833,
"macro_f1": 0.3333333432674408,
"num_tokens": 332984.0,
"repeat_count": 0.0,
- "routers_loss": 0.01900508813560009,
+ "routers_loss": 0.018046971410512924,
"skip_count": 0.0,
"step": 206,
"text_loss": 0.587641179561615
@@ -1974,13 +1974,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.166015625,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000414,
"loss": 0.0588,
"macro_f1": 0.3272727429866791,
"num_tokens": 335739.0,
"repeat_count": 1.0,
- "routers_loss": 0.13018715381622314,
+ "routers_loss": 0.12791286408901215,
"skip_count": 0.0,
"step": 208,
"text_loss": 0.6538406610488892
@@ -1993,13 +1993,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.24609375,
"learning_rate": 0.00041799999999999997,
- "loss": 0.0697,
+ "loss": 0.0732,
"macro_f1": 0.3272727429866791,
"num_tokens": 338966.0,
"repeat_count": 0.0,
- "routers_loss": 0.055288366973400116,
+ "routers_loss": 0.050490595400333405,
"skip_count": 1.0,
"step": 210,
"text_loss": 0.4188295602798462
@@ -2012,13 +2012,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000422,
- "loss": 0.0576,
+ "loss": 0.0588,
"macro_f1": 0.3144654333591461,
"num_tokens": 342063.0,
"repeat_count": 0.0,
- "routers_loss": 0.10952572524547577,
+ "routers_loss": 0.11652113497257233,
"skip_count": 3.0,
"step": 212,
"text_loss": 0.21822240948677063
@@ -2031,13 +2031,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.197265625,
+ "grad_norm": 0.2060546875,
"learning_rate": 0.000426,
- "loss": 0.062,
+ "loss": 0.0621,
"macro_f1": 0.3333333432674408,
"num_tokens": 344887.0,
"repeat_count": 0.0,
- "routers_loss": 0.02415696159005165,
+ "routers_loss": 0.023898238316178322,
"skip_count": 0.0,
"step": 214,
"text_loss": 0.24692800641059875
@@ -2050,13 +2050,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.3671875,
"learning_rate": 0.00043,
- "loss": 0.1011,
+ "loss": 0.1005,
"macro_f1": 0.3272727429866791,
"num_tokens": 348700.0,
"repeat_count": 1.0,
- "routers_loss": 0.06956391036510468,
+ "routers_loss": 0.06414655596017838,
"skip_count": 0.0,
"step": 216,
"text_loss": 0.4744548797607422
@@ -2069,13 +2069,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.00043400000000000003,
- "loss": 0.076,
+ "loss": 0.0753,
"macro_f1": 0.32098764181137085,
"num_tokens": 351507.0,
"repeat_count": 1.0,
- "routers_loss": 0.1140352189540863,
+ "routers_loss": 0.11702914535999298,
"skip_count": 1.0,
"step": 218,
"text_loss": 0.5614864826202393
@@ -2090,11 +2090,11 @@
"f1_skip": 0.0,
"grad_norm": 0.189453125,
"learning_rate": 0.000438,
- "loss": 0.0788,
+ "loss": 0.0792,
"macro_f1": 0.3333333432674408,
"num_tokens": 354484.0,
"repeat_count": 0.0,
- "routers_loss": 0.011621571145951748,
+ "routers_loss": 0.014991643838584423,
"skip_count": 0.0,
"step": 220,
"text_loss": 0.47209832072257996
@@ -2107,13 +2107,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.251953125,
"learning_rate": 0.000442,
"loss": 0.106,
"macro_f1": 0.3272727429866791,
"num_tokens": 357954.0,
"repeat_count": 0.0,
- "routers_loss": 0.05813701078295708,
+ "routers_loss": 0.04747112840414047,
"skip_count": 1.0,
"step": 222,
"text_loss": 0.2968728244304657
@@ -2126,13 +2126,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.357421875,
+ "grad_norm": 0.40234375,
"learning_rate": 0.000446,
- "loss": 0.0827,
+ "loss": 0.0853,
"macro_f1": 0.32098764181137085,
"num_tokens": 360547.0,
"repeat_count": 0.0,
- "routers_loss": 0.0646885335445404,
+ "routers_loss": 0.06754162162542343,
"skip_count": 2.0,
"step": 224,
"text_loss": 0.2364148646593094
@@ -2145,13 +2145,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.244140625,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.00045000000000000004,
- "loss": 0.1011,
+ "loss": 0.1016,
"macro_f1": 0.3272727429866791,
"num_tokens": 364529.0,
"repeat_count": 0.0,
- "routers_loss": 0.07224348932504654,
+ "routers_loss": 0.07830183953046799,
"skip_count": 1.0,
"step": 226,
"text_loss": 0.4787476360797882
@@ -2164,13 +2164,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.1953125,
"learning_rate": 0.00045400000000000003,
- "loss": 0.0781,
+ "loss": 0.0792,
"macro_f1": 0.3333333432674408,
"num_tokens": 367683.0,
"repeat_count": 0.0,
- "routers_loss": 0.015971746295690536,
+ "routers_loss": 0.015735948458313942,
"skip_count": 0.0,
"step": 228,
"text_loss": 0.37148505449295044
@@ -2183,13 +2183,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.25,
"learning_rate": 0.000458,
- "loss": 0.099,
+ "loss": 0.0995,
"macro_f1": 0.3333333432674408,
"num_tokens": 371402.0,
"repeat_count": 0.0,
- "routers_loss": 0.017818331718444824,
+ "routers_loss": 0.013354359194636345,
"skip_count": 0.0,
"step": 230,
"text_loss": 0.7464763522148132
@@ -2202,13 +2202,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.000462,
- "loss": 0.0757,
+ "loss": 0.0731,
"macro_f1": 0.3333333432674408,
"num_tokens": 374587.0,
"repeat_count": 0.0,
- "routers_loss": 0.01582280732691288,
+ "routers_loss": 0.013763721100986004,
"skip_count": 0.0,
"step": 232,
"text_loss": 0.8754443526268005
@@ -2221,13 +2221,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.42578125,
+ "grad_norm": 0.3984375,
"learning_rate": 0.00046600000000000005,
- "loss": 0.0876,
+ "loss": 0.0861,
"macro_f1": 0.3333333432674408,
"num_tokens": 377513.0,
"repeat_count": 0.0,
- "routers_loss": 0.011417915113270283,
+ "routers_loss": 0.010075435042381287,
"skip_count": 0.0,
"step": 234,
"text_loss": 0.31534913182258606
@@ -2240,13 +2240,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.17578125,
"learning_rate": 0.00047,
- "loss": 0.0801,
+ "loss": 0.0791,
"macro_f1": 0.3272727429866791,
"num_tokens": 380736.0,
"repeat_count": 0.0,
- "routers_loss": 0.05787832289934158,
+ "routers_loss": 0.059825167059898376,
"skip_count": 1.0,
"step": 236,
"text_loss": 0.5936337113380432
@@ -2259,13 +2259,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.236328125,
+ "grad_norm": 0.267578125,
"learning_rate": 0.000474,
- "loss": 0.0508,
+ "loss": 0.0514,
"macro_f1": 0.32098764181137085,
"num_tokens": 383236.0,
"repeat_count": 0.0,
- "routers_loss": 0.09476690739393234,
+ "routers_loss": 0.09134846180677414,
"skip_count": 2.0,
"step": 238,
"text_loss": 0.5976157784461975
@@ -2278,13 +2278,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.208984375,
"learning_rate": 0.00047799999999999996,
- "loss": 0.0833,
+ "loss": 0.0858,
"macro_f1": 0.32098764181137085,
"num_tokens": 385778.0,
"repeat_count": 1.0,
- "routers_loss": 0.1099705696105957,
+ "routers_loss": 0.11989791691303253,
"skip_count": 1.0,
"step": 240,
"text_loss": 0.3554210960865021
@@ -2297,13 +2297,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.171875,
"learning_rate": 0.000482,
- "loss": 0.0745,
+ "loss": 0.0734,
"macro_f1": 0.3333333432674408,
"num_tokens": 388777.0,
"repeat_count": 0.0,
- "routers_loss": 0.01269970741122961,
+ "routers_loss": 0.013591105118393898,
"skip_count": 0.0,
"step": 242,
"text_loss": 0.4829460382461548
@@ -2316,13 +2316,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11962890625,
+ "grad_norm": 0.12060546875,
"learning_rate": 0.000486,
- "loss": 0.061,
+ "loss": 0.0625,
"macro_f1": 0.32098764181137085,
"num_tokens": 391797.0,
"repeat_count": 0.0,
- "routers_loss": 0.08505752682685852,
+ "routers_loss": 0.0920003354549408,
"skip_count": 2.0,
"step": 244,
"text_loss": 0.3085818886756897
@@ -2335,13 +2335,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.00049,
- "loss": 0.0504,
+ "loss": 0.0501,
"macro_f1": 0.3333333432674408,
"num_tokens": 396485.0,
"repeat_count": 0.0,
- "routers_loss": 0.012750142253935337,
+ "routers_loss": 0.0129330949857831,
"skip_count": 0.0,
"step": 246,
"text_loss": 0.42803969979286194
@@ -2354,13 +2354,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.291015625,
+ "grad_norm": 0.296875,
"learning_rate": 0.000494,
- "loss": 0.0962,
+ "loss": 0.0945,
"macro_f1": 0.3144654333591461,
"num_tokens": 399923.0,
"repeat_count": 0.0,
- "routers_loss": 0.11287309974431992,
+ "routers_loss": 0.10677755624055862,
"skip_count": 3.0,
"step": 248,
"text_loss": 0.2908555567264557
@@ -2373,32 +2373,32 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.203125,
"learning_rate": 0.000498,
- "loss": 0.0821,
+ "loss": 0.0812,
"macro_f1": 0.3144654333591461,
"num_tokens": 403647.0,
"repeat_count": 0.0,
- "routers_loss": 0.1486474722623825,
+ "routers_loss": 0.1504337340593338,
"skip_count": 3.0,
"step": 250,
"text_loss": 0.333095908164978
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 1.183152333431171,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
+ "f1_skip": 0.0,
"grad_norm": 0.22265625,
"learning_rate": 0.0005020000000000001,
- "loss": 0.0832,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0828,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 409147.0,
"repeat_count": 0.0,
- "routers_loss": 0.06636594980955124,
+ "routers_loss": 0.06503184884786606,
"skip_count": 2.0,
"step": 252,
"text_loss": 0.16117942333221436
@@ -2411,13 +2411,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.267578125,
+ "grad_norm": 0.287109375,
"learning_rate": 0.000506,
- "loss": 0.1,
+ "loss": 0.0995,
"macro_f1": 0.3333333432674408,
"num_tokens": 412072.0,
"repeat_count": 0.0,
- "routers_loss": 0.015062150545418262,
+ "routers_loss": 0.016280122101306915,
"skip_count": 0.0,
"step": 254,
"text_loss": 0.4217492640018463
@@ -2430,13 +2430,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2138671875,
+ "grad_norm": 0.21484375,
"learning_rate": 0.00051,
- "loss": 0.0808,
+ "loss": 0.0803,
"macro_f1": 0.3144654333591461,
"num_tokens": 415052.0,
"repeat_count": 2.0,
- "routers_loss": 0.2051105946302414,
+ "routers_loss": 0.2117508500814438,
"skip_count": 1.0,
"step": 256,
"text_loss": 0.5795308947563171
@@ -2449,13 +2449,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "grad_norm": 0.2421875,
"learning_rate": 0.000514,
- "loss": 0.068,
+ "loss": 0.0668,
"macro_f1": 0.3272727429866791,
"num_tokens": 418099.0,
"repeat_count": 1.0,
- "routers_loss": 0.1467045396566391,
+ "routers_loss": 0.15002092719078064,
"skip_count": 0.0,
"step": 258,
"text_loss": 0.4840938448905945
@@ -2468,13 +2468,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.000518,
- "loss": 0.0543,
+ "loss": 0.0538,
"macro_f1": 0.3333333432674408,
"num_tokens": 422526.0,
"repeat_count": 0.0,
- "routers_loss": 0.013022038154304028,
+ "routers_loss": 0.012834074907004833,
"skip_count": 0.0,
"step": 260,
"text_loss": 0.36141225695610046
@@ -2487,13 +2487,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.000522,
- "loss": 0.0848,
+ "loss": 0.085,
"macro_f1": 0.3076923191547394,
"num_tokens": 425765.0,
"repeat_count": 2.0,
- "routers_loss": 0.2575930058956146,
+ "routers_loss": 0.23808011412620544,
"skip_count": 2.0,
"step": 262,
"text_loss": 0.27572691440582275
@@ -2506,13 +2506,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000526,
- "loss": 0.07,
+ "loss": 0.0708,
"macro_f1": 0.3272727429866791,
"num_tokens": 429048.0,
"repeat_count": 0.0,
- "routers_loss": 0.0558602549135685,
+ "routers_loss": 0.055687375366687775,
"skip_count": 1.0,
"step": 264,
"text_loss": 0.37020301818847656
@@ -2525,13 +2525,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.0005300000000000001,
- "loss": 0.082,
+ "loss": 0.0839,
"macro_f1": 0.3272727429866791,
"num_tokens": 431784.0,
"repeat_count": 0.0,
- "routers_loss": 0.09126655012369156,
+ "routers_loss": 0.0872957780957222,
"skip_count": 1.0,
"step": 266,
"text_loss": 0.5937283039093018
@@ -2544,13 +2544,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.263671875,
"learning_rate": 0.0005340000000000001,
- "loss": 0.0764,
+ "loss": 0.0733,
"macro_f1": 0.32098764181137085,
"num_tokens": 434297.0,
"repeat_count": 2.0,
- "routers_loss": 0.24805288016796112,
+ "routers_loss": 0.23507654666900635,
"skip_count": 0.0,
"step": 268,
"text_loss": 0.3367372453212738
@@ -2563,13 +2563,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.2431640625,
"learning_rate": 0.0005380000000000001,
- "loss": 0.0686,
+ "loss": 0.0708,
"macro_f1": 0.32098764181137085,
"num_tokens": 437586.0,
"repeat_count": 0.0,
- "routers_loss": 0.13135533034801483,
+ "routers_loss": 0.12860390543937683,
"skip_count": 2.0,
"step": 270,
"text_loss": 0.7149854302406311
@@ -2582,13 +2582,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.2451171875,
"learning_rate": 0.0005420000000000001,
- "loss": 0.1083,
+ "loss": 0.1072,
"macro_f1": 0.3272727429866791,
"num_tokens": 440649.0,
"repeat_count": 0.0,
- "routers_loss": 0.04991440102458,
+ "routers_loss": 0.044308312237262726,
"skip_count": 1.0,
"step": 272,
"text_loss": 0.26778292655944824
@@ -2601,13 +2601,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.455078125,
+ "grad_norm": 0.44921875,
"learning_rate": 0.000546,
- "loss": 0.0991,
+ "loss": 0.0938,
"macro_f1": 0.3144654333591461,
"num_tokens": 443907.0,
"repeat_count": 0.0,
- "routers_loss": 0.12236632406711578,
+ "routers_loss": 0.11514109373092651,
"skip_count": 3.0,
"step": 274,
"text_loss": 0.23578761518001556
@@ -2620,13 +2620,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.25,
+ "grad_norm": 0.2578125,
"learning_rate": 0.00055,
- "loss": 0.0936,
+ "loss": 0.0932,
"macro_f1": 0.5492662787437439,
"num_tokens": 447147.0,
"repeat_count": 0.0,
- "routers_loss": 0.053506772965192795,
+ "routers_loss": 0.055705297738313675,
"skip_count": 2.0,
"step": 276,
"text_loss": 0.2513524889945984
@@ -2639,13 +2639,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.265625,
+ "grad_norm": 0.29296875,
"learning_rate": 0.000554,
- "loss": 0.066,
+ "loss": 0.0667,
"macro_f1": 0.32098764181137085,
"num_tokens": 450032.0,
"repeat_count": 0.0,
- "routers_loss": 0.13446088135242462,
+ "routers_loss": 0.13778971135616302,
"skip_count": 2.0,
"step": 278,
"text_loss": 0.4857243597507477
@@ -2658,32 +2658,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000558,
- "loss": 0.0682,
+ "loss": 0.0672,
"macro_f1": 0.3272727429866791,
"num_tokens": 453195.0,
"repeat_count": 1.0,
- "routers_loss": 0.07270720601081848,
+ "routers_loss": 0.0700262188911438,
"skip_count": 0.0,
"step": 280,
"text_loss": 0.7589789628982544
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 1.3240387437628411,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.28125,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25,
"learning_rate": 0.0005620000000000001,
- "loss": 0.0648,
- "macro_f1": 0.5427350401878357,
+ "loss": 0.0603,
+ "macro_f1": 0.3144654333591461,
"num_tokens": 455942.0,
"repeat_count": 1.0,
- "routers_loss": 0.13866399228572845,
+ "routers_loss": 0.11706235259771347,
"skip_count": 2.0,
"step": 282,
"text_loss": 0.4783432185649872
@@ -2696,13 +2696,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.236328125,
+ "grad_norm": 0.265625,
"learning_rate": 0.000566,
- "loss": 0.0782,
+ "loss": 0.0793,
"macro_f1": 0.3272727429866791,
"num_tokens": 458932.0,
"repeat_count": 0.0,
- "routers_loss": 0.0645354762673378,
+ "routers_loss": 0.07073967158794403,
"skip_count": 1.0,
"step": 284,
"text_loss": 0.7117193937301636
@@ -2715,13 +2715,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.00057,
- "loss": 0.0892,
+ "loss": 0.0915,
"macro_f1": 0.3272727429866791,
"num_tokens": 462650.0,
"repeat_count": 0.0,
- "routers_loss": 0.05967628210783005,
+ "routers_loss": 0.05301115661859512,
"skip_count": 1.0,
"step": 286,
"text_loss": 0.4175460636615753
@@ -2734,13 +2734,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23828125,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000574,
- "loss": 0.0676,
+ "loss": 0.0675,
"macro_f1": 0.3272727429866791,
"num_tokens": 466290.0,
"repeat_count": 0.0,
- "routers_loss": 0.06438407301902771,
+ "routers_loss": 0.06356479972600937,
"skip_count": 1.0,
"step": 288,
"text_loss": 0.5832946300506592
@@ -2753,13 +2753,13 @@
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.28515625,
"learning_rate": 0.000578,
- "loss": 0.0781,
+ "loss": 0.0805,
"macro_f1": 0.3006536066532135,
"num_tokens": 469296.0,
"repeat_count": 1.0,
- "routers_loss": 0.21225209534168243,
+ "routers_loss": 0.21032999455928802,
"skip_count": 3.0,
"step": 290,
"text_loss": 0.36023473739624023
@@ -2772,13 +2772,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.244140625,
+ "grad_norm": 0.27734375,
"learning_rate": 0.0005819999999999999,
- "loss": 0.0664,
+ "loss": 0.0685,
"macro_f1": 0.32098764181137085,
"num_tokens": 472272.0,
"repeat_count": 1.0,
- "routers_loss": 0.08085516840219498,
+ "routers_loss": 0.08062280714511871,
"skip_count": 1.0,
"step": 292,
"text_loss": 0.37197956442832947
@@ -2791,13 +2791,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.28125,
"learning_rate": 0.0005859999999999999,
- "loss": 0.0874,
+ "loss": 0.0878,
"macro_f1": 0.32098764181137085,
"num_tokens": 475864.0,
"repeat_count": 0.0,
- "routers_loss": 0.05378658324480057,
+ "routers_loss": 0.05023600533604622,
"skip_count": 2.0,
"step": 294,
"text_loss": 0.4765273630619049
@@ -2810,13 +2810,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.00059,
- "loss": 0.0715,
+ "loss": 0.0728,
"macro_f1": 0.3333333432674408,
"num_tokens": 478916.0,
"repeat_count": 0.0,
- "routers_loss": 0.01145261898636818,
+ "routers_loss": 0.011689410544931889,
"skip_count": 0.0,
"step": 296,
"text_loss": 0.5878773927688599
@@ -2831,11 +2831,11 @@
"f1_skip": 0.0,
"grad_norm": 0.15625,
"learning_rate": 0.000594,
- "loss": 0.0737,
+ "loss": 0.0727,
"macro_f1": 0.3333333432674408,
"num_tokens": 482369.0,
"repeat_count": 0.0,
- "routers_loss": 0.009397956542670727,
+ "routers_loss": 0.010772093199193478,
"skip_count": 0.0,
"step": 298,
"text_loss": 0.4424116313457489
@@ -2848,13 +2848,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.181640625,
"learning_rate": 0.000598,
- "loss": 0.0802,
+ "loss": 0.0787,
"macro_f1": 0.3076923191547394,
"num_tokens": 486049.0,
"repeat_count": 2.0,
- "routers_loss": 0.2389357089996338,
+ "routers_loss": 0.23482851684093475,
"skip_count": 2.0,
"step": 300,
"text_loss": 0.21217775344848633
@@ -2862,18 +2862,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 1.417963017317288,
- "f1_execute": 0.9019607901573181,
+ "f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.000602,
- "loss": 0.0745,
- "macro_f1": 0.3006536066532135,
+ "loss": 0.073,
+ "macro_f1": 0.3076923191547394,
"num_tokens": 488683.0,
"repeat_count": 1.0,
- "routers_loss": 0.18252353370189667,
+ "routers_loss": 0.18843084573745728,
"skip_count": 3.0,
"step": 302,
"text_loss": 0.2109498232603073
@@ -2886,13 +2886,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.27734375,
+ "grad_norm": 0.279296875,
"learning_rate": 0.000606,
- "loss": 0.0935,
+ "loss": 0.0945,
"macro_f1": 0.3144654333591461,
"num_tokens": 492010.0,
"repeat_count": 0.0,
- "routers_loss": 0.18185268342494965,
+ "routers_loss": 0.17861786484718323,
"skip_count": 3.0,
"step": 304,
"text_loss": 0.8446305394172668
@@ -2905,13 +2905,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.00061,
- "loss": 0.0853,
+ "loss": 0.0827,
"macro_f1": 0.3333333432674408,
"num_tokens": 494764.0,
"repeat_count": 0.0,
- "routers_loss": 0.013210167177021503,
+ "routers_loss": 0.014124520123004913,
"skip_count": 0.0,
"step": 306,
"text_loss": 0.742735743522644
@@ -2924,13 +2924,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.26953125,
"learning_rate": 0.000614,
- "loss": 0.1089,
+ "loss": 0.1071,
"macro_f1": 0.3333333432674408,
"num_tokens": 497820.0,
"repeat_count": 0.0,
- "routers_loss": 0.016936838626861572,
+ "routers_loss": 0.017968112602829933,
"skip_count": 0.0,
"step": 308,
"text_loss": 0.28305482864379883
@@ -2943,13 +2943,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.0006180000000000001,
- "loss": 0.077,
+ "loss": 0.0775,
"macro_f1": 0.32098764181137085,
"num_tokens": 500694.0,
"repeat_count": 0.0,
- "routers_loss": 0.08630389720201492,
+ "routers_loss": 0.08593655377626419,
"skip_count": 2.0,
"step": 310,
"text_loss": 0.3496848940849304
@@ -2962,13 +2962,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.19140625,
"learning_rate": 0.000622,
- "loss": 0.0602,
+ "loss": 0.061,
"macro_f1": 0.3333333432674408,
"num_tokens": 503871.0,
"repeat_count": 0.0,
- "routers_loss": 0.013665963895618916,
+ "routers_loss": 0.016449492424726486,
"skip_count": 0.0,
"step": 312,
"text_loss": 0.6691372990608215
@@ -2981,13 +2981,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.205078125,
"learning_rate": 0.000626,
- "loss": 0.0794,
+ "loss": 0.0815,
"macro_f1": 0.3333333432674408,
"num_tokens": 506730.0,
"repeat_count": 0.0,
- "routers_loss": 0.01584783010184765,
+ "routers_loss": 0.014532964676618576,
"skip_count": 0.0,
"step": 314,
"text_loss": 0.6118118166923523
@@ -3000,13 +3000,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.224609375,
+ "grad_norm": 0.2216796875,
"learning_rate": 0.00063,
- "loss": 0.0762,
+ "loss": 0.0742,
"macro_f1": 0.3333333432674408,
"num_tokens": 510323.0,
"repeat_count": 0.0,
- "routers_loss": 0.01368923019617796,
+ "routers_loss": 0.013093139044940472,
"skip_count": 0.0,
"step": 316,
"text_loss": 0.38126271963119507
@@ -3019,13 +3019,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.388671875,
+ "grad_norm": 0.400390625,
"learning_rate": 0.000634,
- "loss": 0.0908,
+ "loss": 0.0915,
"macro_f1": 0.3333333432674408,
"num_tokens": 514075.0,
"repeat_count": 0.0,
- "routers_loss": 0.009135022759437561,
+ "routers_loss": 0.008627045899629593,
"skip_count": 0.0,
"step": 318,
"text_loss": 0.5983037948608398
@@ -3038,13 +3038,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000638,
- "loss": 0.0949,
+ "loss": 0.1008,
"macro_f1": 0.3272727429866791,
"num_tokens": 517418.0,
"repeat_count": 0.0,
- "routers_loss": 0.046641621738672256,
+ "routers_loss": 0.04561378434300423,
"skip_count": 1.0,
"step": 320,
"text_loss": 0.767257034778595
@@ -3052,18 +3052,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.5118872908717347,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23046875,
+ "grad_norm": 0.259765625,
"learning_rate": 0.000642,
- "loss": 0.0925,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0926,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 520443.0,
"repeat_count": 0.0,
- "routers_loss": 0.020637936890125275,
+ "routers_loss": 0.024372953921556473,
"skip_count": 0.0,
"step": 322,
"text_loss": 0.6572105884552002
@@ -3076,13 +3076,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26953125,
+ "grad_norm": 0.30078125,
"learning_rate": 0.000646,
"loss": 0.0822,
"macro_f1": 0.3272727429866791,
"num_tokens": 523317.0,
"repeat_count": 1.0,
- "routers_loss": 0.08289298415184021,
+ "routers_loss": 0.08099937438964844,
"skip_count": 0.0,
"step": 324,
"text_loss": 0.205499529838562
@@ -3090,18 +3090,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.530672145582624,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23828125,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.0006500000000000001,
- "loss": 0.0823,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0809,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 526355.0,
"repeat_count": 0.0,
- "routers_loss": 0.06960040330886841,
+ "routers_loss": 0.0657225176692009,
"skip_count": 1.0,
"step": 326,
"text_loss": 0.2587239742279053
@@ -3114,13 +3114,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1162109375,
+ "grad_norm": 0.111328125,
"learning_rate": 0.0006540000000000001,
- "loss": 0.0799,
+ "loss": 0.0779,
"macro_f1": 0.3333333432674408,
"num_tokens": 529689.0,
"repeat_count": 0.0,
- "routers_loss": 0.02087482251226902,
+ "routers_loss": 0.01849208027124405,
"skip_count": 0.0,
"step": 328,
"text_loss": 0.2172023057937622
@@ -3133,13 +3133,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.1845703125,
"learning_rate": 0.0006580000000000001,
- "loss": 0.0757,
+ "loss": 0.0758,
"macro_f1": 0.3333333432674408,
"num_tokens": 532603.0,
"repeat_count": 0.0,
- "routers_loss": 0.016592051833868027,
+ "routers_loss": 0.016184113919734955,
"skip_count": 0.0,
"step": 330,
"text_loss": 0.5980568528175354
@@ -3152,32 +3152,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.220703125,
"learning_rate": 0.000662,
- "loss": 0.0438,
+ "loss": 0.0439,
"macro_f1": 0.3333333432674408,
"num_tokens": 536056.0,
"repeat_count": 0.0,
- "routers_loss": 0.012950568459928036,
+ "routers_loss": 0.01303898449987173,
"skip_count": 0.0,
"step": 332,
"text_loss": 0.5421966314315796
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 1.5682418550044028,
- "f1_execute": 0.8799999952316284,
+ "f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.310546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.296875,
"learning_rate": 0.000666,
- "loss": 0.0964,
- "macro_f1": 0.29333335161209106,
+ "loss": 0.0963,
+ "macro_f1": 0.465986430644989,
"num_tokens": 539231.0,
"repeat_count": 3.0,
- "routers_loss": 0.3373340964317322,
+ "routers_loss": 0.3075675964355469,
"skip_count": 3.0,
"step": 334,
"text_loss": 0.19719554483890533
@@ -3190,13 +3190,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.171875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.00067,
"loss": 0.0706,
"macro_f1": 0.3333333432674408,
"num_tokens": 542038.0,
"repeat_count": 0.0,
- "routers_loss": 0.008110735565423965,
+ "routers_loss": 0.009116224013268948,
"skip_count": 0.0,
"step": 336,
"text_loss": 0.3407036066055298
@@ -3209,13 +3209,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.248046875,
+ "grad_norm": 0.2421875,
"learning_rate": 0.000674,
- "loss": 0.0771,
+ "loss": 0.0768,
"macro_f1": 0.3333333432674408,
"num_tokens": 545019.0,
"repeat_count": 0.0,
- "routers_loss": 0.01841609925031662,
+ "routers_loss": 0.021463042125105858,
"skip_count": 0.0,
"step": 338,
"text_loss": 0.24486012756824493
@@ -3228,13 +3228,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.0006780000000000001,
- "loss": 0.0894,
+ "loss": 0.0889,
"macro_f1": 0.3333333432674408,
"num_tokens": 548036.0,
"repeat_count": 0.0,
- "routers_loss": 0.01612614095211029,
+ "routers_loss": 0.01857556402683258,
"skip_count": 0.0,
"step": 340,
"text_loss": 0.28140124678611755
@@ -3247,13 +3247,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0006820000000000001,
- "loss": 0.0611,
+ "loss": 0.0617,
"macro_f1": 0.3006536364555359,
"num_tokens": 551419.0,
"repeat_count": 2.0,
- "routers_loss": 0.26202192902565,
+ "routers_loss": 0.27090007066726685,
"skip_count": 3.0,
"step": 342,
"text_loss": 0.20690307021141052
@@ -3266,13 +3266,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.3046875,
"learning_rate": 0.0006860000000000001,
- "loss": 0.1013,
+ "loss": 0.1047,
"macro_f1": 0.32098764181137085,
"num_tokens": 554037.0,
"repeat_count": 0.0,
- "routers_loss": 0.09235779196023941,
+ "routers_loss": 0.09231195598840714,
"skip_count": 2.0,
"step": 344,
"text_loss": 0.4479128420352936
@@ -3285,13 +3285,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.255859375,
"learning_rate": 0.00069,
- "loss": 0.0856,
+ "loss": 0.0883,
"macro_f1": 0.3333333432674408,
"num_tokens": 556672.0,
"repeat_count": 0.0,
- "routers_loss": 0.010735333897173405,
+ "routers_loss": 0.00935924518853426,
"skip_count": 0.0,
"step": 346,
"text_loss": 0.6377320289611816
@@ -3304,13 +3304,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.000694,
- "loss": 0.0778,
+ "loss": 0.0781,
"macro_f1": 0.32098764181137085,
"num_tokens": 559756.0,
"repeat_count": 0.0,
- "routers_loss": 0.14742356538772583,
+ "routers_loss": 0.17641772329807281,
"skip_count": 2.0,
"step": 348,
"text_loss": 0.6097636222839355
@@ -3323,13 +3323,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.30859375,
+ "grad_norm": 0.30078125,
"learning_rate": 0.0006979999999999999,
- "loss": 0.0614,
+ "loss": 0.0616,
"macro_f1": 0.5492662787437439,
"num_tokens": 563415.0,
"repeat_count": 0.0,
- "routers_loss": 0.06606879830360413,
+ "routers_loss": 0.06240406632423401,
"skip_count": 2.0,
"step": 350,
"text_loss": 0.5291631817817688
@@ -3342,13 +3342,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.322265625,
+ "grad_norm": 0.296875,
"learning_rate": 0.0007019999999999999,
- "loss": 0.1033,
+ "loss": 0.1026,
"macro_f1": 0.3333333432674408,
"num_tokens": 566357.0,
"repeat_count": 0.0,
- "routers_loss": 0.012873432599008083,
+ "routers_loss": 0.012269247323274612,
"skip_count": 0.0,
"step": 352,
"text_loss": 0.5170195698738098
@@ -3361,13 +3361,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0007059999999999999,
- "loss": 0.0819,
+ "loss": 0.0815,
"macro_f1": 0.32098764181137085,
"num_tokens": 569449.0,
"repeat_count": 0.0,
- "routers_loss": 0.07853665202856064,
+ "routers_loss": 0.07515309751033783,
"skip_count": 2.0,
"step": 354,
"text_loss": 0.34507250785827637
@@ -3380,13 +3380,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.251953125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.00071,
- "loss": 0.0804,
+ "loss": 0.0791,
"macro_f1": 0.3144654333591461,
"num_tokens": 572761.0,
"repeat_count": 1.0,
- "routers_loss": 0.2216549813747406,
+ "routers_loss": 0.20768006145954132,
"skip_count": 2.0,
"step": 356,
"text_loss": 0.3158532381057739
@@ -3399,13 +3399,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.185546875,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.000714,
- "loss": 0.0675,
+ "loss": 0.0682,
"macro_f1": 0.3333333432674408,
"num_tokens": 575909.0,
"repeat_count": 0.0,
- "routers_loss": 0.02423691377043724,
+ "routers_loss": 0.025329967960715294,
"skip_count": 0.0,
"step": 358,
"text_loss": 0.21455390751361847
@@ -3413,18 +3413,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.6903434106251836,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.21484375,
"learning_rate": 0.000718,
- "loss": 0.0781,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0775,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 579186.0,
"repeat_count": 1.0,
- "routers_loss": 0.07496294379234314,
+ "routers_loss": 0.07676175981760025,
"skip_count": 0.0,
"step": 360,
"text_loss": 0.61895352602005
@@ -3437,13 +3437,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2138671875,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000722,
- "loss": 0.0778,
+ "loss": 0.0781,
"macro_f1": 0.32098767161369324,
"num_tokens": 582437.0,
"repeat_count": 0.0,
- "routers_loss": 0.08181872963905334,
+ "routers_loss": 0.08070661872625351,
"skip_count": 1.0,
"step": 362,
"text_loss": 0.20557661354541779
@@ -3456,13 +3456,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2216796875,
"learning_rate": 0.000726,
- "loss": 0.1112,
+ "loss": 0.11,
"macro_f1": 0.3333333432674408,
"num_tokens": 586096.0,
"repeat_count": 0.0,
- "routers_loss": 0.016959719359874725,
+ "routers_loss": 0.015891313552856445,
"skip_count": 0.0,
"step": 364,
"text_loss": 0.597991943359375
@@ -3475,13 +3475,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.15625,
"learning_rate": 0.00073,
- "loss": 0.0577,
+ "loss": 0.0573,
"macro_f1": 0.3076923191547394,
"num_tokens": 589520.0,
"repeat_count": 1.0,
- "routers_loss": 0.13295969367027283,
+ "routers_loss": 0.12844261527061462,
"skip_count": 3.0,
"step": 366,
"text_loss": 0.2944789230823517
@@ -3494,13 +3494,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1455078125,
+ "grad_norm": 0.150390625,
"learning_rate": 0.000734,
- "loss": 0.0986,
+ "loss": 0.1005,
"macro_f1": 0.3333333432674408,
"num_tokens": 592691.0,
"repeat_count": 0.0,
- "routers_loss": 0.02476893551647663,
+ "routers_loss": 0.02382199838757515,
"skip_count": 0.0,
"step": 368,
"text_loss": 0.23989969491958618
@@ -3513,13 +3513,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1796875,
"learning_rate": 0.000738,
- "loss": 0.0682,
+ "loss": 0.0661,
"macro_f1": 0.3333333432674408,
"num_tokens": 596004.0,
"repeat_count": 0.0,
- "routers_loss": 0.019863395020365715,
+ "routers_loss": 0.018812084570527077,
"skip_count": 0.0,
"step": 370,
"text_loss": 0.22111408412456512
@@ -3532,13 +3532,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.000742,
- "loss": 0.0663,
+ "loss": 0.0666,
"macro_f1": 0.3272727429866791,
"num_tokens": 599087.0,
"repeat_count": 0.0,
- "routers_loss": 0.07230417430400848,
+ "routers_loss": 0.08290331065654755,
"skip_count": 1.0,
"step": 372,
"text_loss": 0.2567356526851654
@@ -3551,13 +3551,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.000746,
- "loss": 0.0986,
+ "loss": 0.0941,
"macro_f1": 0.32098764181137085,
"num_tokens": 602330.0,
"repeat_count": 1.0,
- "routers_loss": 0.11727793514728546,
+ "routers_loss": 0.11482042074203491,
"skip_count": 1.0,
"step": 374,
"text_loss": 0.7217292785644531
@@ -3570,13 +3570,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.224609375,
+ "grad_norm": 0.2265625,
"learning_rate": 0.00075,
- "loss": 0.0724,
+ "loss": 0.0728,
"macro_f1": 0.3272727429866791,
"num_tokens": 605503.0,
"repeat_count": 1.0,
- "routers_loss": 0.13495951890945435,
+ "routers_loss": 0.11849870532751083,
"skip_count": 0.0,
"step": 376,
"text_loss": 0.5122153759002686
@@ -3589,13 +3589,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23046875,
+ "grad_norm": 0.2333984375,
"learning_rate": 0.000754,
- "loss": 0.0823,
+ "loss": 0.0835,
"macro_f1": 0.32098767161369324,
"num_tokens": 608505.0,
"repeat_count": 0.0,
- "routers_loss": 0.07612533867359161,
+ "routers_loss": 0.07090992480516434,
"skip_count": 1.0,
"step": 378,
"text_loss": 0.2204965502023697
@@ -3608,13 +3608,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.1826171875,
"learning_rate": 0.000758,
- "loss": 0.0803,
+ "loss": 0.0794,
"macro_f1": 0.3272727429866791,
"num_tokens": 611193.0,
"repeat_count": 0.0,
- "routers_loss": 0.0484120175242424,
+ "routers_loss": 0.03812089189887047,
"skip_count": 1.0,
"step": 380,
"text_loss": 0.44909021258354187
@@ -3627,13 +3627,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.000762,
- "loss": 0.0866,
+ "loss": 0.0882,
"macro_f1": 0.3272727429866791,
"num_tokens": 614231.0,
"repeat_count": 1.0,
- "routers_loss": 0.10939671844244003,
+ "routers_loss": 0.10270529240369797,
"skip_count": 0.0,
"step": 382,
"text_loss": 0.13624964654445648
@@ -3646,13 +3646,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.326171875,
+ "grad_norm": 0.330078125,
"learning_rate": 0.0007660000000000001,
- "loss": 0.1083,
+ "loss": 0.1107,
"macro_f1": 0.32098764181137085,
"num_tokens": 617090.0,
"repeat_count": 1.0,
- "routers_loss": 0.11382336914539337,
+ "routers_loss": 0.11624004691839218,
"skip_count": 1.0,
"step": 384,
"text_loss": 0.7314052581787109
@@ -3667,11 +3667,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1396484375,
"learning_rate": 0.0007700000000000001,
- "loss": 0.0616,
+ "loss": 0.0628,
"macro_f1": 0.32098764181137085,
"num_tokens": 620596.0,
"repeat_count": 0.0,
- "routers_loss": 0.07494530081748962,
+ "routers_loss": 0.07114322483539581,
"skip_count": 2.0,
"step": 386,
"text_loss": 0.503322958946228
@@ -3684,13 +3684,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.298828125,
+ "grad_norm": 0.306640625,
"learning_rate": 0.0007740000000000001,
- "loss": 0.0816,
+ "loss": 0.0829,
"macro_f1": 0.32098764181137085,
"num_tokens": 624108.0,
"repeat_count": 0.0,
- "routers_loss": 0.05718417093157768,
+ "routers_loss": 0.06061873584985733,
"skip_count": 2.0,
"step": 388,
"text_loss": 0.11481904983520508
@@ -3703,13 +3703,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.2099609375,
"learning_rate": 0.000778,
- "loss": 0.0783,
+ "loss": 0.0791,
"macro_f1": 0.3006536364555359,
"num_tokens": 626895.0,
"repeat_count": 1.0,
- "routers_loss": 0.2848989963531494,
+ "routers_loss": 0.2921771705150604,
"skip_count": 4.0,
"step": 390,
"text_loss": 0.3069624602794647
@@ -3722,13 +3722,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.30078125,
+ "grad_norm": 0.30859375,
"learning_rate": 0.000782,
- "loss": 0.0608,
+ "loss": 0.0605,
"macro_f1": 0.3076923191547394,
"num_tokens": 630204.0,
"repeat_count": 0.0,
- "routers_loss": 0.2050076276063919,
+ "routers_loss": 0.202707901597023,
"skip_count": 4.0,
"step": 392,
"text_loss": 0.6022785305976868
@@ -3741,13 +3741,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.29296875,
"learning_rate": 0.000786,
- "loss": 0.0863,
+ "loss": 0.0877,
"macro_f1": 0.3333333432674408,
"num_tokens": 634373.0,
"repeat_count": 0.0,
- "routers_loss": 0.020946886390447617,
+ "routers_loss": 0.0221510399132967,
"skip_count": 0.0,
"step": 394,
"text_loss": 0.26787394285202026
@@ -3760,13 +3760,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.376953125,
+ "grad_norm": 0.37890625,
"learning_rate": 0.00079,
- "loss": 0.0798,
+ "loss": 0.0805,
"macro_f1": 0.32098764181137085,
"num_tokens": 637442.0,
"repeat_count": 2.0,
- "routers_loss": 0.1270289123058319,
+ "routers_loss": 0.12636390328407288,
"skip_count": 0.0,
"step": 396,
"text_loss": 0.2799781560897827
@@ -3779,13 +3779,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.0007940000000000001,
- "loss": 0.0701,
+ "loss": 0.0724,
"macro_f1": 0.32098764181137085,
"num_tokens": 641231.0,
"repeat_count": 0.0,
- "routers_loss": 0.08012636005878448,
+ "routers_loss": 0.07933453470468521,
"skip_count": 2.0,
"step": 398,
"text_loss": 0.2507784366607666
@@ -3798,13 +3798,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.0007980000000000001,
- "loss": 0.0901,
+ "loss": 0.0909,
"macro_f1": 0.3272727429866791,
"num_tokens": 644560.0,
"repeat_count": 1.0,
- "routers_loss": 0.09315784275531769,
+ "routers_loss": 0.10324911028146744,
"skip_count": 0.0,
"step": 400,
"text_loss": 0.7756280303001404
@@ -3817,13 +3817,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0008020000000000001,
- "loss": 0.078,
+ "loss": 0.0783,
"macro_f1": 0.3144654333591461,
"num_tokens": 647393.0,
"repeat_count": 1.0,
- "routers_loss": 0.18492189049720764,
+ "routers_loss": 0.18546262383460999,
"skip_count": 2.0,
"step": 402,
"text_loss": 0.5013328194618225
@@ -3836,13 +3836,13 @@
"f1_execute": 0.8571428656578064,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0008060000000000001,
- "loss": 0.0801,
+ "loss": 0.0787,
"macro_f1": 0.2857142984867096,
"num_tokens": 650355.0,
"repeat_count": 3.0,
- "routers_loss": 0.32641324400901794,
+ "routers_loss": 0.3280293643474579,
"skip_count": 4.0,
"step": 404,
"text_loss": 0.2842077314853668
@@ -3855,13 +3855,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2080078125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.0008100000000000001,
- "loss": 0.0905,
+ "loss": 0.0901,
"macro_f1": 0.3333333432674408,
"num_tokens": 654280.0,
"repeat_count": 0.0,
- "routers_loss": 0.02722037397325039,
+ "routers_loss": 0.02623247355222702,
"skip_count": 0.0,
"step": 406,
"text_loss": 0.46742817759513855
@@ -3874,13 +3874,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.216796875,
"learning_rate": 0.0008139999999999999,
- "loss": 0.0958,
+ "loss": 0.0945,
"macro_f1": 0.3333333432674408,
"num_tokens": 657568.0,
"repeat_count": 0.0,
- "routers_loss": 0.010129833593964577,
+ "routers_loss": 0.009744114242494106,
"skip_count": 0.0,
"step": 408,
"text_loss": 0.7168047428131104
@@ -3893,13 +3893,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2373046875,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.0008179999999999999,
- "loss": 0.1084,
+ "loss": 0.1065,
"macro_f1": 0.32098764181137085,
"num_tokens": 660593.0,
"repeat_count": 0.0,
- "routers_loss": 0.07298308610916138,
+ "routers_loss": 0.07591600716114044,
"skip_count": 2.0,
"step": 410,
"text_loss": 0.449823260307312
@@ -3912,13 +3912,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15625,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0008219999999999999,
- "loss": 0.0802,
+ "loss": 0.0795,
"macro_f1": 0.3333333432674408,
"num_tokens": 663916.0,
"repeat_count": 0.0,
- "routers_loss": 0.024257874116301537,
+ "routers_loss": 0.02076602540910244,
"skip_count": 0.0,
"step": 412,
"text_loss": 0.4764713943004608
@@ -3931,13 +3931,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.000826,
- "loss": 0.0842,
+ "loss": 0.0836,
"macro_f1": 0.3272727429866791,
"num_tokens": 667502.0,
"repeat_count": 0.0,
- "routers_loss": 0.048864223062992096,
+ "routers_loss": 0.049170155078172684,
"skip_count": 1.0,
"step": 414,
"text_loss": 0.30333325266838074
@@ -3950,13 +3950,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1513671875,
"learning_rate": 0.00083,
- "loss": 0.1026,
+ "loss": 0.1021,
"macro_f1": 0.3272727429866791,
"num_tokens": 670510.0,
"repeat_count": 1.0,
- "routers_loss": 0.1592330038547516,
+ "routers_loss": 0.15554003417491913,
"skip_count": 0.0,
"step": 416,
"text_loss": 0.3691870868206024
@@ -3969,13 +3969,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000834,
- "loss": 0.0963,
+ "loss": 0.1013,
"macro_f1": 0.3333333432674408,
"num_tokens": 674761.0,
"repeat_count": 0.0,
- "routers_loss": 0.02291976846754551,
+ "routers_loss": 0.024516675621271133,
"skip_count": 0.0,
"step": 418,
"text_loss": 0.32850381731987
@@ -3988,13 +3988,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.000838,
- "loss": 0.0634,
+ "loss": 0.0649,
"macro_f1": 0.3333333432674408,
"num_tokens": 678055.0,
"repeat_count": 0.0,
- "routers_loss": 0.010272650048136711,
+ "routers_loss": 0.011026890948414803,
"skip_count": 0.0,
"step": 420,
"text_loss": 0.6637290716171265
@@ -4007,13 +4007,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000842,
- "loss": 0.0786,
+ "loss": 0.0771,
"macro_f1": 0.3272727429866791,
"num_tokens": 680979.0,
"repeat_count": 0.0,
- "routers_loss": 0.0692613497376442,
+ "routers_loss": 0.07451887428760529,
"skip_count": 1.0,
"step": 422,
"text_loss": 0.27131685614585876
@@ -4026,13 +4026,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12890625,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.000846,
- "loss": 0.0706,
+ "loss": 0.0714,
"macro_f1": 0.32098764181137085,
"num_tokens": 684144.0,
"repeat_count": 1.0,
- "routers_loss": 0.12713804841041565,
+ "routers_loss": 0.11341800540685654,
"skip_count": 1.0,
"step": 424,
"text_loss": 0.652126669883728
@@ -4045,13 +4045,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.00085,
- "loss": 0.0758,
+ "loss": 0.0754,
"macro_f1": 0.3272727429866791,
"num_tokens": 687004.0,
"repeat_count": 1.0,
- "routers_loss": 0.08670130372047424,
+ "routers_loss": 0.08985847979784012,
"skip_count": 0.0,
"step": 426,
"text_loss": 0.2589428424835205
@@ -4064,13 +4064,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.23828125,
"learning_rate": 0.000854,
- "loss": 0.0857,
+ "loss": 0.0866,
"macro_f1": 0.3333333432674408,
"num_tokens": 689702.0,
"repeat_count": 0.0,
- "routers_loss": 0.01053862925618887,
+ "routers_loss": 0.011355436407029629,
"skip_count": 0.0,
"step": 428,
"text_loss": 0.8909716010093689
@@ -4083,13 +4083,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.000858,
- "loss": 0.0615,
+ "loss": 0.0623,
"macro_f1": 0.3333333432674408,
"num_tokens": 692698.0,
"repeat_count": 0.0,
- "routers_loss": 0.012946994043886662,
+ "routers_loss": 0.013788948766887188,
"skip_count": 0.0,
"step": 430,
"text_loss": 0.19141142070293427
@@ -4102,13 +4102,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.000862,
- "loss": 0.0498,
+ "loss": 0.0499,
"macro_f1": 0.32098764181137085,
"num_tokens": 696007.0,
"repeat_count": 0.0,
- "routers_loss": 0.08222822099924088,
+ "routers_loss": 0.07998392730951309,
"skip_count": 2.0,
"step": 432,
"text_loss": 0.1611809879541397
@@ -4121,13 +4121,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.173828125,
"learning_rate": 0.000866,
- "loss": 0.0532,
+ "loss": 0.0541,
"macro_f1": 0.32098764181137085,
"num_tokens": 700271.0,
"repeat_count": 0.0,
- "routers_loss": 0.07086442410945892,
+ "routers_loss": 0.06988382339477539,
"skip_count": 2.0,
"step": 434,
"text_loss": 0.37254223227500916
@@ -4140,13 +4140,13 @@
"f1_execute": 0.8333333730697632,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.00087,
- "loss": 0.0825,
+ "loss": 0.0834,
"macro_f1": 0.2777777910232544,
"num_tokens": 703519.0,
"repeat_count": 3.0,
- "routers_loss": 0.29007306694984436,
+ "routers_loss": 0.28240787982940674,
"skip_count": 5.0,
"step": 436,
"text_loss": 0.29636648297309875
@@ -4159,13 +4159,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.396484375,
+ "grad_norm": 0.423828125,
"learning_rate": 0.000874,
- "loss": 0.0658,
+ "loss": 0.0657,
"macro_f1": 0.3333333432674408,
"num_tokens": 706826.0,
"repeat_count": 0.0,
- "routers_loss": 0.014652491547167301,
+ "routers_loss": 0.013924967497587204,
"skip_count": 0.0,
"step": 438,
"text_loss": 0.20867908000946045
@@ -4178,13 +4178,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000878,
- "loss": 0.0685,
+ "loss": 0.0657,
"macro_f1": 0.3333333432674408,
"num_tokens": 710530.0,
"repeat_count": 0.0,
- "routers_loss": 0.013720969669520855,
+ "routers_loss": 0.01170142088085413,
"skip_count": 0.0,
"step": 440,
"text_loss": 0.7273373007774353
@@ -4197,13 +4197,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.173828125,
+ "grad_norm": 0.171875,
"learning_rate": 0.000882,
- "loss": 0.0771,
+ "loss": 0.076,
"macro_f1": 0.3333333432674408,
"num_tokens": 713503.0,
"repeat_count": 0.0,
- "routers_loss": 0.011687638238072395,
+ "routers_loss": 0.011930872686207294,
"skip_count": 0.0,
"step": 442,
"text_loss": 0.39314430952072144
@@ -4216,13 +4216,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0008860000000000001,
- "loss": 0.0604,
+ "loss": 0.0592,
"macro_f1": 0.3333333432674408,
"num_tokens": 716582.0,
"repeat_count": 0.0,
- "routers_loss": 0.007869532331824303,
+ "routers_loss": 0.008630385622382164,
"skip_count": 0.0,
"step": 444,
"text_loss": 0.5925271511077881
@@ -4230,18 +4230,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.0939242735544465,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.23046875,
"learning_rate": 0.0008900000000000001,
- "loss": 0.0797,
- "macro_f1": 0.3076923191547394,
+ "loss": 0.0811,
+ "macro_f1": 0.3006536066532135,
"num_tokens": 719941.0,
"repeat_count": 3.0,
- "routers_loss": 0.3034668564796448,
+ "routers_loss": 0.3015584945678711,
"skip_count": 1.0,
"step": 446,
"text_loss": 0.5059905052185059
@@ -4254,13 +4254,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2314453125,
+ "grad_norm": 0.203125,
"learning_rate": 0.000894,
- "loss": 0.0823,
+ "loss": 0.0822,
"macro_f1": 0.31446540355682373,
"num_tokens": 723113.0,
"repeat_count": 1.0,
- "routers_loss": 0.11066079139709473,
+ "routers_loss": 0.10897493362426758,
"skip_count": 1.0,
"step": 448,
"text_loss": 0.19616436958312988
@@ -4273,13 +4273,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.33984375,
"learning_rate": 0.000898,
- "loss": 0.0773,
+ "loss": 0.0782,
"macro_f1": 0.32098764181137085,
"num_tokens": 726193.0,
"repeat_count": 0.0,
- "routers_loss": 0.0755370482802391,
+ "routers_loss": 0.07236456125974655,
"skip_count": 2.0,
"step": 450,
"text_loss": 0.1773054152727127
@@ -4292,13 +4292,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.3203125,
"learning_rate": 0.000902,
- "loss": 0.0596,
+ "loss": 0.058,
"macro_f1": 0.3272727429866791,
"num_tokens": 729275.0,
"repeat_count": 1.0,
- "routers_loss": 0.08470689505338669,
+ "routers_loss": 0.08184371143579483,
"skip_count": 0.0,
"step": 452,
"text_loss": 0.4927310049533844
@@ -4311,13 +4311,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19921875,
+ "grad_norm": 0.1953125,
"learning_rate": 0.000906,
- "loss": 0.0608,
+ "loss": 0.0607,
"macro_f1": 0.3333333432674408,
"num_tokens": 731948.0,
"repeat_count": 0.0,
- "routers_loss": 0.0130238626152277,
+ "routers_loss": 0.014033539220690727,
"skip_count": 0.0,
"step": 454,
"text_loss": 0.4745742678642273
@@ -4330,13 +4330,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.00091,
- "loss": 0.0652,
+ "loss": 0.0651,
"macro_f1": 0.3333333432674408,
"num_tokens": 735351.0,
"repeat_count": 0.0,
- "routers_loss": 0.007108641788363457,
+ "routers_loss": 0.0071774693205952644,
"skip_count": 0.0,
"step": 456,
"text_loss": 0.18523462116718292
@@ -4351,11 +4351,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.400390625,
"learning_rate": 0.0009140000000000001,
- "loss": 0.0746,
+ "loss": 0.0738,
"macro_f1": 0.5492662787437439,
"num_tokens": 738587.0,
"repeat_count": 0.0,
- "routers_loss": 0.06834109872579575,
+ "routers_loss": 0.07781517505645752,
"skip_count": 2.0,
"step": 458,
"text_loss": 0.3459635376930237
@@ -4368,13 +4368,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.28125,
"learning_rate": 0.0009180000000000001,
- "loss": 0.0733,
+ "loss": 0.0723,
"macro_f1": 0.3076923191547394,
"num_tokens": 741779.0,
"repeat_count": 0.0,
- "routers_loss": 0.10230778902769089,
+ "routers_loss": 0.09529037028551102,
"skip_count": 2.0,
"step": 460,
"text_loss": 0.20197433233261108
@@ -4387,13 +4387,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.1865234375,
"learning_rate": 0.0009220000000000001,
- "loss": 0.0528,
+ "loss": 0.0519,
"macro_f1": 0.3333333432674408,
"num_tokens": 745355.0,
"repeat_count": 0.0,
- "routers_loss": 0.009987542405724525,
+ "routers_loss": 0.009765669703483582,
"skip_count": 0.0,
"step": 462,
"text_loss": 0.7031404376029968
@@ -4406,13 +4406,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.125,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009260000000000001,
- "loss": 0.0536,
+ "loss": 0.0527,
"macro_f1": 0.3272727429866791,
"num_tokens": 748628.0,
"repeat_count": 0.0,
- "routers_loss": 0.03448869287967682,
+ "routers_loss": 0.03344850242137909,
"skip_count": 1.0,
"step": 464,
"text_loss": 0.21274663507938385
@@ -4425,13 +4425,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.173828125,
"learning_rate": 0.00093,
- "loss": 0.053,
+ "loss": 0.0534,
"macro_f1": 0.3076923191547394,
"num_tokens": 751472.0,
"repeat_count": 2.0,
- "routers_loss": 0.13631699979305267,
+ "routers_loss": 0.1354292333126068,
"skip_count": 2.0,
"step": 466,
"text_loss": 0.5350717306137085
@@ -4444,13 +4444,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.142578125,
"learning_rate": 0.000934,
- "loss": 0.06,
+ "loss": 0.0598,
"macro_f1": 0.3272727429866791,
"num_tokens": 754479.0,
"repeat_count": 0.0,
- "routers_loss": 0.053951870650053024,
+ "routers_loss": 0.056420840322971344,
"skip_count": 1.0,
"step": 468,
"text_loss": 0.28153330087661743
@@ -4463,13 +4463,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.228515625,
+ "grad_norm": 0.234375,
"learning_rate": 0.0009379999999999999,
- "loss": 0.059,
+ "loss": 0.0597,
"macro_f1": 0.31446540355682373,
"num_tokens": 757872.0,
"repeat_count": 1.0,
- "routers_loss": 0.14479905366897583,
+ "routers_loss": 0.1622387170791626,
"skip_count": 1.0,
"step": 470,
"text_loss": 0.22956843674182892
@@ -4482,13 +4482,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.44140625,
+ "grad_norm": 0.5,
"learning_rate": 0.000942,
- "loss": 0.0913,
+ "loss": 0.0953,
"macro_f1": 0.32098764181137085,
"num_tokens": 760468.0,
"repeat_count": 0.0,
- "routers_loss": 0.056221429258584976,
+ "routers_loss": 0.05146972835063934,
"skip_count": 2.0,
"step": 472,
"text_loss": 0.4513966739177704
@@ -4501,13 +4501,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000946,
- "loss": 0.0591,
+ "loss": 0.0592,
"macro_f1": 0.3272727429866791,
"num_tokens": 763519.0,
"repeat_count": 1.0,
- "routers_loss": 0.09729792177677155,
+ "routers_loss": 0.09022669494152069,
"skip_count": 0.0,
"step": 474,
"text_loss": 0.25758957862854004
@@ -4520,13 +4520,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12158203125,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.00095,
- "loss": 0.0496,
+ "loss": 0.0498,
"macro_f1": 0.3272727429866791,
"num_tokens": 767391.0,
"repeat_count": 0.0,
- "routers_loss": 0.029447713866829872,
+ "routers_loss": 0.03044828027486801,
"skip_count": 1.0,
"step": 476,
"text_loss": 0.21366681158542633
@@ -4539,13 +4539,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.271484375,
+ "grad_norm": 0.291015625,
"learning_rate": 0.000954,
- "loss": 0.0801,
+ "loss": 0.0802,
"macro_f1": 0.3272727429866791,
"num_tokens": 770338.0,
"repeat_count": 0.0,
- "routers_loss": 0.09337342530488968,
+ "routers_loss": 0.10397060960531235,
"skip_count": 1.0,
"step": 478,
"text_loss": 1.0396177768707275
@@ -4560,11 +4560,11 @@
"f1_skip": 0.0,
"grad_norm": 0.267578125,
"learning_rate": 0.000958,
- "loss": 0.1102,
+ "loss": 0.1099,
"macro_f1": 0.285714328289032,
"num_tokens": 773699.0,
"repeat_count": 2.0,
- "routers_loss": 0.23193210363388062,
+ "routers_loss": 0.22604143619537354,
"skip_count": 4.0,
"step": 480,
"text_loss": 0.2570283114910126
@@ -4572,18 +4572,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.2629879659524508,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.146484375,
"learning_rate": 0.000962,
- "loss": 0.0669,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0667,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 777473.0,
"repeat_count": 0.0,
- "routers_loss": 0.046257760375738144,
+ "routers_loss": 0.048258859664201736,
"skip_count": 1.0,
"step": 482,
"text_loss": 0.2540103495121002
@@ -4596,13 +4596,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1708984375,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000966,
- "loss": 0.0552,
+ "loss": 0.0592,
"macro_f1": 0.3333333432674408,
"num_tokens": 780833.0,
"repeat_count": 0.0,
- "routers_loss": 0.01683143898844719,
+ "routers_loss": 0.023018671199679375,
"skip_count": 0.0,
"step": 484,
"text_loss": 0.38524550199508667
@@ -4615,13 +4615,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.326171875,
+ "grad_norm": 0.314453125,
"learning_rate": 0.0009699999999999999,
- "loss": 0.071,
+ "loss": 0.0709,
"macro_f1": 0.3272727429866791,
"num_tokens": 783656.0,
"repeat_count": 0.0,
- "routers_loss": 0.04129387438297272,
+ "routers_loss": 0.044845327734947205,
"skip_count": 1.0,
"step": 486,
"text_loss": 0.5859048366546631
@@ -4634,13 +4634,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000974,
- "loss": 0.0605,
+ "loss": 0.0615,
"macro_f1": 0.3333333432674408,
"num_tokens": 787173.0,
"repeat_count": 0.0,
- "routers_loss": 0.01262948103249073,
+ "routers_loss": 0.010898692533373833,
"skip_count": 0.0,
"step": 488,
"text_loss": 0.3456067442893982
@@ -4653,13 +4653,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000978,
- "loss": 0.081,
+ "loss": 0.0796,
"macro_f1": 0.32098764181137085,
"num_tokens": 790395.0,
"repeat_count": 0.0,
- "routers_loss": 0.07404553890228271,
+ "routers_loss": 0.06497956812381744,
"skip_count": 2.0,
"step": 490,
"text_loss": 0.3751123249530792
@@ -4672,13 +4672,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000982,
- "loss": 0.0751,
+ "loss": 0.0772,
"macro_f1": 0.3272727429866791,
"num_tokens": 793137.0,
"repeat_count": 0.0,
- "routers_loss": 0.06795930862426758,
+ "routers_loss": 0.07763728499412537,
"skip_count": 1.0,
"step": 492,
"text_loss": 0.43296709656715393
@@ -4691,13 +4691,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1416015625,
"learning_rate": 0.0009860000000000001,
- "loss": 0.0804,
+ "loss": 0.0819,
"macro_f1": 0.3333333432674408,
"num_tokens": 796497.0,
"repeat_count": 0.0,
- "routers_loss": 0.02233024686574936,
+ "routers_loss": 0.02127906307578087,
"skip_count": 0.0,
"step": 494,
"text_loss": 0.4841311275959015
@@ -4710,13 +4710,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.00099,
- "loss": 0.0731,
+ "loss": 0.073,
"macro_f1": 0.3272727429866791,
"num_tokens": 799361.0,
"repeat_count": 1.0,
- "routers_loss": 0.07979031652212143,
+ "routers_loss": 0.09518691152334213,
"skip_count": 0.0,
"step": 496,
"text_loss": 0.5094487071037292
@@ -4729,13 +4729,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.000994,
- "loss": 0.0795,
+ "loss": 0.0789,
"macro_f1": 0.5492662787437439,
"num_tokens": 802629.0,
"repeat_count": 0.0,
- "routers_loss": 0.045646365731954575,
+ "routers_loss": 0.0563947930932045,
"skip_count": 2.0,
"step": 498,
"text_loss": 0.42783617973327637
@@ -4748,13 +4748,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.1865234375,
"learning_rate": 0.000998,
"loss": 0.0476,
"macro_f1": 0.3272727429866791,
"num_tokens": 805881.0,
"repeat_count": 1.0,
- "routers_loss": 0.09717849642038345,
+ "routers_loss": 0.10570426285266876,
"skip_count": 0.0,
"step": 500,
"text_loss": 0.28395503759384155
@@ -4767,13 +4767,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.30078125,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009999999760498814,
- "loss": 0.0894,
+ "loss": 0.0849,
"macro_f1": 0.5492662787437439,
"num_tokens": 809283.0,
"repeat_count": 0.0,
- "routers_loss": 0.03948225453495979,
+ "routers_loss": 0.031202208250761032,
"skip_count": 2.0,
"step": 502,
"text_loss": 0.32970911264419556
@@ -4786,13 +4786,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15625,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009999997844489475,
- "loss": 0.0557,
+ "loss": 0.0574,
"macro_f1": 0.3272727429866791,
"num_tokens": 812440.0,
"repeat_count": 0.0,
- "routers_loss": 0.0742638111114502,
+ "routers_loss": 0.07647835463285446,
"skip_count": 1.0,
"step": 504,
"text_loss": 0.4901447296142578
@@ -4805,13 +4805,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.25,
"learning_rate": 0.000999999401247153,
- "loss": 0.0682,
+ "loss": 0.0668,
"macro_f1": 0.32098764181137085,
"num_tokens": 815716.0,
"repeat_count": 0.0,
- "routers_loss": 0.08293049037456512,
+ "routers_loss": 0.08515176922082901,
"skip_count": 2.0,
"step": 506,
"text_loss": 0.6157599687576294
@@ -4824,13 +4824,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.25390625,
"learning_rate": 0.0009999988264446445,
- "loss": 0.0697,
+ "loss": 0.0686,
"macro_f1": 0.3333333432674408,
"num_tokens": 819086.0,
"repeat_count": 0.0,
- "routers_loss": 0.010080376639962196,
+ "routers_loss": 0.00946938619017601,
"skip_count": 0.0,
"step": 508,
"text_loss": 0.5053519010543823
@@ -4843,13 +4843,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009999980600416424,
- "loss": 0.0611,
+ "loss": 0.0574,
"macro_f1": 0.3333333432674408,
"num_tokens": 822268.0,
"repeat_count": 0.0,
- "routers_loss": 0.009179878048598766,
+ "routers_loss": 0.01058756373822689,
"skip_count": 0.0,
"step": 510,
"text_loss": 0.5570021867752075
@@ -4862,13 +4862,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11083984375,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.000999997102038441,
- "loss": 0.0689,
+ "loss": 0.0678,
"macro_f1": 0.3333333432674408,
"num_tokens": 825728.0,
"repeat_count": 0.0,
- "routers_loss": 0.006718529388308525,
+ "routers_loss": 0.008705209009349346,
"skip_count": 0.0,
"step": 512,
"text_loss": 0.6519040465354919
@@ -4881,13 +4881,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.220703125,
"learning_rate": 0.0009999959524354064,
- "loss": 0.0826,
+ "loss": 0.083,
"macro_f1": 0.3272727429866791,
"num_tokens": 829459.0,
"repeat_count": 0.0,
- "routers_loss": 0.049344487488269806,
+ "routers_loss": 0.04024193435907364,
"skip_count": 1.0,
"step": 514,
"text_loss": 0.5290043950080872
@@ -4900,13 +4900,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.259765625,
+ "grad_norm": 0.25390625,
"learning_rate": 0.00099999461123298,
- "loss": 0.0739,
+ "loss": 0.0727,
"macro_f1": 0.3333333432674408,
"num_tokens": 832291.0,
"repeat_count": 0.0,
- "routers_loss": 0.013402626849710941,
+ "routers_loss": 0.015742862597107887,
"skip_count": 0.0,
"step": 516,
"text_loss": 0.7910057902336121
@@ -4919,13 +4919,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.000999993078431675,
- "loss": 0.0761,
+ "loss": 0.0759,
"macro_f1": 0.3076923191547394,
"num_tokens": 835399.0,
"repeat_count": 1.0,
- "routers_loss": 0.16964484751224518,
+ "routers_loss": 0.16753782331943512,
"skip_count": 3.0,
"step": 518,
"text_loss": 0.45196083188056946
@@ -4938,13 +4938,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.236328125,
"learning_rate": 0.0009999913540320792,
- "loss": 0.095,
+ "loss": 0.0968,
"macro_f1": 0.31446540355682373,
"num_tokens": 838993.0,
"repeat_count": 0.0,
- "routers_loss": 0.08609295636415482,
+ "routers_loss": 0.09357143193483353,
"skip_count": 2.0,
"step": 520,
"text_loss": 0.5499435663223267
@@ -4957,13 +4957,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.2392578125,
+ "grad_norm": 0.2451171875,
"learning_rate": 0.0009999894380348536,
- "loss": 0.0816,
+ "loss": 0.0821,
"macro_f1": 0.5492662787437439,
"num_tokens": 842652.0,
"repeat_count": 0.0,
- "routers_loss": 0.05354784056544304,
+ "routers_loss": 0.056803856045007706,
"skip_count": 2.0,
"step": 522,
"text_loss": 0.197520449757576
@@ -4976,13 +4976,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.2333984375,
"learning_rate": 0.000999987330440732,
- "loss": 0.0715,
+ "loss": 0.0725,
"macro_f1": 0.4871794879436493,
"num_tokens": 847061.0,
"repeat_count": 0.0,
- "routers_loss": 0.09146631509065628,
+ "routers_loss": 0.08962195366621017,
"skip_count": 3.0,
"step": 524,
"text_loss": 0.27509039640426636
@@ -4995,13 +4995,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.189453125,
"learning_rate": 0.000999985031250522,
- "loss": 0.0574,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 850780.0,
"repeat_count": 0.0,
- "routers_loss": 0.02344255894422531,
+ "routers_loss": 0.022930558770895004,
"skip_count": 0.0,
"step": 526,
"text_loss": 0.13291706144809723
@@ -5014,13 +5014,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.197265625,
"learning_rate": 0.0009999825404651053,
- "loss": 0.0621,
+ "loss": 0.0614,
"macro_f1": 0.3333333432674408,
"num_tokens": 853886.0,
"repeat_count": 0.0,
- "routers_loss": 0.018271517008543015,
+ "routers_loss": 0.017097990959882736,
"skip_count": 0.0,
"step": 528,
"text_loss": 0.21706295013427734
@@ -5033,13 +5033,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2060546875,
+ "grad_norm": 0.212890625,
"learning_rate": 0.0009999798580854356,
- "loss": 0.0717,
+ "loss": 0.0724,
"macro_f1": 0.3333333432674408,
"num_tokens": 857364.0,
"repeat_count": 0.0,
- "routers_loss": 0.026990914717316628,
+ "routers_loss": 0.02831801027059555,
"skip_count": 0.0,
"step": 530,
"text_loss": 0.9035662412643433
@@ -5052,13 +5052,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16015625,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.000999976984112541,
- "loss": 0.0681,
+ "loss": 0.0674,
"macro_f1": 0.3333333432674408,
"num_tokens": 860661.0,
"repeat_count": 0.0,
- "routers_loss": 0.019737249240279198,
+ "routers_loss": 0.019671892747282982,
"skip_count": 0.0,
"step": 532,
"text_loss": 0.8354863524436951
@@ -5071,13 +5071,13 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.2890625,
"learning_rate": 0.0009999739185475231,
- "loss": 0.0978,
+ "loss": 0.0963,
"macro_f1": 0.47333335876464844,
"num_tokens": 864124.0,
"repeat_count": 2.0,
- "routers_loss": 0.212640181183815,
+ "routers_loss": 0.21383361518383026,
"skip_count": 3.0,
"step": 534,
"text_loss": 0.23422949016094208
@@ -5090,13 +5090,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0009999706613915565,
- "loss": 0.0602,
+ "loss": 0.0598,
"macro_f1": 0.32098767161369324,
"num_tokens": 866976.0,
"repeat_count": 0.0,
- "routers_loss": 0.07302755117416382,
+ "routers_loss": 0.07158871740102768,
"skip_count": 1.0,
"step": 536,
"text_loss": 0.11800774186849594
@@ -5109,13 +5109,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.296875,
+ "grad_norm": 0.26953125,
"learning_rate": 0.0009999672126458894,
- "loss": 0.0825,
+ "loss": 0.0822,
"macro_f1": 0.3272727429866791,
"num_tokens": 870549.0,
"repeat_count": 0.0,
- "routers_loss": 0.08667246252298355,
+ "routers_loss": 0.08185924589633942,
"skip_count": 1.0,
"step": 538,
"text_loss": 0.19232480227947235
@@ -5128,13 +5128,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1318359375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.000999963572311843,
- "loss": 0.0597,
+ "loss": 0.0604,
"macro_f1": 0.3333333432674408,
"num_tokens": 873733.0,
"repeat_count": 0.0,
- "routers_loss": 0.015047167427837849,
+ "routers_loss": 0.01633382774889469,
"skip_count": 0.0,
"step": 540,
"text_loss": 0.3725031912326813
@@ -5147,13 +5147,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009999597403908128,
- "loss": 0.076,
+ "loss": 0.0761,
"macro_f1": 0.3272727429866791,
"num_tokens": 877099.0,
"repeat_count": 0.0,
- "routers_loss": 0.07481446117162704,
+ "routers_loss": 0.0782657191157341,
"skip_count": 1.0,
"step": 542,
"text_loss": 0.17589199542999268
@@ -5166,13 +5166,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.0009999557168842669,
- "loss": 0.0724,
+ "loss": 0.0716,
"macro_f1": 0.5492662787437439,
"num_tokens": 879883.0,
"repeat_count": 0.0,
- "routers_loss": 0.049495212733745575,
+ "routers_loss": 0.05275818333029747,
"skip_count": 2.0,
"step": 544,
"text_loss": 0.26448264718055725
@@ -5185,13 +5185,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0009999515017937468,
- "loss": 0.0718,
+ "loss": 0.071,
"macro_f1": 0.32098764181137085,
"num_tokens": 882223.0,
"repeat_count": 0.0,
- "routers_loss": 0.08043002337217331,
+ "routers_loss": 0.09335892647504807,
"skip_count": 2.0,
"step": 546,
"text_loss": 0.208544060587883
@@ -5204,13 +5204,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.34765625,
+ "grad_norm": 0.376953125,
"learning_rate": 0.0009999470951208684,
- "loss": 0.086,
+ "loss": 0.0855,
"macro_f1": 0.32098764181137085,
"num_tokens": 885241.0,
"repeat_count": 2.0,
- "routers_loss": 0.22461950778961182,
+ "routers_loss": 0.22983254492282867,
"skip_count": 0.0,
"step": 548,
"text_loss": 0.6612338423728943
@@ -5223,13 +5223,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.216796875,
"learning_rate": 0.00099994249686732,
- "loss": 0.0798,
+ "loss": 0.0786,
"macro_f1": 0.3272727429866791,
"num_tokens": 887897.0,
"repeat_count": 1.0,
- "routers_loss": 0.11754962801933289,
+ "routers_loss": 0.12858282029628754,
"skip_count": 0.0,
"step": 550,
"text_loss": 0.4673548936843872
@@ -5242,13 +5242,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0009999377070348638,
- "loss": 0.0978,
+ "loss": 0.0944,
"macro_f1": 0.3333333432674408,
"num_tokens": 891224.0,
"repeat_count": 0.0,
- "routers_loss": 0.017412789165973663,
+ "routers_loss": 0.017421770840883255,
"skip_count": 0.0,
"step": 552,
"text_loss": 0.6419258117675781
@@ -5261,13 +5261,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.15625,
"learning_rate": 0.000999932725625335,
- "loss": 0.0792,
+ "loss": 0.0791,
"macro_f1": 0.32098764181137085,
"num_tokens": 894578.0,
"repeat_count": 0.0,
- "routers_loss": 0.08969525247812271,
+ "routers_loss": 0.07890026271343231,
"skip_count": 2.0,
"step": 554,
"text_loss": 0.5970752239227295
@@ -5280,13 +5280,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "grad_norm": 0.216796875,
"learning_rate": 0.0009999275526406427,
- "loss": 0.0803,
+ "loss": 0.0796,
"macro_f1": 0.31446540355682373,
"num_tokens": 897145.0,
"repeat_count": 1.0,
- "routers_loss": 0.09876437485218048,
+ "routers_loss": 0.09836960583925247,
"skip_count": 1.0,
"step": 556,
"text_loss": 0.752425491809845
@@ -5299,13 +5299,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1875,
"learning_rate": 0.0009999221880827693,
- "loss": 0.0887,
+ "loss": 0.0882,
"macro_f1": 0.3333333432674408,
"num_tokens": 900565.0,
"repeat_count": 0.0,
- "routers_loss": 0.019108204171061516,
+ "routers_loss": 0.017694659531116486,
"skip_count": 0.0,
"step": 558,
"text_loss": 0.195619136095047
@@ -5318,32 +5318,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2021484375,
"learning_rate": 0.0009999166319537703,
- "loss": 0.0573,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 903506.0,
"repeat_count": 0.0,
- "routers_loss": 0.019048813730478287,
+ "routers_loss": 0.019375264644622803,
"skip_count": 0.0,
"step": 560,
"text_loss": 0.4603337347507477
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 2.638685060170238,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1435546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.146484375,
"learning_rate": 0.0009999108842557748,
- "loss": 0.0947,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0953,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 906380.0,
"repeat_count": 0.0,
- "routers_loss": 0.11889495700597763,
+ "routers_loss": 0.12013207376003265,
"skip_count": 3.0,
"step": 562,
"text_loss": 0.6279402375221252
@@ -5356,13 +5356,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.228515625,
+ "grad_norm": 0.255859375,
"learning_rate": 0.0009999049449909854,
- "loss": 0.0771,
+ "loss": 0.0799,
"macro_f1": 0.3272727429866791,
"num_tokens": 909116.0,
"repeat_count": 0.0,
- "routers_loss": 0.06202332302927971,
+ "routers_loss": 0.06441342830657959,
"skip_count": 1.0,
"step": 564,
"text_loss": 0.23741699755191803
@@ -5375,13 +5375,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009998988141616781,
- "loss": 0.0623,
+ "loss": 0.064,
"macro_f1": 0.32098767161369324,
"num_tokens": 912189.0,
"repeat_count": 0.0,
- "routers_loss": 0.08294244855642319,
+ "routers_loss": 0.08309414982795715,
"skip_count": 1.0,
"step": 566,
"text_loss": 0.27780941128730774
@@ -5394,13 +5394,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009998924917702023,
- "loss": 0.0885,
+ "loss": 0.0876,
"macro_f1": 0.3272727429866791,
"num_tokens": 916279.0,
"repeat_count": 1.0,
- "routers_loss": 0.07545182853937149,
+ "routers_loss": 0.07197169959545135,
"skip_count": 0.0,
"step": 568,
"text_loss": 0.6371755599975586
@@ -5413,13 +5413,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2255859375,
"learning_rate": 0.0009998859778189806,
- "loss": 0.0712,
+ "loss": 0.0706,
"macro_f1": 0.3333333432674408,
"num_tokens": 919490.0,
"repeat_count": 0.0,
- "routers_loss": 0.008711219765245914,
+ "routers_loss": 0.008022273890674114,
"skip_count": 0.0,
"step": 570,
"text_loss": 0.6028938889503479
@@ -5432,13 +5432,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.000999879272310509,
- "loss": 0.0837,
+ "loss": 0.084,
"macro_f1": 0.3333333432674408,
"num_tokens": 923694.0,
"repeat_count": 0.0,
- "routers_loss": 0.01639273390173912,
+ "routers_loss": 0.01634674146771431,
"skip_count": 0.0,
"step": 572,
"text_loss": 0.7177054286003113
@@ -5451,13 +5451,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.17578125,
"learning_rate": 0.0009998723752473574,
- "loss": 0.0707,
+ "loss": 0.0716,
"macro_f1": 0.3272727429866791,
"num_tokens": 926933.0,
"repeat_count": 0.0,
- "routers_loss": 0.04997137933969498,
+ "routers_loss": 0.060559045523405075,
"skip_count": 1.0,
"step": 574,
"text_loss": 0.5203254818916321
@@ -5470,13 +5470,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1845703125,
+ "grad_norm": 0.185546875,
"learning_rate": 0.0009998652866321687,
- "loss": 0.0799,
+ "loss": 0.0801,
"macro_f1": 0.3333333432674408,
"num_tokens": 929832.0,
"repeat_count": 0.0,
- "routers_loss": 0.011360209435224533,
+ "routers_loss": 0.011485611088573933,
"skip_count": 0.0,
"step": 576,
"text_loss": 0.6147452592849731
@@ -5489,13 +5489,13 @@
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.000999858006467659,
- "loss": 0.0658,
+ "loss": 0.0649,
"macro_f1": 0.29333335161209106,
"num_tokens": 933266.0,
"repeat_count": 2.0,
- "routers_loss": 0.31349560618400574,
+ "routers_loss": 0.2929030954837799,
"skip_count": 4.0,
"step": 578,
"text_loss": 0.1720666140317917
@@ -5508,13 +5508,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.24609375,
"learning_rate": 0.0009998505347566186,
- "loss": 0.0801,
+ "loss": 0.0782,
"macro_f1": 0.32098764181137085,
"num_tokens": 937545.0,
"repeat_count": 0.0,
- "routers_loss": 0.058660347014665604,
+ "routers_loss": 0.053780000656843185,
"skip_count": 2.0,
"step": 580,
"text_loss": 0.3258405327796936
@@ -5527,13 +5527,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1416015625,
"learning_rate": 0.00099984287150191,
- "loss": 0.0578,
+ "loss": 0.0582,
"macro_f1": 0.3333333432674408,
"num_tokens": 941001.0,
"repeat_count": 0.0,
- "routers_loss": 0.025836754590272903,
+ "routers_loss": 0.02637636847794056,
"skip_count": 0.0,
"step": 582,
"text_loss": 0.23762771487236023
@@ -5546,13 +5546,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009998350167064705,
- "loss": 0.0683,
+ "loss": 0.0672,
"macro_f1": 0.3333333432674408,
"num_tokens": 943989.0,
"repeat_count": 0.0,
- "routers_loss": 0.016504868865013123,
+ "routers_loss": 0.01637580618262291,
"skip_count": 0.0,
"step": 584,
"text_loss": 0.7460582852363586
@@ -5565,13 +5565,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1787109375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009998269703733096,
- "loss": 0.0685,
+ "loss": 0.0686,
"macro_f1": 0.3272727429866791,
"num_tokens": 947245.0,
"repeat_count": 1.0,
- "routers_loss": 0.1379794180393219,
+ "routers_loss": 0.13934117555618286,
"skip_count": 0.0,
"step": 586,
"text_loss": 0.5284690260887146
@@ -5584,13 +5584,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.13671875,
"learning_rate": 0.0009998187325055106,
- "loss": 0.0657,
+ "loss": 0.0667,
"macro_f1": 0.3333333432674408,
"num_tokens": 950116.0,
"repeat_count": 0.0,
- "routers_loss": 0.01802757754921913,
+ "routers_loss": 0.02138397842645645,
"skip_count": 0.0,
"step": 588,
"text_loss": 0.3920256197452545
@@ -5603,13 +5603,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009998103031062305,
- "loss": 0.0762,
+ "loss": 0.0778,
"macro_f1": 0.3333333432674408,
"num_tokens": 953277.0,
"repeat_count": 0.0,
- "routers_loss": 0.006902900990098715,
+ "routers_loss": 0.007098200265318155,
"skip_count": 0.0,
"step": 590,
"text_loss": 0.7472905516624451
@@ -5622,13 +5622,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.318359375,
"learning_rate": 0.0009998016821786994,
- "loss": 0.0912,
+ "loss": 0.0872,
"macro_f1": 0.32098764181137085,
"num_tokens": 958229.0,
"repeat_count": 1.0,
- "routers_loss": 0.08348741382360458,
+ "routers_loss": 0.07946522533893585,
"skip_count": 1.0,
"step": 592,
"text_loss": 0.5506448745727539
@@ -5641,13 +5641,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.000999792869726221,
- "loss": 0.0527,
+ "loss": 0.0523,
"macro_f1": 0.3272727429866791,
"num_tokens": 961016.0,
"repeat_count": 0.0,
- "routers_loss": 0.08290062099695206,
+ "routers_loss": 0.0850791186094284,
"skip_count": 1.0,
"step": 594,
"text_loss": 0.3824431002140045
@@ -5660,13 +5660,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009997838657521717,
- "loss": 0.0643,
+ "loss": 0.0632,
"macro_f1": 0.3333333432674408,
"num_tokens": 963847.0,
"repeat_count": 0.0,
- "routers_loss": 0.018620988354086876,
+ "routers_loss": 0.016370445489883423,
"skip_count": 0.0,
"step": 596,
"text_loss": 0.2139475792646408
@@ -5679,13 +5679,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009997746702600026,
- "loss": 0.073,
+ "loss": 0.0702,
"macro_f1": 0.307692289352417,
"num_tokens": 966619.0,
"repeat_count": 0.0,
- "routers_loss": 0.1211671382188797,
+ "routers_loss": 0.1310746818780899,
"skip_count": 3.0,
"step": 598,
"text_loss": 0.3651018440723419
@@ -5698,13 +5698,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.23828125,
"learning_rate": 0.0009997652832532372,
- "loss": 0.079,
+ "loss": 0.0792,
"macro_f1": 0.3272727429866791,
"num_tokens": 970418.0,
"repeat_count": 1.0,
- "routers_loss": 0.15485027432441711,
+ "routers_loss": 0.14303378760814667,
"skip_count": 0.0,
"step": 600,
"text_loss": 0.7094736099243164
@@ -5717,13 +5717,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009997557047354722,
- "loss": 0.0562,
+ "loss": 0.0531,
"macro_f1": 0.3272727429866791,
"num_tokens": 973491.0,
"repeat_count": 0.0,
- "routers_loss": 0.036684274673461914,
+ "routers_loss": 0.03334212675690651,
"skip_count": 1.0,
"step": 602,
"text_loss": 0.4812237024307251
@@ -5731,18 +5731,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.835926034634576,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.302734375,
+ "grad_norm": 0.2890625,
"learning_rate": 0.0009997459347103783,
- "loss": 0.0985,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0956,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 976672.0,
"repeat_count": 0.0,
- "routers_loss": 0.026901578530669212,
+ "routers_loss": 0.02831871062517166,
"skip_count": 0.0,
"step": 604,
"text_loss": 0.21737146377563477
@@ -5755,13 +5755,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12158203125,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009997359731816998,
- "loss": 0.0632,
+ "loss": 0.0646,
"macro_f1": 0.3333333432674408,
"num_tokens": 979898.0,
"repeat_count": 0.0,
- "routers_loss": 0.01700405217707157,
+ "routers_loss": 0.017968013882637024,
"skip_count": 0.0,
"step": 606,
"text_loss": 0.5458008050918579
@@ -5774,13 +5774,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2099609375,
+ "grad_norm": 0.224609375,
"learning_rate": 0.0009997258201532536,
- "loss": 0.0758,
+ "loss": 0.0751,
"macro_f1": 0.3333333432674408,
"num_tokens": 982811.0,
"repeat_count": 0.0,
- "routers_loss": 0.015013590455055237,
+ "routers_loss": 0.016256732866168022,
"skip_count": 0.0,
"step": 608,
"text_loss": 0.8643257021903992
@@ -5793,13 +5793,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009997154756289303,
- "loss": 0.0576,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 985245.0,
"repeat_count": 0.0,
- "routers_loss": 0.02037946693599224,
+ "routers_loss": 0.021214161068201065,
"skip_count": 0.0,
"step": 610,
"text_loss": 0.2204967886209488
@@ -5812,13 +5812,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.150390625,
"learning_rate": 0.000999704939612694,
- "loss": 0.0648,
+ "loss": 0.0636,
"macro_f1": 0.3006536364555359,
"num_tokens": 988539.0,
"repeat_count": 3.0,
- "routers_loss": 0.22834022343158722,
+ "routers_loss": 0.23249399662017822,
"skip_count": 2.0,
"step": 612,
"text_loss": 0.32489025592803955
@@ -5831,13 +5831,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009996942121085824,
- "loss": 0.0449,
+ "loss": 0.0445,
"macro_f1": 0.3333333432674408,
"num_tokens": 991660.0,
"repeat_count": 0.0,
- "routers_loss": 0.009838113561272621,
+ "routers_loss": 0.010706410743296146,
"skip_count": 0.0,
"step": 614,
"text_loss": 0.4551754891872406
@@ -5850,13 +5850,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.3671875,
"learning_rate": 0.000999683293120706,
- "loss": 0.1009,
+ "loss": 0.1016,
"macro_f1": 0.3333333432674408,
"num_tokens": 994828.0,
"repeat_count": 0.0,
- "routers_loss": 0.005943270865827799,
+ "routers_loss": 0.006676184479147196,
"skip_count": 0.0,
"step": 616,
"text_loss": 0.6212068200111389
@@ -5869,13 +5869,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.38671875,
+ "grad_norm": 0.408203125,
"learning_rate": 0.0009996721826532491,
- "loss": 0.0941,
+ "loss": 0.0976,
"macro_f1": 0.3076923191547394,
"num_tokens": 997951.0,
"repeat_count": 2.0,
- "routers_loss": 0.21597740054130554,
+ "routers_loss": 0.2148125320672989,
"skip_count": 2.0,
"step": 618,
"text_loss": 0.26514527201652527
@@ -5888,13 +5888,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.1904296875,
"learning_rate": 0.000999660880710469,
- "loss": 0.0896,
+ "loss": 0.0909,
"macro_f1": 0.3333333432674408,
"num_tokens": 1001139.0,
"repeat_count": 0.0,
- "routers_loss": 0.023726588115096092,
+ "routers_loss": 0.022332455962896347,
"skip_count": 0.0,
"step": 620,
"text_loss": 0.26131340861320496
@@ -5907,13 +5907,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009996493872966971,
"loss": 0.0732,
"macro_f1": 0.3272727429866791,
"num_tokens": 1003678.0,
"repeat_count": 1.0,
- "routers_loss": 0.08467255532741547,
+ "routers_loss": 0.08348730951547623,
"skip_count": 0.0,
"step": 622,
"text_loss": 0.19151706993579865
@@ -5926,13 +5926,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009996377024163374,
- "loss": 0.0816,
+ "loss": 0.0822,
"macro_f1": 0.3333333432674408,
"num_tokens": 1007082.0,
"repeat_count": 0.0,
- "routers_loss": 0.029468854889273643,
+ "routers_loss": 0.028577150776982307,
"skip_count": 0.0,
"step": 624,
"text_loss": 0.305387407541275
@@ -5945,13 +5945,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.11279296875,
"learning_rate": 0.0009996258260738676,
- "loss": 0.0891,
+ "loss": 0.0892,
"macro_f1": 0.3272727429866791,
"num_tokens": 1010064.0,
"repeat_count": 1.0,
- "routers_loss": 0.09438466280698776,
+ "routers_loss": 0.08312026411294937,
"skip_count": 0.0,
"step": 626,
"text_loss": 0.49436143040657043
@@ -5964,13 +5964,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009996137582738388,
- "loss": 0.0581,
+ "loss": 0.0591,
"macro_f1": 0.3333333432674408,
"num_tokens": 1013462.0,
"repeat_count": 0.0,
- "routers_loss": 0.013679586350917816,
+ "routers_loss": 0.013337327167391777,
"skip_count": 0.0,
"step": 628,
"text_loss": 0.6515294313430786
@@ -5983,13 +5983,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.140625,
"learning_rate": 0.000999601499020875,
- "loss": 0.0528,
+ "loss": 0.0537,
"macro_f1": 0.3333333432674408,
"num_tokens": 1016246.0,
"repeat_count": 0.0,
- "routers_loss": 0.029532987624406815,
+ "routers_loss": 0.029126765206456184,
"skip_count": 0.0,
"step": 630,
"text_loss": 0.18834827840328217
@@ -6002,13 +6002,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009995890483196746,
- "loss": 0.0601,
+ "loss": 0.0602,
"macro_f1": 0.3272727429866791,
"num_tokens": 1019286.0,
"repeat_count": 0.0,
- "routers_loss": 0.05516733601689339,
+ "routers_loss": 0.054844800382852554,
"skip_count": 1.0,
"step": 632,
"text_loss": 0.6988179087638855
@@ -6021,13 +6021,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.357421875,
+ "grad_norm": 0.322265625,
"learning_rate": 0.0009995764061750086,
- "loss": 0.0785,
+ "loss": 0.0767,
"macro_f1": 0.3333333432674408,
"num_tokens": 1022207.0,
"repeat_count": 0.0,
- "routers_loss": 0.010254866443574429,
+ "routers_loss": 0.010095693171024323,
"skip_count": 0.0,
"step": 634,
"text_loss": 0.558451771736145
@@ -6040,13 +6040,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.2890625,
"learning_rate": 0.000999563572591721,
- "loss": 0.0518,
+ "loss": 0.0521,
"macro_f1": 0.32098764181137085,
"num_tokens": 1025319.0,
"repeat_count": 1.0,
- "routers_loss": 0.07528360933065414,
+ "routers_loss": 0.0698433518409729,
"skip_count": 1.0,
"step": 636,
"text_loss": 0.5961872935295105
@@ -6059,13 +6059,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1064453125,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009995505475747302,
- "loss": 0.0844,
+ "loss": 0.0849,
"macro_f1": 0.3272727429866791,
"num_tokens": 1028362.0,
"repeat_count": 0.0,
- "routers_loss": 0.04301584139466286,
+ "routers_loss": 0.040211405605077744,
"skip_count": 1.0,
"step": 638,
"text_loss": 0.546863317489624
@@ -6078,13 +6078,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11572265625,
+ "grad_norm": 0.119140625,
"learning_rate": 0.0009995373311290272,
- "loss": 0.0699,
+ "loss": 0.0709,
"macro_f1": 0.3144654333591461,
"num_tokens": 1032199.0,
"repeat_count": 2.0,
- "routers_loss": 0.14521080255508423,
+ "routers_loss": 0.1457643061876297,
"skip_count": 1.0,
"step": 640,
"text_loss": 0.2137298285961151
@@ -6097,13 +6097,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1328125,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.0009995239232596764,
- "loss": 0.0543,
+ "loss": 0.0545,
"macro_f1": 0.3333333432674408,
"num_tokens": 1035801.0,
"repeat_count": 0.0,
- "routers_loss": 0.01074797473847866,
+ "routers_loss": 0.011394930072128773,
"skip_count": 0.0,
"step": 642,
"text_loss": 0.43054503202438354
@@ -6116,13 +6116,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009995103239718163,
- "loss": 0.0659,
+ "loss": 0.0665,
"macro_f1": 0.3333333432674408,
"num_tokens": 1039223.0,
"repeat_count": 0.0,
- "routers_loss": 0.009271817281842232,
+ "routers_loss": 0.00997432041913271,
"skip_count": 0.0,
"step": 644,
"text_loss": 0.7749615907669067
@@ -6135,13 +6135,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009994965332706573,
- "loss": 0.0737,
+ "loss": 0.0755,
"macro_f1": 0.3144654333591461,
"num_tokens": 1042154.0,
"repeat_count": 3.0,
- "routers_loss": 0.10257050395011902,
+ "routers_loss": 0.10589150339365005,
"skip_count": 0.0,
"step": 646,
"text_loss": 0.7812211513519287
@@ -6154,13 +6154,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.0009994825511614846,
- "loss": 0.0363,
+ "loss": 0.0383,
"macro_f1": 0.3272727429866791,
"num_tokens": 1045250.0,
"repeat_count": 0.0,
- "routers_loss": 0.07091924548149109,
+ "routers_loss": 0.0748734176158905,
"skip_count": 1.0,
"step": 648,
"text_loss": 0.844803512096405
@@ -6173,13 +6173,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11572265625,
+ "grad_norm": 0.1220703125,
"learning_rate": 0.0009994683776496562,
- "loss": 0.0421,
+ "loss": 0.0433,
"macro_f1": 0.3272727429866791,
"num_tokens": 1048446.0,
"repeat_count": 0.0,
- "routers_loss": 0.034446243196725845,
+ "routers_loss": 0.03742415830492973,
"skip_count": 1.0,
"step": 650,
"text_loss": 0.2098839282989502
@@ -6192,13 +6192,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009994540127406034,
- "loss": 0.0593,
+ "loss": 0.0591,
"macro_f1": 0.32098764181137085,
"num_tokens": 1051840.0,
"repeat_count": 0.0,
- "routers_loss": 0.06077485531568527,
+ "routers_loss": 0.06025516986846924,
"skip_count": 2.0,
"step": 652,
"text_loss": 0.27727583050727844
@@ -6211,13 +6211,13 @@
"f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.181640625,
"learning_rate": 0.0009994394564398306,
- "loss": 0.0537,
+ "loss": 0.0519,
"macro_f1": 0.521541953086853,
"num_tokens": 1055142.0,
"repeat_count": 4.0,
- "routers_loss": 0.2382282167673111,
+ "routers_loss": 0.22807340323925018,
"skip_count": 2.0,
"step": 654,
"text_loss": 0.9672397971153259
@@ -6230,13 +6230,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.142578125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009994247087529158,
- "loss": 0.0613,
+ "loss": 0.0618,
"macro_f1": 0.3333333432674408,
"num_tokens": 1057698.0,
"repeat_count": 0.0,
- "routers_loss": 0.011971636675298214,
+ "routers_loss": 0.01348950993269682,
"skip_count": 0.0,
"step": 656,
"text_loss": 0.6375506520271301
@@ -6249,13 +6249,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.212890625,
+ "grad_norm": 0.1953125,
"learning_rate": 0.0009994097696855106,
- "loss": 0.0414,
+ "loss": 0.0412,
"macro_f1": 0.3333333432674408,
"num_tokens": 1060624.0,
"repeat_count": 0.0,
- "routers_loss": 0.010221127420663834,
+ "routers_loss": 0.009649243205785751,
"skip_count": 0.0,
"step": 658,
"text_loss": 0.5315385460853577
@@ -6268,13 +6268,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2265625,
+ "grad_norm": 0.2041015625,
"learning_rate": 0.0009993946392433395,
- "loss": 0.061,
+ "loss": 0.0609,
"macro_f1": 0.307692289352417,
"num_tokens": 1065076.0,
"repeat_count": 0.0,
- "routers_loss": 0.11860335618257523,
+ "routers_loss": 0.1250980943441391,
"skip_count": 3.0,
"step": 660,
"text_loss": 0.25780341029167175
@@ -6287,13 +6287,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009993793174322006,
- "loss": 0.0485,
+ "loss": 0.0471,
"macro_f1": 0.3333333432674408,
"num_tokens": 1068365.0,
"repeat_count": 0.0,
- "routers_loss": 0.011139829643070698,
+ "routers_loss": 0.011544390581548214,
"skip_count": 0.0,
"step": 662,
"text_loss": 0.34876301884651184
@@ -6306,13 +6306,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.166015625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009993638042579654,
- "loss": 0.0478,
+ "loss": 0.0473,
"macro_f1": 0.3272727429866791,
"num_tokens": 1071693.0,
"repeat_count": 0.0,
- "routers_loss": 0.03978770971298218,
+ "routers_loss": 0.03777370601892471,
"skip_count": 1.0,
"step": 664,
"text_loss": 0.21811571717262268
@@ -6327,11 +6327,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.203125,
"learning_rate": 0.0009993480997265783,
- "loss": 0.0481,
+ "loss": 0.0475,
"macro_f1": 0.5492662787437439,
"num_tokens": 1074733.0,
"repeat_count": 0.0,
- "routers_loss": 0.051231011748313904,
+ "routers_loss": 0.049949806183576584,
"skip_count": 2.0,
"step": 666,
"text_loss": 0.38410288095474243
@@ -6344,13 +6344,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.10302734375,
"learning_rate": 0.0009993322038440572,
- "loss": 0.0615,
+ "loss": 0.0605,
"macro_f1": 0.3333333432674408,
"num_tokens": 1077993.0,
"repeat_count": 0.0,
- "routers_loss": 0.024917088449001312,
+ "routers_loss": 0.0247171800583601,
"skip_count": 0.0,
"step": 668,
"text_loss": 0.25576895475387573
@@ -6363,13 +6363,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.216796875,
"learning_rate": 0.000999316116616494,
- "loss": 0.0627,
+ "loss": 0.0619,
"macro_f1": 0.3333333432674408,
"num_tokens": 1080491.0,
"repeat_count": 0.0,
- "routers_loss": 0.008834881708025932,
+ "routers_loss": 0.008118715137243271,
"skip_count": 0.0,
"step": 670,
"text_loss": 0.6269792914390564
@@ -6382,13 +6382,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009992998380500527,
"loss": 0.0462,
"macro_f1": 0.3272727429866791,
"num_tokens": 1083817.0,
"repeat_count": 0.0,
- "routers_loss": 0.033405229449272156,
+ "routers_loss": 0.03366057574748993,
"skip_count": 1.0,
"step": 672,
"text_loss": 0.26891493797302246
@@ -6401,13 +6401,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.13671875,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009992833681509716,
- "loss": 0.0523,
+ "loss": 0.0529,
"macro_f1": 0.3333333432674408,
"num_tokens": 1087368.0,
"repeat_count": 0.0,
- "routers_loss": 0.020753704011440277,
+ "routers_loss": 0.020552074536681175,
"skip_count": 0.0,
"step": 674,
"text_loss": 0.14421936869621277
@@ -6420,13 +6420,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.18359375,
"learning_rate": 0.0009992667069255619,
- "loss": 0.0698,
+ "loss": 0.0696,
"macro_f1": 0.31446540355682373,
"num_tokens": 1090452.0,
"repeat_count": 0.0,
- "routers_loss": 0.06932353973388672,
+ "routers_loss": 0.06937336176633835,
"skip_count": 2.0,
"step": 676,
"text_loss": 0.24999259412288666
@@ -6439,13 +6439,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.0009992498543802085,
- "loss": 0.059,
+ "loss": 0.0588,
"macro_f1": 0.3272727429866791,
"num_tokens": 1093996.0,
"repeat_count": 1.0,
- "routers_loss": 0.032903749495744705,
+ "routers_loss": 0.0380021296441555,
"skip_count": 0.0,
"step": 678,
"text_loss": 0.42473849654197693
@@ -6458,32 +6458,32 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.2099609375,
+ "grad_norm": 0.2119140625,
"learning_rate": 0.0009992328105213688,
- "loss": 0.0417,
+ "loss": 0.0411,
"macro_f1": 0.4400000274181366,
"num_tokens": 1096837.0,
"repeat_count": 1.0,
- "routers_loss": 0.19733747839927673,
+ "routers_loss": 0.20885063707828522,
"skip_count": 4.0,
"step": 680,
"text_loss": 0.3829527199268341
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.2019371881420606,
- "f1_execute": 1.0,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.154296875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009992155753555747,
- "loss": 0.0729,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0722,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1100320.0,
"repeat_count": 0.0,
- "routers_loss": 0.013452666811645031,
+ "routers_loss": 0.018230699002742767,
"skip_count": 2.0,
"step": 682,
"text_loss": 0.6190969944000244
@@ -6496,13 +6496,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.30859375,
"learning_rate": 0.0009991981488894303,
"loss": 0.0681,
"macro_f1": 0.32098767161369324,
"num_tokens": 1103682.0,
"repeat_count": 0.0,
- "routers_loss": 0.05302857980132103,
+ "routers_loss": 0.05550144240260124,
"skip_count": 1.0,
"step": 684,
"text_loss": 0.44418027997016907
@@ -6515,13 +6515,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.0009991805311296133,
- "loss": 0.0527,
+ "loss": 0.0507,
"macro_f1": 0.32098764181137085,
"num_tokens": 1106427.0,
"repeat_count": 0.0,
- "routers_loss": 0.08124994486570358,
+ "routers_loss": 0.07990608364343643,
"skip_count": 2.0,
"step": 686,
"text_loss": 0.5577231645584106
@@ -6534,13 +6534,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009991627220828753,
- "loss": 0.0579,
+ "loss": 0.0568,
"macro_f1": 0.32098764181137085,
"num_tokens": 1109314.0,
"repeat_count": 0.0,
- "routers_loss": 0.058633625507354736,
+ "routers_loss": 0.05167485028505325,
"skip_count": 2.0,
"step": 688,
"text_loss": 0.27325430512428284
@@ -6553,13 +6553,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009991447217560408,
- "loss": 0.0533,
+ "loss": 0.0521,
"macro_f1": 0.5492662787437439,
"num_tokens": 1112748.0,
"repeat_count": 0.0,
- "routers_loss": 0.04703643172979355,
+ "routers_loss": 0.04621964320540428,
"skip_count": 2.0,
"step": 690,
"text_loss": 0.5288321375846863
@@ -6572,13 +6572,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.000999126530156007,
- "loss": 0.0485,
+ "loss": 0.0499,
"macro_f1": 0.307692289352417,
"num_tokens": 1116965.0,
"repeat_count": 1.0,
- "routers_loss": 0.11615128815174103,
+ "routers_loss": 0.11950276792049408,
"skip_count": 2.0,
"step": 692,
"text_loss": 0.14215624332427979
@@ -6591,13 +6591,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2314453125,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.0009991081472897454,
- "loss": 0.0718,
+ "loss": 0.0722,
"macro_f1": 0.3333333432674408,
"num_tokens": 1120570.0,
"repeat_count": 0.0,
- "routers_loss": 0.017403846606612206,
+ "routers_loss": 0.01905500330030918,
"skip_count": 0.0,
"step": 694,
"text_loss": 0.41862696409225464
@@ -6610,13 +6610,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.0009990895731643002,
- "loss": 0.0444,
+ "loss": 0.0464,
"macro_f1": 0.3272727429866791,
"num_tokens": 1124009.0,
"repeat_count": 1.0,
- "routers_loss": 0.07067303359508514,
+ "routers_loss": 0.06974572688341141,
"skip_count": 0.0,
"step": 696,
"text_loss": 0.41160130500793457
@@ -6629,13 +6629,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.000999070807786789,
- "loss": 0.0527,
+ "loss": 0.0531,
"macro_f1": 0.3272727429866791,
"num_tokens": 1127370.0,
"repeat_count": 1.0,
- "routers_loss": 0.07131028175354004,
+ "routers_loss": 0.07055293023586273,
"skip_count": 0.0,
"step": 698,
"text_loss": 0.48068273067474365
@@ -6648,13 +6648,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.18359375,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000999051851164403,
- "loss": 0.0629,
+ "loss": 0.0619,
"macro_f1": 0.32098764181137085,
"num_tokens": 1130234.0,
"repeat_count": 1.0,
- "routers_loss": 0.1152748316526413,
+ "routers_loss": 0.12506946921348572,
"skip_count": 1.0,
"step": 700,
"text_loss": 0.47925490140914917
@@ -6667,13 +6667,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000999032703304406,
- "loss": 0.0663,
+ "loss": 0.0674,
"macro_f1": 0.3333333432674408,
"num_tokens": 1132874.0,
"repeat_count": 0.0,
- "routers_loss": 0.0077212234027683735,
+ "routers_loss": 0.00809287466108799,
"skip_count": 0.0,
"step": 702,
"text_loss": 0.47433632612228394
@@ -6686,13 +6686,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.1064453125,
"learning_rate": 0.0009990133642141358,
- "loss": 0.0494,
+ "loss": 0.0497,
"macro_f1": 0.5492662787437439,
"num_tokens": 1136011.0,
"repeat_count": 0.0,
- "routers_loss": 0.02726336568593979,
+ "routers_loss": 0.0319170281291008,
"skip_count": 2.0,
"step": 704,
"text_loss": 0.6574832201004028
@@ -6705,13 +6705,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.32421875,
+ "grad_norm": 0.33984375,
"learning_rate": 0.000998993833901003,
- "loss": 0.0615,
+ "loss": 0.0619,
"macro_f1": 0.32098764181137085,
"num_tokens": 1139674.0,
"repeat_count": 0.0,
- "routers_loss": 0.0958542674779892,
+ "routers_loss": 0.09850362688302994,
"skip_count": 2.0,
"step": 706,
"text_loss": 0.7660127282142639
@@ -6724,13 +6724,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009989741123724919,
- "loss": 0.0583,
+ "loss": 0.0574,
"macro_f1": 0.3333333432674408,
"num_tokens": 1143558.0,
"repeat_count": 0.0,
- "routers_loss": 0.007100600749254227,
+ "routers_loss": 0.006673311349004507,
"skip_count": 0.0,
"step": 708,
"text_loss": 0.5976111888885498
@@ -6743,13 +6743,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009989541996361594,
- "loss": 0.0445,
+ "loss": 0.045,
"macro_f1": 0.3333333432674408,
"num_tokens": 1146122.0,
"repeat_count": 0.0,
- "routers_loss": 0.0047812811098992825,
+ "routers_loss": 0.004988791421055794,
"skip_count": 0.0,
"step": 710,
"text_loss": 0.5256119966506958
@@ -6762,13 +6762,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009989340956996367,
- "loss": 0.052,
+ "loss": 0.0528,
"macro_f1": 0.3333333432674408,
"num_tokens": 1149546.0,
"repeat_count": 0.0,
- "routers_loss": 0.006643407512456179,
+ "routers_loss": 0.0067769973538815975,
"skip_count": 0.0,
"step": 712,
"text_loss": 0.5040497779846191
@@ -6781,13 +6781,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2890625,
+ "grad_norm": 0.26953125,
"learning_rate": 0.0009989138005706273,
- "loss": 0.0719,
+ "loss": 0.0735,
"macro_f1": 0.32098764181137085,
"num_tokens": 1153195.0,
"repeat_count": 0.0,
- "routers_loss": 0.0910436138510704,
+ "routers_loss": 0.09899546951055527,
"skip_count": 2.0,
"step": 714,
"text_loss": 0.20803412795066833
@@ -6800,13 +6800,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1484375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.000998893314256908,
- "loss": 0.0649,
+ "loss": 0.064,
"macro_f1": 0.3333333432674408,
"num_tokens": 1157081.0,
"repeat_count": 0.0,
- "routers_loss": 0.010978946462273598,
+ "routers_loss": 0.010492355562746525,
"skip_count": 0.0,
"step": 716,
"text_loss": 0.23077639937400818
@@ -6819,13 +6819,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009988726367663298,
- "loss": 0.0543,
+ "loss": 0.0539,
"macro_f1": 0.3333333432674408,
"num_tokens": 1160079.0,
"repeat_count": 0.0,
- "routers_loss": 0.009956461377441883,
+ "routers_loss": 0.01063773687928915,
"skip_count": 0.0,
"step": 718,
"text_loss": 0.6085864901542664
@@ -6838,13 +6838,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009988517681068163,
- "loss": 0.0412,
+ "loss": 0.0421,
"macro_f1": 0.3272727429866791,
"num_tokens": 1163249.0,
"repeat_count": 1.0,
- "routers_loss": 0.057210199534893036,
+ "routers_loss": 0.05981874838471413,
"skip_count": 0.0,
"step": 720,
"text_loss": 0.4047050476074219
@@ -6857,32 +6857,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009988307082863638,
- "loss": 0.0364,
+ "loss": 0.0361,
"macro_f1": 0.3333333432674408,
"num_tokens": 1166259.0,
"repeat_count": 0.0,
- "routers_loss": 0.01035996899008751,
+ "routers_loss": 0.009750043973326683,
"skip_count": 0.0,
"step": 722,
"text_loss": 0.5306474566459656
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.3991781626063986,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.240234375,
"learning_rate": 0.0009988094573130434,
- "loss": 0.0661,
- "macro_f1": 0.3076923191547394,
+ "loss": 0.063,
+ "macro_f1": 0.5359477400779724,
"num_tokens": 1168887.0,
"repeat_count": 2.0,
- "routers_loss": 0.18087820708751678,
+ "routers_loss": 0.18601104617118835,
"skip_count": 2.0,
"step": 724,
"text_loss": 0.53528892993927
@@ -6895,32 +6895,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009987880151949974,
- "loss": 0.0505,
+ "loss": 0.0496,
"macro_f1": 0.3272727429866791,
"num_tokens": 1172625.0,
"repeat_count": 0.0,
- "routers_loss": 0.04720238968729973,
+ "routers_loss": 0.02845010720193386,
"skip_count": 1.0,
"step": 726,
"text_loss": 0.4760453701019287
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.417963017317288,
- "f1_execute": 1.0,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.2216796875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.0009987663819404434,
- "loss": 0.0603,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.06,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1176580.0,
"repeat_count": 0.0,
- "routers_loss": 0.015407778322696686,
+ "routers_loss": 0.017596980556845665,
"skip_count": 2.0,
"step": 728,
"text_loss": 0.5146099328994751
@@ -6933,13 +6933,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.000998744557557671,
- "loss": 0.0489,
+ "loss": 0.0484,
"macro_f1": 0.3272727429866791,
"num_tokens": 1179804.0,
"repeat_count": 0.0,
- "routers_loss": 0.060891781002283096,
+ "routers_loss": 0.0625474750995636,
"skip_count": 1.0,
"step": 730,
"text_loss": 0.27738022804260254
@@ -6947,18 +6947,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.436747872028177,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.203125,
"learning_rate": 0.0009987225420550433,
- "loss": 0.0825,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0796,
+ "macro_f1": 0.307692289352417,
"num_tokens": 1182658.0,
"repeat_count": 1.0,
- "routers_loss": 0.1661442220211029,
+ "routers_loss": 0.16188351809978485,
"skip_count": 2.0,
"step": 732,
"text_loss": 0.23231445252895355
@@ -6966,18 +6966,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.446140299383622,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.2001953125,
"learning_rate": 0.0009987003354409965,
- "loss": 0.0634,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0626,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1185451.0,
"repeat_count": 0.0,
- "routers_loss": 0.02108248695731163,
+ "routers_loss": 0.02391529455780983,
"skip_count": 0.0,
"step": 734,
"text_loss": 0.4496627151966095
@@ -6990,13 +6990,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.248046875,
+ "grad_norm": 0.234375,
"learning_rate": 0.0009986779377240405,
- "loss": 0.0534,
+ "loss": 0.0513,
"macro_f1": 0.32098767161369324,
"num_tokens": 1188666.0,
"repeat_count": 0.0,
- "routers_loss": 0.08318125456571579,
+ "routers_loss": 0.08435963839292526,
"skip_count": 1.0,
"step": 736,
"text_loss": 0.4950787127017975
@@ -7009,13 +7009,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11962890625,
+ "grad_norm": 0.1220703125,
"learning_rate": 0.000998655348912758,
- "loss": 0.0514,
+ "loss": 0.0515,
"macro_f1": 0.3333333432674408,
"num_tokens": 1193035.0,
"repeat_count": 0.0,
- "routers_loss": 0.015889234840869904,
+ "routers_loss": 0.01648722216486931,
"skip_count": 0.0,
"step": 738,
"text_loss": 0.24761848151683807
@@ -7028,13 +7028,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1513671875,
"learning_rate": 0.0009986325690158051,
"loss": 0.0435,
"macro_f1": 0.3333333432674408,
"num_tokens": 1196840.0,
"repeat_count": 0.0,
- "routers_loss": 0.01378484908491373,
+ "routers_loss": 0.013143910095095634,
"skip_count": 0.0,
"step": 740,
"text_loss": 0.15662719309329987
@@ -7047,13 +7047,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1787109375,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009986095980419113,
- "loss": 0.076,
+ "loss": 0.0757,
"macro_f1": 0.3333333432674408,
"num_tokens": 1200573.0,
"repeat_count": 0.0,
- "routers_loss": 0.02673683874309063,
+ "routers_loss": 0.026706280186772346,
"skip_count": 0.0,
"step": 742,
"text_loss": 0.16725164651870728
@@ -7066,13 +7066,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.185546875,
+ "grad_norm": 0.1982421875,
"learning_rate": 0.0009985864359998787,
- "loss": 0.0778,
+ "loss": 0.0795,
"macro_f1": 0.3006536364555359,
"num_tokens": 1203589.0,
"repeat_count": 2.0,
- "routers_loss": 0.27776041626930237,
+ "routers_loss": 0.28607678413391113,
"skip_count": 3.0,
"step": 744,
"text_loss": 0.6350882053375244
@@ -7085,13 +7085,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009985630828985835,
- "loss": 0.0575,
+ "loss": 0.0572,
"macro_f1": 0.3272727429866791,
"num_tokens": 1206422.0,
"repeat_count": 0.0,
- "routers_loss": 0.0575483962893486,
+ "routers_loss": 0.05685260891914368,
"skip_count": 1.0,
"step": 746,
"text_loss": 0.33779552578926086
@@ -7104,13 +7104,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009985395387469742,
- "loss": 0.0478,
+ "loss": 0.0458,
"macro_f1": 0.5492662787437439,
"num_tokens": 1211588.0,
"repeat_count": 0.0,
- "routers_loss": 0.0458797849714756,
+ "routers_loss": 0.0437830351293087,
"skip_count": 2.0,
"step": 748,
"text_loss": 0.28664472699165344
@@ -7123,13 +7123,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.15625,
"learning_rate": 0.0009985158035540735,
- "loss": 0.0701,
+ "loss": 0.0714,
"macro_f1": 0.32098764181137085,
"num_tokens": 1214580.0,
"repeat_count": 2.0,
- "routers_loss": 0.07850238680839539,
+ "routers_loss": 0.07074898481369019,
"skip_count": 0.0,
"step": 750,
"text_loss": 0.3939313292503357
@@ -7142,13 +7142,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.21484375,
"learning_rate": 0.0009984918773289762,
- "loss": 0.0702,
+ "loss": 0.0699,
"macro_f1": 0.3333333432674408,
"num_tokens": 1217388.0,
"repeat_count": 0.0,
- "routers_loss": 0.009507967159152031,
+ "routers_loss": 0.009757856838405132,
"skip_count": 0.0,
"step": 752,
"text_loss": 0.37641215324401855
@@ -7161,13 +7161,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1484375,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009984677600808512,
- "loss": 0.0543,
+ "loss": 0.054,
"macro_f1": 0.3333333432674408,
"num_tokens": 1219960.0,
"repeat_count": 0.0,
- "routers_loss": 0.02620997279882431,
+ "routers_loss": 0.02515069581568241,
"skip_count": 0.0,
"step": 754,
"text_loss": 0.155938982963562
@@ -7180,13 +7180,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3359375,
+ "grad_norm": 0.30078125,
"learning_rate": 0.0009984434518189405,
- "loss": 0.0791,
+ "loss": 0.0764,
"macro_f1": 0.3333333432674408,
"num_tokens": 1223234.0,
"repeat_count": 0.0,
- "routers_loss": 0.02798631228506565,
+ "routers_loss": 0.025766927748918533,
"skip_count": 0.0,
"step": 756,
"text_loss": 0.691118061542511
@@ -7201,11 +7201,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1416015625,
"learning_rate": 0.0009984189525525584,
- "loss": 0.046,
+ "loss": 0.0451,
"macro_f1": 0.5359477400779724,
"num_tokens": 1225764.0,
"repeat_count": 2.0,
- "routers_loss": 0.16614431142807007,
+ "routers_loss": 0.1782722771167755,
"skip_count": 2.0,
"step": 758,
"text_loss": 0.3592209219932556
@@ -7218,13 +7218,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.193359375,
+ "grad_norm": 0.189453125,
"learning_rate": 0.0009983942622910935,
- "loss": 0.0669,
+ "loss": 0.0659,
"macro_f1": 0.3333333432674408,
"num_tokens": 1230097.0,
"repeat_count": 0.0,
- "routers_loss": 0.008541896007955074,
+ "routers_loss": 0.00825568474829197,
"skip_count": 0.0,
"step": 760,
"text_loss": 0.4646475315093994
@@ -7237,13 +7237,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009983693810440074,
- "loss": 0.0478,
+ "loss": 0.0477,
"macro_f1": 0.32098764181137085,
"num_tokens": 1233140.0,
"repeat_count": 0.0,
- "routers_loss": 0.045411624014377594,
+ "routers_loss": 0.04156976938247681,
"skip_count": 2.0,
"step": 762,
"text_loss": 0.298682302236557
@@ -7256,13 +7256,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.380859375,
+ "grad_norm": 0.3515625,
"learning_rate": 0.000998344308820834,
- "loss": 0.0689,
+ "loss": 0.0666,
"macro_f1": 0.3272727429866791,
"num_tokens": 1236305.0,
"repeat_count": 0.0,
- "routers_loss": 0.052299100905656815,
+ "routers_loss": 0.05697929114103317,
"skip_count": 1.0,
"step": 764,
"text_loss": 0.5249121189117432
@@ -7275,13 +7275,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.18359375,
"learning_rate": 0.0009983190456311817,
- "loss": 0.0602,
+ "loss": 0.0592,
"macro_f1": 0.3144654333591461,
"num_tokens": 1239673.0,
"repeat_count": 0.0,
- "routers_loss": 0.09140212833881378,
+ "routers_loss": 0.09547408670186996,
"skip_count": 3.0,
"step": 766,
"text_loss": 0.41277334094047546
@@ -7294,13 +7294,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.201171875,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000998293591484731,
- "loss": 0.0475,
+ "loss": 0.0484,
"macro_f1": 0.5492662787437439,
"num_tokens": 1242292.0,
"repeat_count": 0.0,
- "routers_loss": 0.030750583857297897,
+ "routers_loss": 0.030693158507347107,
"skip_count": 2.0,
"step": 768,
"text_loss": 0.1583656519651413
@@ -7313,13 +7313,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000998267946391236,
- "loss": 0.052,
+ "loss": 0.051,
"macro_f1": 0.3333333432674408,
"num_tokens": 1244661.0,
"repeat_count": 0.0,
- "routers_loss": 0.010202950797975063,
+ "routers_loss": 0.01211300864815712,
"skip_count": 0.0,
"step": 770,
"text_loss": 0.4629349112510681
@@ -7332,13 +7332,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0009982421103605238,
- "loss": 0.0434,
+ "loss": 0.0441,
"macro_f1": 0.32098764181137085,
"num_tokens": 1248688.0,
"repeat_count": 0.0,
- "routers_loss": 0.07364192605018616,
+ "routers_loss": 0.0665968507528305,
"skip_count": 2.0,
"step": 772,
"text_loss": 0.4019293785095215
@@ -7353,11 +7353,11 @@
"f1_skip": 0.0,
"grad_norm": 0.2890625,
"learning_rate": 0.000998216083402495,
- "loss": 0.0606,
+ "loss": 0.0613,
"macro_f1": 0.32098764181137085,
"num_tokens": 1251395.0,
"repeat_count": 0.0,
- "routers_loss": 0.06553081423044205,
+ "routers_loss": 0.07186859846115112,
"skip_count": 2.0,
"step": 774,
"text_loss": 0.4659276604652405
@@ -7370,13 +7370,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.302734375,
"learning_rate": 0.0009981898655271235,
- "loss": 0.0475,
+ "loss": 0.0488,
"macro_f1": 0.3333333432674408,
"num_tokens": 1254888.0,
"repeat_count": 0.0,
- "routers_loss": 0.008751659654080868,
+ "routers_loss": 0.007823926396667957,
"skip_count": 0.0,
"step": 776,
"text_loss": 0.5160359740257263
@@ -7389,13 +7389,13 @@
"f1_execute": 0.9130434989929199,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.11962890625,
"learning_rate": 0.0009981634567444557,
- "loss": 0.0777,
+ "loss": 0.0775,
"macro_f1": 0.590062141418457,
"num_tokens": 1258250.0,
"repeat_count": 3.0,
- "routers_loss": 0.24522721767425537,
+ "routers_loss": 0.24624499678611755,
"skip_count": 4.0,
"step": 778,
"text_loss": 0.29319918155670166
@@ -7408,13 +7408,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.0009981368570646115,
"loss": 0.0885,
"macro_f1": 0.3272727429866791,
"num_tokens": 1260916.0,
"repeat_count": 0.0,
- "routers_loss": 0.03767623379826546,
+ "routers_loss": 0.030730176717042923,
"skip_count": 1.0,
"step": 780,
"text_loss": 0.624981164932251
@@ -7427,13 +7427,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009981100664977838,
- "loss": 0.0708,
+ "loss": 0.0699,
"macro_f1": 0.3333333432674408,
"num_tokens": 1264004.0,
"repeat_count": 0.0,
- "routers_loss": 0.006098059006035328,
+ "routers_loss": 0.006829176563769579,
"skip_count": 0.0,
"step": 782,
"text_loss": 0.6137266159057617
@@ -7446,13 +7446,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009980830850542391,
- "loss": 0.0589,
+ "loss": 0.058,
"macro_f1": 0.3333333432674408,
"num_tokens": 1267130.0,
"repeat_count": 0.0,
- "routers_loss": 0.01731623336672783,
+ "routers_loss": 0.018471000716090202,
"skip_count": 0.0,
"step": 784,
"text_loss": 0.15213175117969513
@@ -7465,13 +7465,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.0009980559127443166,
- "loss": 0.0526,
+ "loss": 0.052,
"macro_f1": 0.3333333432674408,
"num_tokens": 1271129.0,
"repeat_count": 0.0,
- "routers_loss": 0.0076471962966024876,
+ "routers_loss": 0.007903140969574451,
"skip_count": 0.0,
"step": 786,
"text_loss": 0.5768613219261169
@@ -7484,13 +7484,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.130859375,
"learning_rate": 0.000998028549578429,
- "loss": 0.0745,
+ "loss": 0.0719,
"macro_f1": 0.307692289352417,
"num_tokens": 1274232.0,
"repeat_count": 0.0,
- "routers_loss": 0.0637628585100174,
+ "routers_loss": 0.06737866252660751,
"skip_count": 3.0,
"step": 788,
"text_loss": 0.2877073585987091
@@ -7503,13 +7503,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009980009955670615,
- "loss": 0.0699,
+ "loss": 0.0698,
"macro_f1": 0.3144654333591461,
"num_tokens": 1277193.0,
"repeat_count": 0.0,
- "routers_loss": 0.10882514715194702,
+ "routers_loss": 0.10194934904575348,
"skip_count": 3.0,
"step": 790,
"text_loss": 0.11860492825508118
@@ -7522,13 +7522,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.126953125,
"learning_rate": 0.000997973250720773,
- "loss": 0.056,
+ "loss": 0.0552,
"macro_f1": 0.32098764181137085,
"num_tokens": 1280960.0,
"repeat_count": 0.0,
- "routers_loss": 0.10924118757247925,
+ "routers_loss": 0.10297708213329315,
"skip_count": 2.0,
"step": 792,
"text_loss": 0.13477706909179688
@@ -7541,13 +7541,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009979453150501954,
- "loss": 0.0664,
+ "loss": 0.0663,
"macro_f1": 0.32098764181137085,
"num_tokens": 1284611.0,
"repeat_count": 1.0,
- "routers_loss": 0.06571807712316513,
+ "routers_loss": 0.06122037023305893,
"skip_count": 1.0,
"step": 794,
"text_loss": 0.40569379925727844
@@ -7560,13 +7560,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1181640625,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.000997917188566034,
- "loss": 0.0616,
+ "loss": 0.062,
"macro_f1": 0.32098764181137085,
"num_tokens": 1287834.0,
"repeat_count": 0.0,
- "routers_loss": 0.058966971933841705,
+ "routers_loss": 0.061135001480579376,
"skip_count": 2.0,
"step": 796,
"text_loss": 0.2829287648200989
@@ -7579,32 +7579,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.109375,
"learning_rate": 0.0009978888712790664,
- "loss": 0.067,
+ "loss": 0.0654,
"macro_f1": 0.3272727429866791,
"num_tokens": 1291666.0,
"repeat_count": 0.0,
- "routers_loss": 0.04844636470079422,
+ "routers_loss": 0.04841872677206993,
"skip_count": 1.0,
"step": 798,
"text_loss": 1.011757254600525
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.4000000059604645,
- "avg_layers": 26.0,
+ "acc_skip": 0.20000000298023224,
+ "avg_layers": 27.0,
"epoch": 3.756090402113296,
- "f1_execute": 0.9166666865348816,
+ "f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
- "f1_skip": 0.5714285969734192,
- "grad_norm": 0.1416015625,
+ "f1_skip": 0.3333333134651184,
+ "grad_norm": 0.14453125,
"learning_rate": 0.0009978603632001444,
- "loss": 0.0634,
- "macro_f1": 0.4960317611694336,
+ "loss": 0.0636,
+ "macro_f1": 0.4104308485984802,
"num_tokens": 1294627.0,
"repeat_count": 1.0,
- "routers_loss": 0.1591777801513672,
+ "routers_loss": 0.15698759257793427,
"skip_count": 5.0,
"step": 800,
"text_loss": 0.4457623362541199
@@ -7617,13 +7617,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0009978316643401916,
- "loss": 0.0694,
+ "loss": 0.0688,
"macro_f1": 0.3333333432674408,
"num_tokens": 1297711.0,
"repeat_count": 0.0,
- "routers_loss": 0.017735568806529045,
+ "routers_loss": 0.018952010199427605,
"skip_count": 0.0,
"step": 802,
"text_loss": 0.2069481462240219
@@ -7636,13 +7636,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.14453125,
"learning_rate": 0.0009978027747102062,
- "loss": 0.0477,
+ "loss": 0.0479,
"macro_f1": 0.3333333432674408,
"num_tokens": 1300569.0,
"repeat_count": 0.0,
- "routers_loss": 0.012401525862514973,
+ "routers_loss": 0.014538386836647987,
"skip_count": 0.0,
"step": 804,
"text_loss": 0.4983852505683899
@@ -7655,13 +7655,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2080078125,
+ "grad_norm": 0.2109375,
"learning_rate": 0.0009977736943212584,
- "loss": 0.0735,
+ "loss": 0.0721,
"macro_f1": 0.32098764181137085,
"num_tokens": 1303969.0,
"repeat_count": 0.0,
- "routers_loss": 0.10736164450645447,
+ "routers_loss": 0.11164087057113647,
"skip_count": 2.0,
"step": 806,
"text_loss": 0.2910642921924591
@@ -7674,13 +7674,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2001953125,
+ "grad_norm": 0.1826171875,
"learning_rate": 0.000997744423184492,
- "loss": 0.0428,
+ "loss": 0.0424,
"macro_f1": 0.3272727429866791,
"num_tokens": 1307263.0,
"repeat_count": 0.0,
- "routers_loss": 0.0595436617732048,
+ "routers_loss": 0.06073406711220741,
"skip_count": 1.0,
"step": 808,
"text_loss": 0.18831779062747955
@@ -7693,13 +7693,13 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.26171875,
"learning_rate": 0.0009977149613111236,
- "loss": 0.0494,
+ "loss": 0.0486,
"macro_f1": 0.4400000274181366,
"num_tokens": 1309953.0,
"repeat_count": 1.0,
- "routers_loss": 0.12617000937461853,
+ "routers_loss": 0.11035524308681488,
"skip_count": 4.0,
"step": 810,
"text_loss": 0.7872759699821472
@@ -7712,13 +7712,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009976853087124433,
- "loss": 0.0537,
+ "loss": 0.0536,
"macro_f1": 0.3333333432674408,
"num_tokens": 1313243.0,
"repeat_count": 0.0,
- "routers_loss": 0.021242506802082062,
+ "routers_loss": 0.021804286167025566,
"skip_count": 0.0,
"step": 812,
"text_loss": 0.22349292039871216
@@ -7731,13 +7731,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.318359375,
+ "grad_norm": 0.28125,
"learning_rate": 0.0009976554653998138,
- "loss": 0.0617,
+ "loss": 0.0612,
"macro_f1": 0.31446540355682373,
"num_tokens": 1316165.0,
"repeat_count": 0.0,
- "routers_loss": 0.10387415438890457,
+ "routers_loss": 0.10715524107217789,
"skip_count": 2.0,
"step": 814,
"text_loss": 0.18035532534122467
@@ -7750,13 +7750,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.000997625431384671,
- "loss": 0.0565,
+ "loss": 0.0564,
"macro_f1": 0.3333333432674408,
"num_tokens": 1319206.0,
"repeat_count": 0.0,
- "routers_loss": 0.007816939614713192,
+ "routers_loss": 0.007173649035394192,
"skip_count": 0.0,
"step": 816,
"text_loss": 0.48928648233413696
@@ -7769,13 +7769,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.0009975952066785243,
- "loss": 0.0654,
+ "loss": 0.0655,
"macro_f1": 0.3006536364555359,
"num_tokens": 1322549.0,
"repeat_count": 1.0,
- "routers_loss": 0.22526368498802185,
+ "routers_loss": 0.22308112680912018,
"skip_count": 4.0,
"step": 818,
"text_loss": 0.5211259722709656
@@ -7788,13 +7788,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.0009975647912929557,
- "loss": 0.056,
+ "loss": 0.0564,
"macro_f1": 0.3333333432674408,
"num_tokens": 1325213.0,
"repeat_count": 0.0,
- "routers_loss": 0.010998851619660854,
+ "routers_loss": 0.00998698640614748,
"skip_count": 0.0,
"step": 820,
"text_loss": 0.7117052674293518
@@ -7807,13 +7807,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009975341852396205,
- "loss": 0.0712,
+ "loss": 0.0723,
"macro_f1": 0.32098764181137085,
"num_tokens": 1328383.0,
"repeat_count": 0.0,
- "routers_loss": 0.07115054875612259,
+ "routers_loss": 0.07454588264226913,
"skip_count": 2.0,
"step": 822,
"text_loss": 0.34539610147476196
@@ -7826,13 +7826,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009975033885302469,
- "loss": 0.0611,
+ "loss": 0.0604,
"macro_f1": 0.3333333432674408,
"num_tokens": 1331406.0,
"repeat_count": 0.0,
- "routers_loss": 0.008062695153057575,
+ "routers_loss": 0.009157589636743069,
"skip_count": 0.0,
"step": 824,
"text_loss": 0.7484824657440186
@@ -7845,13 +7845,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.0009974724011766363,
- "loss": 0.0496,
+ "loss": 0.0474,
"macro_f1": 0.3272727429866791,
"num_tokens": 1334410.0,
"repeat_count": 1.0,
- "routers_loss": 0.16666285693645477,
+ "routers_loss": 0.17149391770362854,
"skip_count": 0.0,
"step": 826,
"text_loss": 0.5913820266723633
@@ -7864,13 +7864,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1708984375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009974412231906632,
- "loss": 0.0567,
+ "loss": 0.058,
"macro_f1": 0.32098764181137085,
"num_tokens": 1337653.0,
"repeat_count": 1.0,
- "routers_loss": 0.0908689796924591,
+ "routers_loss": 0.09743282198905945,
"skip_count": 1.0,
"step": 828,
"text_loss": 0.2505693733692169
@@ -7883,13 +7883,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16015625,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009974098545842748,
- "loss": 0.0648,
+ "loss": 0.0638,
"macro_f1": 0.3272727429866791,
"num_tokens": 1340860.0,
"repeat_count": 0.0,
- "routers_loss": 0.04364728182554245,
+ "routers_loss": 0.041490405797958374,
"skip_count": 1.0,
"step": 830,
"text_loss": 0.5585370063781738
@@ -7897,18 +7897,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.906369239800411,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2060546875,
+ "grad_norm": 0.193359375,
"learning_rate": 0.0009973782953694918,
- "loss": 0.0772,
- "macro_f1": 0.3076923191547394,
- "num_tokens": 1344232.0,
+ "loss": 0.0746,
+ "macro_f1": 0.3006536066532135,
+ "num_tokens": 1344232.0,
"repeat_count": 1.0,
- "routers_loss": 0.15315109491348267,
+ "routers_loss": 0.16080693900585175,
"skip_count": 3.0,
"step": 832,
"text_loss": 0.4782734513282776
@@ -7921,13 +7921,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.000997346545558408,
- "loss": 0.0527,
+ "loss": 0.0522,
"macro_f1": 0.3333333432674408,
"num_tokens": 1347667.0,
"repeat_count": 0.0,
- "routers_loss": 0.01342768594622612,
+ "routers_loss": 0.01173500344157219,
"skip_count": 0.0,
"step": 834,
"text_loss": 0.25036177039146423
@@ -7940,13 +7940,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1748046875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009973146051631895,
- "loss": 0.0513,
+ "loss": 0.0522,
"macro_f1": 0.3333333432674408,
"num_tokens": 1350707.0,
"repeat_count": 0.0,
- "routers_loss": 0.01158806961029768,
+ "routers_loss": 0.011477196589112282,
"skip_count": 0.0,
"step": 836,
"text_loss": 0.5482863187789917
@@ -7959,13 +7959,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009972824741960764,
- "loss": 0.0549,
+ "loss": 0.0536,
"macro_f1": 0.3333333432674408,
"num_tokens": 1353704.0,
"repeat_count": 0.0,
- "routers_loss": 0.01255605649203062,
+ "routers_loss": 0.010528896935284138,
"skip_count": 0.0,
"step": 838,
"text_loss": 0.6732596158981323
@@ -7978,13 +7978,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12255859375,
+ "grad_norm": 0.1181640625,
"learning_rate": 0.000997250152669381,
- "loss": 0.0578,
+ "loss": 0.0573,
"macro_f1": 0.3333333432674408,
"num_tokens": 1356608.0,
"repeat_count": 0.0,
- "routers_loss": 0.010225459933280945,
+ "routers_loss": 0.010678744874894619,
"skip_count": 0.0,
"step": 840,
"text_loss": 0.5479338765144348
@@ -7997,13 +7997,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.181640625,
"learning_rate": 0.000997217640595489,
- "loss": 0.0633,
+ "loss": 0.0631,
"macro_f1": 0.3333333432674408,
"num_tokens": 1359809.0,
"repeat_count": 0.0,
- "routers_loss": 0.007837744429707527,
+ "routers_loss": 0.00835978239774704,
"skip_count": 0.0,
"step": 842,
"text_loss": 0.42543259263038635
@@ -8016,13 +8016,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.0009971849379868593,
- "loss": 0.0674,
+ "loss": 0.0653,
"macro_f1": 0.3333333432674408,
"num_tokens": 1362201.0,
"repeat_count": 0.0,
- "routers_loss": 0.008631376549601555,
+ "routers_loss": 0.009930923581123352,
"skip_count": 0.0,
"step": 844,
"text_loss": 0.720462441444397
@@ -8035,13 +8035,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009971520448560235,
- "loss": 0.0612,
+ "loss": 0.0615,
"macro_f1": 0.3272727429866791,
"num_tokens": 1365790.0,
"repeat_count": 0.0,
- "routers_loss": 0.06206027418375015,
+ "routers_loss": 0.06344373524188995,
"skip_count": 1.0,
"step": 846,
"text_loss": 0.8423607349395752
@@ -8049,18 +8049,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 3.9815086586439685,
- "f1_execute": 0.9411765336990356,
+ "f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
- "f1_skip": 0.5,
- "grad_norm": 0.16015625,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.16796875,
"learning_rate": 0.000997118961215586,
- "loss": 0.0678,
- "macro_f1": 0.480392187833786,
+ "loss": 0.0674,
+ "macro_f1": 0.4533333480358124,
"num_tokens": 1368387.0,
"repeat_count": 1.0,
- "routers_loss": 0.1463794708251953,
+ "routers_loss": 0.14688406884670258,
"skip_count": 3.0,
"step": 848,
"text_loss": 0.3933577537536621
@@ -8073,13 +8073,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000997085687078225,
- "loss": 0.052,
+ "loss": 0.0518,
"macro_f1": 0.3333333432674408,
"num_tokens": 1371189.0,
"repeat_count": 0.0,
- "routers_loss": 0.01140492781996727,
+ "routers_loss": 0.009953443892300129,
"skip_count": 0.0,
"step": 850,
"text_loss": 0.41469162702560425
@@ -8092,13 +8092,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.15625,
"learning_rate": 0.0009970522224566909,
- "loss": 0.0563,
+ "loss": 0.0555,
"macro_f1": 0.32098767161369324,
"num_tokens": 1374008.0,
"repeat_count": 0.0,
- "routers_loss": 0.05136030167341232,
+ "routers_loss": 0.048870690166950226,
"skip_count": 1.0,
"step": 852,
"text_loss": 0.613615870475769
@@ -8111,32 +8111,32 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0009970185673638075,
- "loss": 0.0627,
+ "loss": 0.0629,
"macro_f1": 0.32098764181137085,
"num_tokens": 1376662.0,
"repeat_count": 1.0,
- "routers_loss": 0.07274381071329117,
+ "routers_loss": 0.06865929812192917,
"skip_count": 1.0,
"step": 854,
"text_loss": 0.4392736256122589
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 4.01878485471089,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.162109375,
"learning_rate": 0.0009969847218124716,
- "loss": 0.0503,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0506,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1380049.0,
"repeat_count": 0.0,
- "routers_loss": 0.024335317313671112,
+ "routers_loss": 0.02382219396531582,
"skip_count": 1.0,
"step": 856,
"text_loss": 0.19115346670150757
@@ -8149,13 +8149,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009969506858156527,
- "loss": 0.0359,
+ "loss": 0.0344,
"macro_f1": 0.3272727429866791,
"num_tokens": 1383008.0,
"repeat_count": 0.0,
- "routers_loss": 0.046614740043878555,
+ "routers_loss": 0.03907281160354614,
"skip_count": 1.0,
"step": 858,
"text_loss": 0.34842637181282043
@@ -8168,13 +8168,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.12060546875,
"learning_rate": 0.0009969164593863935,
- "loss": 0.0372,
+ "loss": 0.0365,
"macro_f1": 0.3333333432674408,
"num_tokens": 1387051.0,
"repeat_count": 0.0,
- "routers_loss": 0.006380240898579359,
+ "routers_loss": 0.007645803038030863,
"skip_count": 0.0,
"step": 860,
"text_loss": 0.3810436725616455
@@ -8187,13 +8187,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009968820425378098,
- "loss": 0.0473,
+ "loss": 0.0463,
"macro_f1": 0.3272727429866791,
"num_tokens": 1390244.0,
"repeat_count": 1.0,
- "routers_loss": 0.04770716652274132,
+ "routers_loss": 0.04435238987207413,
"skip_count": 0.0,
"step": 862,
"text_loss": 0.34853485226631165
@@ -8206,32 +8206,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3359375,
+ "grad_norm": 0.28515625,
"learning_rate": 0.00099684743528309,
- "loss": 0.0434,
+ "loss": 0.0424,
"macro_f1": 0.3333333432674408,
"num_tokens": 1392976.0,
"repeat_count": 0.0,
- "routers_loss": 0.006983708590269089,
+ "routers_loss": 0.006071661598980427,
"skip_count": 0.0,
"step": 864,
"text_loss": 0.6395178437232971
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 4.065746991488113,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.080078125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.0009968126376354958,
- "loss": 0.0476,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0477,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1396061.0,
"repeat_count": 0.0,
- "routers_loss": 0.046313900500535965,
+ "routers_loss": 0.05011235550045967,
"skip_count": 2.0,
"step": 866,
"text_loss": 0.09103966504335403
@@ -8244,32 +8244,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009967776496083616,
"loss": 0.0509,
"macro_f1": 0.3272727429866791,
"num_tokens": 1398993.0,
"repeat_count": 1.0,
- "routers_loss": 0.0401870422065258,
+ "routers_loss": 0.03979124873876572,
"skip_count": 0.0,
"step": 868,
"text_loss": 0.27257058024406433
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 4.084531846199002,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.14453125,
"learning_rate": 0.000996742471215095,
- "loss": 0.0505,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0516,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1402080.0,
"repeat_count": 0.0,
- "routers_loss": 0.03313451260328293,
+ "routers_loss": 0.030823837965726852,
"skip_count": 2.0,
"step": 870,
"text_loss": 0.7047103047370911
@@ -8282,13 +8282,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009967071024691763,
- "loss": 0.0468,
+ "loss": 0.0461,
"macro_f1": 0.3333333432674408,
"num_tokens": 1404890.0,
"repeat_count": 0.0,
- "routers_loss": 0.010118982754647732,
+ "routers_loss": 0.009721715934574604,
"skip_count": 0.0,
"step": 872,
"text_loss": 0.959106981754303
@@ -8301,13 +8301,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.000996671543384159,
- "loss": 0.0498,
+ "loss": 0.05,
"macro_f1": 0.3333333432674408,
"num_tokens": 1407853.0,
"repeat_count": 0.0,
- "routers_loss": 0.005856200121343136,
+ "routers_loss": 0.006025883834809065,
"skip_count": 0.0,
"step": 874,
"text_loss": 0.47571972012519836
@@ -8320,13 +8320,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.09765625,
"learning_rate": 0.0009966357939736692,
- "loss": 0.0417,
+ "loss": 0.0416,
"macro_f1": 0.3272727429866791,
"num_tokens": 1410723.0,
"repeat_count": 0.0,
- "routers_loss": 0.02768322452902794,
+ "routers_loss": 0.025964925065636635,
"skip_count": 0.0,
"step": 876,
"text_loss": 0.4964611530303955
@@ -8339,13 +8339,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1025390625,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009965998542514065,
- "loss": 0.0419,
+ "loss": 0.0415,
"macro_f1": 0.32098764181137085,
"num_tokens": 1414008.0,
"repeat_count": 0.0,
- "routers_loss": 0.09382032603025436,
+ "routers_loss": 0.09509637206792831,
"skip_count": 2.0,
"step": 878,
"text_loss": 0.621494710445404
@@ -8358,32 +8358,32 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009965637242311427,
- "loss": 0.0466,
+ "loss": 0.0472,
"macro_f1": 0.542222261428833,
"num_tokens": 1417447.0,
"repeat_count": 0.0,
- "routers_loss": 0.026867631822824478,
+ "routers_loss": 0.02520318515598774,
"skip_count": 4.0,
"step": 880,
"text_loss": 0.40209758281707764
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6666666865348816,
- "avg_layers": 24.0,
+ "acc_skip": 0.5,
+ "avg_layers": 25.0,
"epoch": 4.14088641033167,
- "f1_execute": 0.95652174949646,
+ "f1_execute": 0.936170220375061,
"f1_repeat": 0.0,
- "f1_skip": 0.800000011920929,
- "grad_norm": 0.26171875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000996527403926723,
- "loss": 0.0496,
- "macro_f1": 0.5855072736740112,
+ "loss": 0.0495,
+ "macro_f1": 0.5342789888381958,
"num_tokens": 1419905.0,
"repeat_count": 0.0,
- "routers_loss": 0.12731307744979858,
+ "routers_loss": 0.13183781504631042,
"skip_count": 6.0,
"step": 882,
"text_loss": 0.642185389995575
@@ -8396,13 +8396,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009964908933520655,
- "loss": 0.039,
+ "loss": 0.0375,
"macro_f1": 0.3333333432674408,
"num_tokens": 1423436.0,
"repeat_count": 0.0,
- "routers_loss": 0.008483970537781715,
+ "routers_loss": 0.009429510682821274,
"skip_count": 0.0,
"step": 884,
"text_loss": 0.48232755064964294
@@ -8415,13 +8415,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.18359375,
+ "grad_norm": 0.1669921875,
"learning_rate": 0.0009964541925211613,
- "loss": 0.0348,
+ "loss": 0.0349,
"macro_f1": 0.32098764181137085,
"num_tokens": 1426842.0,
"repeat_count": 0.0,
- "routers_loss": 0.07847871631383896,
+ "routers_loss": 0.07629609107971191,
"skip_count": 2.0,
"step": 886,
"text_loss": 0.16620934009552002
@@ -8434,13 +8434,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0009964173014480738,
- "loss": 0.036,
+ "loss": 0.0348,
"macro_f1": 0.5492662787437439,
"num_tokens": 1430430.0,
"repeat_count": 0.0,
- "routers_loss": 0.04574459046125412,
+ "routers_loss": 0.036814019083976746,
"skip_count": 2.0,
"step": 888,
"text_loss": 0.4866008758544922
@@ -8453,13 +8453,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10595703125,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009963802201469398,
- "loss": 0.0485,
+ "loss": 0.0476,
"macro_f1": 0.3333333432674408,
"num_tokens": 1433821.0,
"repeat_count": 0.0,
- "routers_loss": 0.004683624487370253,
+ "routers_loss": 0.0041250260546803474,
"skip_count": 0.0,
"step": 890,
"text_loss": 0.578216552734375
@@ -8472,13 +8472,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "grad_norm": 0.2373046875,
"learning_rate": 0.0009963429486319693,
- "loss": 0.0476,
+ "loss": 0.0463,
"macro_f1": 0.32098764181137085,
"num_tokens": 1436976.0,
"repeat_count": 0.0,
- "routers_loss": 0.06499828398227692,
+ "routers_loss": 0.06213559955358505,
"skip_count": 2.0,
"step": 892,
"text_loss": 0.221701517701149
@@ -8486,18 +8486,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
- "avg_layers": 25.0,
+ "avg_layers": 26.0,
"epoch": 4.197240974464338,
- "f1_execute": 0.9411764740943909,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.4000000059604645,
- "grad_norm": 0.310546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.361328125,
"learning_rate": 0.0009963054869174446,
- "loss": 0.0326,
- "macro_f1": 0.44705885648727417,
+ "loss": 0.0313,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 1440397.0,
"repeat_count": 0.0,
- "routers_loss": 0.08285653591156006,
+ "routers_loss": 0.07532428950071335,
"skip_count": 2.0,
"step": 894,
"text_loss": 0.6922838091850281
@@ -8510,13 +8510,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.0009962678350177209,
- "loss": 0.0497,
+ "loss": 0.0472,
"macro_f1": 0.3272727429866791,
"num_tokens": 1443604.0,
"repeat_count": 0.0,
- "routers_loss": 0.04252336546778679,
+ "routers_loss": 0.0419243648648262,
"skip_count": 1.0,
"step": 896,
"text_loss": 0.22092342376708984
@@ -8524,18 +8524,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 4.216025829175227,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009962299929472268,
- "loss": 0.0349,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.034,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 1446257.0,
"repeat_count": 2.0,
- "routers_loss": 0.126711905002594,
+ "routers_loss": 0.10849297791719437,
"skip_count": 0.0,
"step": 898,
"text_loss": 0.26394811272621155
@@ -8548,13 +8548,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.000996191960720463,
- "loss": 0.0392,
+ "loss": 0.0394,
"macro_f1": 0.3333333432674408,
"num_tokens": 1449669.0,
"repeat_count": 0.0,
- "routers_loss": 0.00955706462264061,
+ "routers_loss": 0.0092767970636487,
"skip_count": 0.0,
"step": 900,
"text_loss": 0.5338577628135681
@@ -8567,13 +8567,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009961537383520042,
- "loss": 0.0377,
+ "loss": 0.0354,
"macro_f1": 0.3272727429866791,
"num_tokens": 1452450.0,
"repeat_count": 1.0,
- "routers_loss": 0.03127318620681763,
+ "routers_loss": 0.02985367365181446,
"skip_count": 0.0,
"step": 902,
"text_loss": 0.5875228047370911
@@ -8586,13 +8586,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.0009961153258564966,
- "loss": 0.0389,
+ "loss": 0.0378,
"macro_f1": 0.3144654333591461,
"num_tokens": 1456909.0,
"repeat_count": 0.0,
- "routers_loss": 0.06743519753217697,
+ "routers_loss": 0.06794842332601547,
"skip_count": 3.0,
"step": 904,
"text_loss": 0.40959444642066956
@@ -8605,13 +8605,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009960767232486604,
- "loss": 0.0477,
+ "loss": 0.0476,
"macro_f1": 0.3333333432674408,
"num_tokens": 1461712.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025313226506114006,
+ "routers_loss": 0.0023562447167932987,
"skip_count": 0.0,
"step": 906,
"text_loss": 0.3932875096797943
@@ -8624,13 +8624,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.08203125,
"learning_rate": 0.000996037930543288,
- "loss": 0.052,
+ "loss": 0.0505,
"macro_f1": 0.3272727429866791,
"num_tokens": 1464817.0,
"repeat_count": 0.0,
- "routers_loss": 0.037147488445043564,
+ "routers_loss": 0.03880339860916138,
"skip_count": 1.0,
"step": 908,
"text_loss": 0.17482402920722961
@@ -8643,13 +8643,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2119140625,
"learning_rate": 0.000995998947755245,
- "loss": 0.0501,
+ "loss": 0.0479,
"macro_f1": 0.3272727429866791,
"num_tokens": 1467810.0,
"repeat_count": 0.0,
- "routers_loss": 0.021232586354017258,
+ "routers_loss": 0.01736828312277794,
"skip_count": 1.0,
"step": 910,
"text_loss": 0.4140470325946808
@@ -8662,13 +8662,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009959597748994695,
- "loss": 0.0759,
+ "loss": 0.0752,
"macro_f1": 0.3333333432674408,
"num_tokens": 1470802.0,
"repeat_count": 0.0,
- "routers_loss": 0.010563847608864307,
+ "routers_loss": 0.011824851855635643,
"skip_count": 0.0,
"step": 912,
"text_loss": 0.7153383493423462
@@ -8681,13 +8681,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009959204119909726,
- "loss": 0.0425,
+ "loss": 0.0421,
"macro_f1": 0.3272727429866791,
"num_tokens": 1474539.0,
"repeat_count": 0.0,
- "routers_loss": 0.0267612524330616,
+ "routers_loss": 0.025456594303250313,
"skip_count": 0.0,
"step": 914,
"text_loss": 0.42812058329582214
@@ -8700,13 +8700,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009958808590448385,
- "loss": 0.0501,
+ "loss": 0.0489,
"macro_f1": 0.3333333432674408,
"num_tokens": 1477552.0,
"repeat_count": 0.0,
- "routers_loss": 0.005838244222104549,
+ "routers_loss": 0.006795851048082113,
"skip_count": 0.0,
"step": 916,
"text_loss": 0.5402814149856567
@@ -8719,13 +8719,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009958411160762234,
- "loss": 0.0383,
+ "loss": 0.039,
"macro_f1": 0.3333333432674408,
"num_tokens": 1482547.0,
"repeat_count": 0.0,
- "routers_loss": 0.014642171561717987,
+ "routers_loss": 0.015615932643413544,
"skip_count": 0.0,
"step": 918,
"text_loss": 0.3836168050765991
@@ -8738,32 +8738,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009958011831003577,
- "loss": 0.0457,
+ "loss": 0.0448,
"macro_f1": 0.3272727429866791,
"num_tokens": 1485807.0,
"repeat_count": 0.0,
- "routers_loss": 0.04119620472192764,
+ "routers_loss": 0.043541423976421356,
"skip_count": 1.0,
"step": 920,
"text_loss": 0.4333936274051666
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 4.328734957440563,
- "f1_execute": 0.943396270275116,
- "f1_repeat": 0.0,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.000995761060132543,
- "loss": 0.0433,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0418,
+ "macro_f1": 0.6538461446762085,
"num_tokens": 1488941.0,
"repeat_count": 1.0,
- "routers_loss": 0.06713195145130157,
+ "routers_loss": 0.05866432189941406,
"skip_count": 2.0,
"step": 922,
"text_loss": 0.4106994867324829
@@ -8776,13 +8776,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009957207471881552,
- "loss": 0.0533,
+ "loss": 0.0531,
"macro_f1": 0.5492662787437439,
"num_tokens": 1492026.0,
"repeat_count": 0.0,
- "routers_loss": 0.024023180827498436,
+ "routers_loss": 0.02714901603758335,
"skip_count": 2.0,
"step": 924,
"text_loss": 0.542091429233551
@@ -8795,13 +8795,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.17578125,
+ "grad_norm": 0.1796875,
"learning_rate": 0.0009956802442826415,
- "loss": 0.0373,
+ "loss": 0.0386,
"macro_f1": 0.3272727429866791,
"num_tokens": 1494543.0,
"repeat_count": 1.0,
- "routers_loss": 0.05399841442704201,
+ "routers_loss": 0.0563737191259861,
"skip_count": 0.0,
"step": 926,
"text_loss": 0.47209203243255615
@@ -8814,13 +8814,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009956395514315235,
- "loss": 0.0488,
+ "loss": 0.0496,
"macro_f1": 0.3272727429866791,
"num_tokens": 1497831.0,
"repeat_count": 1.0,
- "routers_loss": 0.0299264844506979,
+ "routers_loss": 0.03285066783428192,
"skip_count": 0.0,
"step": 928,
"text_loss": 0.6628931164741516
@@ -8833,13 +8833,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009955986686503943,
- "loss": 0.0467,
+ "loss": 0.0466,
"macro_f1": 0.3272727429866791,
"num_tokens": 1501375.0,
"repeat_count": 0.0,
- "routers_loss": 0.023478010669350624,
+ "routers_loss": 0.024297121912240982,
"skip_count": 1.0,
"step": 930,
"text_loss": 0.495676189661026
@@ -8852,13 +8852,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 1.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009955575959549202,
- "loss": 0.0447,
+ "loss": 0.0424,
"macro_f1": 0.7795917987823486,
"num_tokens": 1504363.0,
"repeat_count": 1.0,
- "routers_loss": 0.12116194516420364,
+ "routers_loss": 0.12196464836597443,
"skip_count": 4.0,
"step": 932,
"text_loss": 0.26123273372650146
@@ -8871,13 +8871,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.0009955163333608408,
- "loss": 0.053,
+ "loss": 0.0538,
"macro_f1": 0.3333333432674408,
"num_tokens": 1507178.0,
"repeat_count": 0.0,
- "routers_loss": 0.011879723519086838,
+ "routers_loss": 0.012947078794240952,
"skip_count": 0.0,
"step": 934,
"text_loss": 0.32552677392959595
@@ -8890,13 +8890,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009954748808839674,
- "loss": 0.0373,
+ "loss": 0.0379,
"macro_f1": 0.3333333432674408,
"num_tokens": 1509910.0,
"repeat_count": 0.0,
- "routers_loss": 0.009245929308235645,
+ "routers_loss": 0.008946365676820278,
"skip_count": 0.0,
"step": 936,
"text_loss": 0.533141016960144
@@ -8909,13 +8909,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.140625,
"learning_rate": 0.000995433238540185,
- "loss": 0.0461,
+ "loss": 0.0466,
"macro_f1": 0.6538461446762085,
"num_tokens": 1512826.0,
"repeat_count": 1.0,
- "routers_loss": 0.032464127987623215,
+ "routers_loss": 0.029975678771734238,
"skip_count": 1.0,
"step": 938,
"text_loss": 0.2953577935695648
@@ -8928,13 +8928,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009953914063454512,
- "loss": 0.0515,
+ "loss": 0.0497,
"macro_f1": 0.3144654333591461,
"num_tokens": 1517230.0,
"repeat_count": 1.0,
- "routers_loss": 0.08835392445325851,
+ "routers_loss": 0.0889134630560875,
"skip_count": 2.0,
"step": 940,
"text_loss": 0.5368834733963013
@@ -8947,13 +8947,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.193359375,
"learning_rate": 0.000995349384315796,
- "loss": 0.0405,
+ "loss": 0.0413,
"macro_f1": 0.3333333432674408,
"num_tokens": 1519876.0,
"repeat_count": 0.0,
- "routers_loss": 0.014307246543467045,
+ "routers_loss": 0.013458753935992718,
"skip_count": 0.0,
"step": 942,
"text_loss": 0.2005518227815628
@@ -8966,13 +8966,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.000995307172467322,
- "loss": 0.0449,
+ "loss": 0.0444,
"macro_f1": 0.31446540355682373,
"num_tokens": 1522998.0,
"repeat_count": 1.0,
- "routers_loss": 0.10261563211679459,
+ "routers_loss": 0.08850377053022385,
"skip_count": 1.0,
"step": 944,
"text_loss": 0.227926567196846
@@ -8985,13 +8985,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009952647708162054,
- "loss": 0.0507,
+ "loss": 0.0503,
"macro_f1": 0.3272727429866791,
"num_tokens": 1527100.0,
"repeat_count": 0.0,
- "routers_loss": 0.03316422924399376,
+ "routers_loss": 0.03199794515967369,
"skip_count": 1.0,
"step": 946,
"text_loss": 0.4859686493873596
@@ -9004,13 +9004,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009952221793786942,
- "loss": 0.0352,
+ "loss": 0.0354,
"macro_f1": 0.3333333432674408,
"num_tokens": 1530028.0,
"repeat_count": 0.0,
- "routers_loss": 0.00902469176799059,
+ "routers_loss": 0.006507779937237501,
"skip_count": 0.0,
"step": 948,
"text_loss": 0.6855354905128479
@@ -9023,13 +9023,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.0009951793981711097,
- "loss": 0.0581,
+ "loss": 0.0584,
"macro_f1": 0.6538461446762085,
"num_tokens": 1533254.0,
"repeat_count": 1.0,
- "routers_loss": 0.06710167229175568,
+ "routers_loss": 0.06175103038549423,
"skip_count": 1.0,
"step": 950,
"text_loss": 0.7590400576591492
@@ -9042,13 +9042,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009951364272098458,
- "loss": 0.0294,
+ "loss": 0.0295,
"macro_f1": 0.5492662787437439,
"num_tokens": 1536239.0,
"repeat_count": 0.0,
- "routers_loss": 0.04208769276738167,
+ "routers_loss": 0.03773383051156998,
"skip_count": 2.0,
"step": 952,
"text_loss": 0.669784665107727
@@ -9061,13 +9061,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009950932665113688,
- "loss": 0.0505,
+ "loss": 0.0507,
"macro_f1": 0.32098764181137085,
"num_tokens": 1539682.0,
"repeat_count": 0.0,
- "routers_loss": 0.06530380249023438,
+ "routers_loss": 0.07280613481998444,
"skip_count": 2.0,
"step": 954,
"text_loss": 0.3365570902824402
@@ -9080,13 +9080,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009950499160922184,
- "loss": 0.0545,
+ "loss": 0.0541,
"macro_f1": 0.3333333432674408,
"num_tokens": 1542875.0,
"repeat_count": 0.0,
- "routers_loss": 0.01803453080356121,
+ "routers_loss": 0.01770266517996788,
"skip_count": 0.0,
"step": 956,
"text_loss": 0.0921545997262001
@@ -9099,13 +9099,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.09375,
"learning_rate": 0.000995006375969006,
- "loss": 0.0481,
+ "loss": 0.0473,
"macro_f1": 0.3272727429866791,
"num_tokens": 1547135.0,
"repeat_count": 1.0,
- "routers_loss": 0.08461762219667435,
+ "routers_loss": 0.07672002166509628,
"skip_count": 0.0,
"step": 958,
"text_loss": 0.5887606739997864
@@ -9120,11 +9120,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1376953125,
"learning_rate": 0.0009949626461584165,
- "loss": 0.0441,
+ "loss": 0.043,
"macro_f1": 0.3333333432674408,
"num_tokens": 1550100.0,
"repeat_count": 0.0,
- "routers_loss": 0.007111486047506332,
+ "routers_loss": 0.006247182376682758,
"skip_count": 0.0,
"step": 960,
"text_loss": 0.5777931213378906
@@ -9137,13 +9137,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.119140625,
"learning_rate": 0.0009949187266772076,
- "loss": 0.0361,
+ "loss": 0.0366,
"macro_f1": 0.5492662787437439,
"num_tokens": 1553192.0,
"repeat_count": 0.0,
- "routers_loss": 0.029776185750961304,
+ "routers_loss": 0.030319908633828163,
"skip_count": 2.0,
"step": 962,
"text_loss": 0.2370252162218094
@@ -9156,13 +9156,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009948746175422088,
- "loss": 0.0506,
+ "loss": 0.0511,
"macro_f1": 0.3333333432674408,
"num_tokens": 1556318.0,
"repeat_count": 0.0,
- "routers_loss": 0.007108999416232109,
+ "routers_loss": 0.006004320923238993,
"skip_count": 0.0,
"step": 964,
"text_loss": 0.6271032094955444
@@ -9175,13 +9175,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000994830318770323,
- "loss": 0.0498,
+ "loss": 0.0514,
"macro_f1": 0.3333333432674408,
"num_tokens": 1559195.0,
"repeat_count": 0.0,
- "routers_loss": 0.01126947533339262,
+ "routers_loss": 0.011544366367161274,
"skip_count": 0.0,
"step": 966,
"text_loss": 0.47256720066070557
@@ -9194,13 +9194,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009947858303785255,
- "loss": 0.0366,
+ "loss": 0.0374,
"macro_f1": 0.6603773832321167,
"num_tokens": 1561813.0,
"repeat_count": 1.0,
- "routers_loss": 0.05142999067902565,
+ "routers_loss": 0.05258861929178238,
"skip_count": 1.0,
"step": 968,
"text_loss": 0.7703132629394531
@@ -9213,13 +9213,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.0009947411523838648,
- "loss": 0.0461,
+ "loss": 0.0453,
"macro_f1": 0.3333333432674408,
"num_tokens": 1564634.0,
"repeat_count": 0.0,
- "routers_loss": 0.010770819149911404,
+ "routers_loss": 0.011216280050575733,
"skip_count": 0.0,
"step": 970,
"text_loss": 0.4666804075241089
@@ -9232,13 +9232,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009946962848034608,
- "loss": 0.0692,
+ "loss": 0.0696,
"macro_f1": 0.3333333432674408,
"num_tokens": 1567959.0,
"repeat_count": 0.0,
- "routers_loss": 0.008775795809924603,
+ "routers_loss": 0.009387624450027943,
"skip_count": 0.0,
"step": 972,
"text_loss": 0.4067264199256897
@@ -9251,13 +9251,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.203125,
"learning_rate": 0.0009946512276545075,
- "loss": 0.0403,
+ "loss": 0.0397,
"macro_f1": 0.3272727429866791,
"num_tokens": 1571221.0,
"repeat_count": 1.0,
- "routers_loss": 0.05100395902991295,
+ "routers_loss": 0.041713520884513855,
"skip_count": 0.0,
"step": 974,
"text_loss": 0.5242366194725037
@@ -9270,13 +9270,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.228515625,
"learning_rate": 0.0009946059809542705,
- "loss": 0.0503,
+ "loss": 0.0487,
"macro_f1": 0.7644445300102234,
"num_tokens": 1575033.0,
"repeat_count": 2.0,
- "routers_loss": 0.06653711199760437,
+ "routers_loss": 0.05748331546783447,
"skip_count": 2.0,
"step": 976,
"text_loss": 0.5704690217971802
@@ -9284,18 +9284,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 4.591722923393014,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0009945605447200887,
- "loss": 0.0435,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0445,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1579050.0,
"repeat_count": 0.0,
- "routers_loss": 0.009865665808320045,
+ "routers_loss": 0.016765203326940536,
"skip_count": 0.0,
"step": 978,
"text_loss": 0.4804173707962036
@@ -9308,13 +9308,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.0009945149189693732,
- "loss": 0.0399,
+ "loss": 0.0406,
"macro_f1": 0.5492662787437439,
"num_tokens": 1582967.0,
"repeat_count": 0.0,
- "routers_loss": 0.021175632253289223,
+ "routers_loss": 0.021518222987651825,
"skip_count": 2.0,
"step": 980,
"text_loss": 0.4138598144054413
@@ -9327,32 +9327,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0009944691037196078,
- "loss": 0.0472,
+ "loss": 0.0456,
"macro_f1": 0.3333333432674408,
"num_tokens": 1586282.0,
"repeat_count": 0.0,
- "routers_loss": 0.011803832836449146,
+ "routers_loss": 0.012246460653841496,
"skip_count": 0.0,
"step": 982,
"text_loss": 0.22561736404895782
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 0.5,
"acc_skip": 0.800000011920929,
- "avg_layers": 23.0,
+ "avg_layers": 24.0,
"epoch": 4.6199002054593485,
- "f1_execute": 0.9090908765792847,
- "f1_repeat": 0.0,
+ "f1_execute": 0.930232584476471,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 0.8000000715255737,
- "grad_norm": 0.142578125,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009944230989883491,
- "loss": 0.0467,
- "macro_f1": 0.5696970224380493,
+ "loss": 0.0456,
+ "macro_f1": 0.7989664077758789,
"num_tokens": 1589279.0,
"repeat_count": 2.0,
- "routers_loss": 0.08856551349163055,
+ "routers_loss": 0.09344895929098129,
"skip_count": 5.0,
"step": 984,
"text_loss": 0.4416656494140625
@@ -9365,13 +9365,13 @@
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.111328125,
"learning_rate": 0.0009943769047932264,
- "loss": 0.0413,
+ "loss": 0.0404,
"macro_f1": 0.5359477400779724,
"num_tokens": 1592398.0,
"repeat_count": 2.0,
- "routers_loss": 0.08593414723873138,
+ "routers_loss": 0.08916857838630676,
"skip_count": 2.0,
"step": 986,
"text_loss": 0.5536438822746277
@@ -9384,13 +9384,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000994330521151941,
- "loss": 0.0399,
+ "loss": 0.039,
"macro_f1": 0.32098764181137085,
"num_tokens": 1596213.0,
"repeat_count": 1.0,
- "routers_loss": 0.07049509882926941,
+ "routers_loss": 0.06114347651600838,
"skip_count": 1.0,
"step": 988,
"text_loss": 0.5835405588150024
@@ -9403,13 +9403,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.1953125,
"learning_rate": 0.000994283948082267,
- "loss": 0.0595,
+ "loss": 0.0573,
"macro_f1": 0.3333333432674408,
"num_tokens": 1598827.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019258069805800915,
+ "routers_loss": 0.0017335431184619665,
"skip_count": 0.0,
"step": 990,
"text_loss": 0.5857380032539368
@@ -9422,13 +9422,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009942371856020522,
- "loss": 0.0335,
+ "loss": 0.0341,
"macro_f1": 0.3333333432674408,
"num_tokens": 1602915.0,
"repeat_count": 0.0,
- "routers_loss": 0.014094089157879353,
+ "routers_loss": 0.014606470242142677,
"skip_count": 0.0,
"step": 992,
"text_loss": 0.6939892768859863
@@ -9436,18 +9436,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 31.0,
"epoch": 4.666862342236572,
- "f1_execute": 0.9583333134651184,
+ "f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009941902337292155,
- "loss": 0.0603,
- "macro_f1": 0.6527777910232544,
+ "loss": 0.06,
+ "macro_f1": 0.6598639488220215,
"num_tokens": 1605776.0,
"repeat_count": 3.0,
- "routers_loss": 0.06360147893428802,
+ "routers_loss": 0.06297315657138824,
"skip_count": 1.0,
"step": 994,
"text_loss": 0.37616831064224243
@@ -9460,13 +9460,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009941430924817487,
- "loss": 0.0573,
+ "loss": 0.0572,
"macro_f1": 0.5492662787437439,
"num_tokens": 1609856.0,
"repeat_count": 0.0,
- "routers_loss": 0.0326208658516407,
+ "routers_loss": 0.03297794610261917,
"skip_count": 2.0,
"step": 996,
"text_loss": 0.2098303586244583
@@ -9479,13 +9479,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09912109375,
+ "grad_norm": 0.10107421875,
"learning_rate": 0.000994095761877717,
- "loss": 0.0502,
+ "loss": 0.0499,
"macro_f1": 0.3333333432674408,
"num_tokens": 1612904.0,
"repeat_count": 0.0,
- "routers_loss": 0.012660752050578594,
+ "routers_loss": 0.012901155278086662,
"skip_count": 0.0,
"step": 998,
"text_loss": 0.20103533565998077
@@ -9498,13 +9498,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.265625,
+ "grad_norm": 0.259765625,
"learning_rate": 0.000994048241935257,
- "loss": 0.0537,
+ "loss": 0.0535,
"macro_f1": 0.3272727429866791,
"num_tokens": 1615540.0,
"repeat_count": 0.0,
- "routers_loss": 0.021756287664175034,
+ "routers_loss": 0.020434845238924026,
"skip_count": 0.0,
"step": 1000,
"text_loss": 0.32709044218063354
@@ -9512,37 +9512,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 4.70443205165835,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1669921875,
"learning_rate": 0.0009940005326725789,
- "loss": 0.0447,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.0453,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 1618786.0,
"repeat_count": 0.0,
- "routers_loss": 0.07292548567056656,
+ "routers_loss": 0.07831378281116486,
"skip_count": 2.0,
"step": 1002,
"text_loss": 0.5789632797241211
},
{
- "acc_repeat": 0.5,
+ "acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 4.713824479013795,
- "f1_execute": 0.9811320900917053,
- "f1_repeat": 0.6666666865348816,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1787109375,
+ "grad_norm": 0.21875,
"learning_rate": 0.0009939526341079647,
- "loss": 0.0505,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0511,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 1621736.0,
"repeat_count": 2.0,
- "routers_loss": 0.03397528454661369,
+ "routers_loss": 0.04863874986767769,
"skip_count": 0.0,
"step": 1004,
"text_loss": 0.6128849387168884
@@ -9555,13 +9555,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009939045462597693,
- "loss": 0.0544,
+ "loss": 0.0538,
"macro_f1": 0.3333333432674408,
"num_tokens": 1624649.0,
"repeat_count": 0.0,
- "routers_loss": 0.005987613927572966,
+ "routers_loss": 0.00677989237010479,
"skip_count": 0.0,
"step": 1006,
"text_loss": 0.6168264150619507
@@ -9574,13 +9574,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009938562691464202,
- "loss": 0.0522,
+ "loss": 0.0524,
"macro_f1": 0.3333333432674408,
"num_tokens": 1627700.0,
"repeat_count": 0.0,
- "routers_loss": 0.021656684577465057,
+ "routers_loss": 0.019490402191877365,
"skip_count": 0.0,
"step": 1008,
"text_loss": 0.17463822662830353
@@ -9593,32 +9593,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.000993807802786417,
- "loss": 0.0487,
+ "loss": 0.0475,
"macro_f1": 0.3333333432674408,
"num_tokens": 1630714.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014992234064266086,
+ "routers_loss": 0.0019022391643375158,
"skip_count": 0.0,
"step": 1010,
"text_loss": 0.5675593018531799
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.5,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 4.751394188435574,
- "f1_execute": 0.9411764740943909,
- "f1_repeat": 0.0,
+ "f1_execute": 0.9599999785423279,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.158203125,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009937591471983322,
- "loss": 0.0491,
- "macro_f1": 0.5359477400779724,
+ "loss": 0.0501,
+ "macro_f1": 0.7644444704055786,
"num_tokens": 1633770.0,
"repeat_count": 1.0,
- "routers_loss": 0.03448791801929474,
+ "routers_loss": 0.042485643178224564,
"skip_count": 2.0,
"step": 1012,
"text_loss": 0.42387229204177856
@@ -9631,13 +9631,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0009937103024008109,
- "loss": 0.0541,
+ "loss": 0.0545,
"macro_f1": 0.3272727429866791,
"num_tokens": 1637120.0,
"repeat_count": 0.0,
- "routers_loss": 0.08285929262638092,
+ "routers_loss": 0.09427817165851593,
"skip_count": 1.0,
"step": 1014,
"text_loss": 0.49511051177978516
@@ -9650,13 +9650,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.125,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009936612684125702,
- "loss": 0.0515,
+ "loss": 0.0503,
"macro_f1": 0.3333333432674408,
"num_tokens": 1640165.0,
"repeat_count": 0.0,
- "routers_loss": 0.00486504752188921,
+ "routers_loss": 0.005106127820909023,
"skip_count": 0.0,
"step": 1016,
"text_loss": 0.5398799180984497
@@ -9669,13 +9669,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.2734375,
"learning_rate": 0.0009936120452524004,
- "loss": 0.051,
+ "loss": 0.0506,
"macro_f1": 0.3333333432674408,
"num_tokens": 1643251.0,
"repeat_count": 0.0,
- "routers_loss": 0.017805909737944603,
+ "routers_loss": 0.016914300620555878,
"skip_count": 0.0,
"step": 1018,
"text_loss": 0.20882178843021393
@@ -9688,13 +9688,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009935626329391637,
- "loss": 0.0547,
+ "loss": 0.0537,
"macro_f1": 0.32098764181137085,
"num_tokens": 1646560.0,
"repeat_count": 0.0,
- "routers_loss": 0.12958799302577972,
+ "routers_loss": 0.13481520116329193,
"skip_count": 2.0,
"step": 1020,
"text_loss": 0.5719883441925049
@@ -9707,13 +9707,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009935130314917948,
- "loss": 0.0595,
+ "loss": 0.0602,
"macro_f1": 0.5492662787437439,
"num_tokens": 1649538.0,
"repeat_count": 0.0,
- "routers_loss": 0.07447081059217453,
+ "routers_loss": 0.07700438797473907,
"skip_count": 2.0,
"step": 1022,
"text_loss": 0.1303367167711258
@@ -9726,13 +9726,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009934632409293015,
- "loss": 0.0619,
+ "loss": 0.0611,
"macro_f1": 0.32098764181137085,
"num_tokens": 1652397.0,
"repeat_count": 1.0,
- "routers_loss": 0.12529553472995758,
+ "routers_loss": 0.11416907608509064,
"skip_count": 1.0,
"step": 1024,
"text_loss": 0.24076920747756958
@@ -9745,13 +9745,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.306640625,
"learning_rate": 0.0009934132612707631,
- "loss": 0.0491,
+ "loss": 0.0507,
"macro_f1": 0.31446540355682373,
"num_tokens": 1654938.0,
"repeat_count": 0.0,
- "routers_loss": 0.08664281666278839,
+ "routers_loss": 0.09484589844942093,
"skip_count": 2.0,
"step": 1026,
"text_loss": 0.1652517318725586
@@ -9764,13 +9764,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009933630925353324,
- "loss": 0.0394,
+ "loss": 0.0395,
"macro_f1": 0.3333333432674408,
"num_tokens": 1658536.0,
"repeat_count": 0.0,
- "routers_loss": 0.0067965323105454445,
+ "routers_loss": 0.00741987070068717,
"skip_count": 0.0,
"step": 1028,
"text_loss": 0.49296700954437256
@@ -9783,13 +9783,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1845703125,
"learning_rate": 0.0009933127347422337,
- "loss": 0.0607,
+ "loss": 0.0602,
"macro_f1": 0.32098764181137085,
"num_tokens": 1661446.0,
"repeat_count": 0.0,
- "routers_loss": 0.08319470286369324,
+ "routers_loss": 0.08399344235658646,
"skip_count": 2.0,
"step": 1030,
"text_loss": 0.22363591194152832
@@ -9802,13 +9802,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.158203125,
"learning_rate": 0.0009932621879107648,
- "loss": 0.0476,
+ "loss": 0.0475,
"macro_f1": 0.3333333432674408,
"num_tokens": 1664612.0,
"repeat_count": 0.0,
- "routers_loss": 0.002826537238433957,
+ "routers_loss": 0.0031781597062945366,
"skip_count": 0.0,
"step": 1032,
"text_loss": 0.36083245277404785
@@ -9823,11 +9823,11 @@
"f1_skip": 0.0,
"grad_norm": 0.2275390625,
"learning_rate": 0.000993211452060295,
- "loss": 0.0431,
+ "loss": 0.042,
"macro_f1": 0.3272727429866791,
"num_tokens": 1667467.0,
"repeat_count": 0.0,
- "routers_loss": 0.03491095453500748,
+ "routers_loss": 0.03595469892024994,
"skip_count": 1.0,
"step": 1034,
"text_loss": 0.16372856497764587
@@ -9840,13 +9840,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.173828125,
+ "grad_norm": 0.189453125,
"learning_rate": 0.000993160527210266,
- "loss": 0.0616,
+ "loss": 0.061,
"macro_f1": 0.3144654333591461,
"num_tokens": 1670675.0,
"repeat_count": 3.0,
- "routers_loss": 0.1828247457742691,
+ "routers_loss": 0.1597205102443695,
"skip_count": 0.0,
"step": 1036,
"text_loss": 0.6049913763999939
@@ -9859,13 +9859,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2099609375,
+ "grad_norm": 0.2197265625,
"learning_rate": 0.000993109413380193,
- "loss": 0.0563,
+ "loss": 0.0562,
"macro_f1": 0.3333333432674408,
"num_tokens": 1673477.0,
"repeat_count": 0.0,
- "routers_loss": 0.010931054130196571,
+ "routers_loss": 0.009756010957062244,
"skip_count": 0.0,
"step": 1038,
"text_loss": 0.7034620642662048
@@ -9878,13 +9878,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.158203125,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.0009930581105896624,
- "loss": 0.0569,
+ "loss": 0.0559,
"macro_f1": 0.3272727429866791,
"num_tokens": 1676809.0,
"repeat_count": 0.0,
- "routers_loss": 0.023222090676426888,
+ "routers_loss": 0.020718922838568687,
"skip_count": 0.0,
"step": 1040,
"text_loss": 0.2814720571041107
@@ -9897,13 +9897,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.0009930066188583338,
- "loss": 0.0453,
+ "loss": 0.0445,
"macro_f1": 0.32098764181137085,
"num_tokens": 1679398.0,
"repeat_count": 1.0,
- "routers_loss": 0.07085686922073364,
+ "routers_loss": 0.04755603149533272,
"skip_count": 1.0,
"step": 1042,
"text_loss": 0.5445759296417236
@@ -9916,13 +9916,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.126953125,
"learning_rate": 0.0009929549382059388,
- "loss": 0.0515,
+ "loss": 0.0509,
"macro_f1": 0.3333333432674408,
"num_tokens": 1682269.0,
"repeat_count": 0.0,
- "routers_loss": 0.010158216580748558,
+ "routers_loss": 0.01040949858725071,
"skip_count": 0.0,
"step": 1044,
"text_loss": 0.2876914143562317
@@ -9935,13 +9935,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009929030686522816,
- "loss": 0.0372,
+ "loss": 0.0363,
"macro_f1": 0.3333333432674408,
"num_tokens": 1685428.0,
"repeat_count": 0.0,
- "routers_loss": 0.007876895368099213,
+ "routers_loss": 0.008158888667821884,
"skip_count": 0.0,
"step": 1046,
"text_loss": 0.49053525924682617
@@ -9954,13 +9954,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009928510102172386,
- "loss": 0.0501,
+ "loss": 0.0498,
"macro_f1": 0.3333333432674408,
"num_tokens": 1688252.0,
"repeat_count": 0.0,
- "routers_loss": 0.004859173204749823,
+ "routers_loss": 0.005102572031319141,
"skip_count": 0.0,
"step": 1048,
"text_loss": 0.5274341106414795
@@ -9973,13 +9973,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.17578125,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0009927987629207587,
- "loss": 0.0582,
+ "loss": 0.0564,
"macro_f1": 0.3333333432674408,
"num_tokens": 1691289.0,
"repeat_count": 0.0,
- "routers_loss": 0.01798083633184433,
+ "routers_loss": 0.016768503934144974,
"skip_count": 0.0,
"step": 1050,
"text_loss": 0.9935035109519958
@@ -9987,18 +9987,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 4.939242735544467,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009927463267828634,
"loss": 0.0488,
- "macro_f1": 0.3272727429866791,
+ "macro_f1": 0.3333333432674408,
"num_tokens": 1694148.0,
"repeat_count": 0.0,
- "routers_loss": 0.014295363798737526,
+ "routers_loss": 0.010905829258263111,
"skip_count": 0.0,
"step": 1052,
"text_loss": 0.20895758271217346
@@ -10011,13 +10011,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.000992693701823646,
- "loss": 0.0635,
+ "loss": 0.0624,
"macro_f1": 0.3272727429866791,
"num_tokens": 1698543.0,
"repeat_count": 1.0,
- "routers_loss": 0.1038367822766304,
+ "routers_loss": 0.10533971339464188,
"skip_count": 0.0,
"step": 1054,
"text_loss": 0.5776236653327942
@@ -10030,13 +10030,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.255859375,
"learning_rate": 0.0009926408880632726,
- "loss": 0.057,
+ "loss": 0.0556,
"macro_f1": 0.3272727429866791,
"num_tokens": 1702460.0,
"repeat_count": 0.0,
- "routers_loss": 0.029780643060803413,
+ "routers_loss": 0.026313411071896553,
"skip_count": 1.0,
"step": 1056,
"text_loss": 0.34990596771240234
@@ -10049,13 +10049,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10107421875,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0009925878855219818,
- "loss": 0.0398,
+ "loss": 0.0391,
"macro_f1": 0.3333333432674408,
"num_tokens": 1705686.0,
"repeat_count": 0.0,
- "routers_loss": 0.008537676185369492,
+ "routers_loss": 0.007763393223285675,
"skip_count": 0.0,
"step": 1058,
"text_loss": 0.4980163276195526
@@ -10068,13 +10068,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.171875,
+ "grad_norm": 0.177734375,
"learning_rate": 0.000992534694220084,
- "loss": 0.0617,
+ "loss": 0.0613,
"macro_f1": 0.3272727429866791,
"num_tokens": 1708739.0,
"repeat_count": 0.0,
- "routers_loss": 0.03966755419969559,
+ "routers_loss": 0.03998444974422455,
"skip_count": 1.0,
"step": 1060,
"text_loss": 0.29092350602149963
@@ -10087,13 +10087,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1484375,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.000992481314177962,
- "loss": 0.0311,
+ "loss": 0.0312,
"macro_f1": 0.32098764181137085,
"num_tokens": 1711903.0,
"repeat_count": 1.0,
- "routers_loss": 0.06651833653450012,
+ "routers_loss": 0.06966045498847961,
"skip_count": 1.0,
"step": 1062,
"text_loss": 0.6267179250717163
@@ -10106,13 +10106,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2431640625,
+ "grad_norm": 0.244140625,
"learning_rate": 0.0009924277454160717,
- "loss": 0.0557,
+ "loss": 0.0548,
"macro_f1": 0.3272727429866791,
"num_tokens": 1715974.0,
"repeat_count": 0.0,
- "routers_loss": 0.05130369961261749,
+ "routers_loss": 0.05536063387989998,
"skip_count": 1.0,
"step": 1064,
"text_loss": 0.5813798904418945
@@ -10125,13 +10125,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009923739879549402,
- "loss": 0.0435,
+ "loss": 0.0423,
"macro_f1": 0.3333333432674408,
"num_tokens": 1718828.0,
"repeat_count": 0.0,
- "routers_loss": 0.020534176379442215,
+ "routers_loss": 0.020993782207369804,
"skip_count": 0.0,
"step": 1066,
"text_loss": 0.22665327787399292
@@ -10144,13 +10144,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009923200418151677,
- "loss": 0.0305,
+ "loss": 0.0301,
"macro_f1": 0.3333333432674408,
"num_tokens": 1722419.0,
"repeat_count": 0.0,
- "routers_loss": 0.007514918688684702,
+ "routers_loss": 0.007351701147854328,
"skip_count": 0.0,
"step": 1068,
"text_loss": 0.5796169638633728
@@ -10163,13 +10163,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009922659070174264,
- "loss": 0.0461,
+ "loss": 0.0452,
"macro_f1": 0.3272727429866791,
"num_tokens": 1725663.0,
"repeat_count": 1.0,
- "routers_loss": 0.024598751217126846,
+ "routers_loss": 0.026033315807580948,
"skip_count": 0.0,
"step": 1070,
"text_loss": 0.25742828845977783
@@ -10182,32 +10182,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009922115835824612,
- "loss": 0.0408,
+ "loss": 0.041,
"macro_f1": 0.3333333432674408,
"num_tokens": 1729239.0,
"repeat_count": 0.0,
- "routers_loss": 0.011866633780300617,
+ "routers_loss": 0.0118600158020854,
"skip_count": 0.0,
"step": 1072,
"text_loss": 0.21630282700061798
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 5.042265923099501,
- "f1_execute": 0.9818181991577148,
- "f1_repeat": 0.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009921570715310884,
- "loss": 0.036,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0364,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 1732507.0,
"repeat_count": 1.0,
- "routers_loss": 0.01755746826529503,
+ "routers_loss": 0.016118815168738365,
"skip_count": 0.0,
"step": 1074,
"text_loss": 0.5639925003051758
@@ -10220,13 +10220,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009921023708841974,
- "loss": 0.0415,
+ "loss": 0.0407,
"macro_f1": 0.3333333432674408,
"num_tokens": 1736182.0,
"repeat_count": 0.0,
- "routers_loss": 0.003976983483880758,
+ "routers_loss": 0.004275390412658453,
"skip_count": 0.0,
"step": 1076,
"text_loss": 0.5758615136146545
@@ -10239,13 +10239,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009920474816627496,
- "loss": 0.0378,
+ "loss": 0.037,
"macro_f1": 0.3333333432674408,
"num_tokens": 1739559.0,
"repeat_count": 0.0,
- "routers_loss": 0.013548235408961773,
+ "routers_loss": 0.01299292128533125,
"skip_count": 0.0,
"step": 1078,
"text_loss": 0.18221625685691833
@@ -10258,13 +10258,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009919924038877788,
"loss": 0.0343,
"macro_f1": 0.32098764181137085,
"num_tokens": 1742890.0,
"repeat_count": 0.0,
- "routers_loss": 0.03923165053129196,
+ "routers_loss": 0.038295745849609375,
"skip_count": 2.0,
"step": 1080,
"text_loss": 0.17354349792003632
@@ -10277,13 +10277,13 @@
"f1_execute": 0.9583333134651184,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009919371375803905,
- "loss": 0.0464,
+ "loss": 0.0455,
"macro_f1": 0.8194444179534912,
"num_tokens": 1746433.0,
"repeat_count": 2.0,
- "routers_loss": 0.046429626643657684,
+ "routers_loss": 0.04052971675992012,
"skip_count": 3.0,
"step": 1082,
"text_loss": 0.2250112146139145
@@ -10296,13 +10296,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1025390625,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009918816827617632,
- "loss": 0.0346,
+ "loss": 0.0353,
"macro_f1": 0.3333333432674408,
"num_tokens": 1750802.0,
"repeat_count": 0.0,
- "routers_loss": 0.008998732082545757,
+ "routers_loss": 0.009114136919379234,
"skip_count": 0.0,
"step": 1084,
"text_loss": 0.2526719272136688
@@ -10315,13 +10315,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.000991826039453147,
- "loss": 0.0386,
+ "loss": 0.0392,
"macro_f1": 0.3333333432674408,
"num_tokens": 1754272.0,
"repeat_count": 0.0,
- "routers_loss": 0.005173585377633572,
+ "routers_loss": 0.004904678091406822,
"skip_count": 0.0,
"step": 1086,
"text_loss": 0.7308789491653442
@@ -10334,13 +10334,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.138671875,
"learning_rate": 0.000991770207675865,
- "loss": 0.0308,
+ "loss": 0.0327,
"macro_f1": 0.6666666865348816,
"num_tokens": 1757231.0,
"repeat_count": 0.0,
- "routers_loss": 0.024098891764879227,
+ "routers_loss": 0.02129189297556877,
"skip_count": 2.0,
"step": 1088,
"text_loss": 0.21764220297336578
@@ -10353,13 +10353,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009917141874513113,
"loss": 0.0315,
"macro_f1": 0.3333333432674408,
"num_tokens": 1760003.0,
"repeat_count": 0.0,
- "routers_loss": 0.014002764597535133,
+ "routers_loss": 0.01310618408024311,
"skip_count": 0.0,
"step": 1090,
"text_loss": 0.33892181515693665
@@ -10372,32 +10372,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009916579788009537,
- "loss": 0.0462,
+ "loss": 0.0457,
"macro_f1": 0.5492662787437439,
"num_tokens": 1763052.0,
"repeat_count": 0.0,
- "routers_loss": 0.017871137708425522,
+ "routers_loss": 0.02059309557080269,
"skip_count": 2.0,
"step": 1092,
"text_loss": 0.6551769375801086
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.136190196653947,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1044921875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10546875,
"learning_rate": 0.0009916015817463312,
"loss": 0.0385,
- "macro_f1": 0.32098764181137085,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1766655.0,
"repeat_count": 0.0,
- "routers_loss": 0.033123619854450226,
+ "routers_loss": 0.0274797435849905,
"skip_count": 2.0,
"step": 1094,
"text_loss": 0.3984372019767761
@@ -10410,13 +10410,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.000991544996309055,
- "loss": 0.0267,
+ "loss": 0.0271,
"macro_f1": 0.3333333432674408,
"num_tokens": 1769997.0,
"repeat_count": 0.0,
- "routers_loss": 0.01279227901250124,
+ "routers_loss": 0.01437368243932724,
"skip_count": 0.0,
"step": 1096,
"text_loss": 0.4203338921070099
@@ -10429,13 +10429,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.000991488222510809,
- "loss": 0.0295,
+ "loss": 0.0292,
"macro_f1": 0.3333333432674408,
"num_tokens": 1773130.0,
"repeat_count": 0.0,
- "routers_loss": 0.001354650012217462,
+ "routers_loss": 0.001382062560878694,
"skip_count": 0.0,
"step": 1098,
"text_loss": 0.43132516741752625
@@ -10448,13 +10448,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.123046875,
"learning_rate": 0.000991431260373349,
- "loss": 0.0326,
+ "loss": 0.0329,
"macro_f1": 0.3144654333591461,
"num_tokens": 1775682.0,
"repeat_count": 1.0,
- "routers_loss": 0.1097714751958847,
+ "routers_loss": 0.1115434318780899,
"skip_count": 2.0,
"step": 1100,
"text_loss": 0.3218227028846741
@@ -10467,13 +10467,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.111328125,
"learning_rate": 0.000991374109918503,
- "loss": 0.0187,
+ "loss": 0.0185,
"macro_f1": 0.3333333432674408,
"num_tokens": 1778407.0,
"repeat_count": 0.0,
- "routers_loss": 0.009649592451751232,
+ "routers_loss": 0.009529678151011467,
"skip_count": 0.0,
"step": 1102,
"text_loss": 0.17183731496334076
@@ -10486,13 +10486,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.11083984375,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.000991316771168171,
- "loss": 0.0447,
+ "loss": 0.044,
"macro_f1": 0.5492662787437439,
"num_tokens": 1781518.0,
"repeat_count": 0.0,
- "routers_loss": 0.020858706906437874,
+ "routers_loss": 0.018668074160814285,
"skip_count": 2.0,
"step": 1104,
"text_loss": 1.1324785947799683
@@ -10505,13 +10505,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.125,
"learning_rate": 0.0009912592441443258,
- "loss": 0.0428,
+ "loss": 0.0411,
"macro_f1": 0.3272727429866791,
"num_tokens": 1784878.0,
"repeat_count": 0.0,
- "routers_loss": 0.048101235181093216,
+ "routers_loss": 0.04145100712776184,
"skip_count": 1.0,
"step": 1106,
"text_loss": 0.6082063317298889
@@ -10524,13 +10524,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009912015288690112,
- "loss": 0.0435,
+ "loss": 0.0421,
"macro_f1": 0.3272727429866791,
"num_tokens": 1788978.0,
"repeat_count": 0.0,
- "routers_loss": 0.02875671721994877,
+ "routers_loss": 0.021450644358992577,
"skip_count": 1.0,
"step": 1108,
"text_loss": 0.5597621202468872
@@ -10543,13 +10543,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08349609375,
+ "grad_norm": 0.083984375,
"learning_rate": 0.0009911436253643444,
- "loss": 0.0247,
+ "loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 1792321.0,
"repeat_count": 0.0,
- "routers_loss": 0.019005145877599716,
+ "routers_loss": 0.017405325546860695,
"skip_count": 0.0,
"step": 1110,
"text_loss": 0.2560598850250244
@@ -10562,13 +10562,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.0009910855336525137,
- "loss": 0.0393,
+ "loss": 0.0383,
"macro_f1": 0.3333333432674408,
"num_tokens": 1795182.0,
"repeat_count": 0.0,
- "routers_loss": 0.007238700054585934,
+ "routers_loss": 0.007162237539887428,
"skip_count": 0.0,
"step": 1112,
"text_loss": 0.3438240587711334
@@ -10581,13 +10581,13 @@
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.125,
+ "grad_norm": 0.115234375,
"learning_rate": 0.00099102725375578,
"loss": 0.0326,
"macro_f1": 0.480392187833786,
"num_tokens": 1798987.0,
"repeat_count": 1.0,
- "routers_loss": 0.12206140905618668,
+ "routers_loss": 0.11149197816848755,
"skip_count": 3.0,
"step": 1114,
"text_loss": 0.20455503463745117
@@ -10595,18 +10595,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 5.239506897563839,
- "f1_execute": 0.8799999952316284,
+ "f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.0009909687856964767,
- "loss": 0.0366,
- "macro_f1": 0.29333335161209106,
+ "loss": 0.035,
+ "macro_f1": 0.3006536364555359,
"num_tokens": 1802064.0,
"repeat_count": 2.0,
- "routers_loss": 0.15721899271011353,
+ "routers_loss": 0.12679415941238403,
"skip_count": 3.0,
"step": 1116,
"text_loss": 0.11996729671955109
@@ -10619,32 +10619,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.125,
+ "grad_norm": 0.12451171875,
"learning_rate": 0.0009909101294970082,
- "loss": 0.0366,
+ "loss": 0.0365,
"macro_f1": 0.5492662787437439,
"num_tokens": 1805412.0,
"repeat_count": 0.0,
- "routers_loss": 0.05058665946125984,
+ "routers_loss": 0.05108053982257843,
"skip_count": 2.0,
"step": 1118,
"text_loss": 0.13224145770072937
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 5.258291752274729,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.123046875,
"learning_rate": 0.0009908512851798522,
- "loss": 0.0454,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0455,
+ "macro_f1": 0.6603773832321167,
"num_tokens": 1808196.0,
"repeat_count": 1.0,
- "routers_loss": 0.023021472617983818,
+ "routers_loss": 0.02131766639649868,
"skip_count": 1.0,
"step": 1120,
"text_loss": 0.7824069261550903
@@ -10657,13 +10657,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1435546875,
+ "grad_norm": 0.138671875,
"learning_rate": 0.0009907922527675576,
- "loss": 0.0409,
+ "loss": 0.0405,
"macro_f1": 0.3333333432674408,
"num_tokens": 1811622.0,
"repeat_count": 0.0,
- "routers_loss": 0.006660689599812031,
+ "routers_loss": 0.006226244382560253,
"skip_count": 0.0,
"step": 1122,
"text_loss": 0.5419743061065674
@@ -10676,13 +10676,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.12890625,
"learning_rate": 0.000990733032282746,
- "loss": 0.0547,
+ "loss": 0.0535,
"macro_f1": 0.5492662787437439,
"num_tokens": 1814628.0,
"repeat_count": 0.0,
- "routers_loss": 0.031727343797683716,
+ "routers_loss": 0.03088250942528248,
"skip_count": 2.0,
"step": 1124,
"text_loss": 0.37100958824157715
@@ -10695,13 +10695,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.000990673623748111,
- "loss": 0.0351,
+ "loss": 0.0348,
"macro_f1": 0.32098767161369324,
"num_tokens": 1817205.0,
"repeat_count": 0.0,
- "routers_loss": 0.06140992045402527,
+ "routers_loss": 0.05495348572731018,
"skip_count": 1.0,
"step": 1126,
"text_loss": 0.20241330564022064
@@ -10709,18 +10709,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
- "avg_layers": 25.0,
+ "avg_layers": 26.0,
"epoch": 5.295861461696507,
- "f1_execute": 0.9411764740943909,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.4000000059604645,
- "grad_norm": 0.09814453125,
+ "f1_skip": 0.5,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0009906140271864173,
- "loss": 0.0436,
- "macro_f1": 0.44705885648727417,
+ "loss": 0.0433,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 1820141.0,
"repeat_count": 0.0,
- "routers_loss": 0.03872275352478027,
+ "routers_loss": 0.037809282541275024,
"skip_count": 2.0,
"step": 1128,
"text_loss": 0.32965806126594543
@@ -10728,18 +10728,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 5.305253889051952,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09228515625,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009905542426205032,
- "loss": 0.0353,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0348,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 1824011.0,
"repeat_count": 0.0,
- "routers_loss": 0.031013142317533493,
+ "routers_loss": 0.03320181369781494,
"skip_count": 1.0,
"step": 1130,
"text_loss": 0.36329755187034607
@@ -10752,13 +10752,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009904942700732777,
- "loss": 0.0333,
+ "loss": 0.0335,
"macro_f1": 0.3333333432674408,
"num_tokens": 1826873.0,
"repeat_count": 0.0,
- "routers_loss": 0.004357635974884033,
+ "routers_loss": 0.004102326463907957,
"skip_count": 0.0,
"step": 1132,
"text_loss": 0.6692602038383484
@@ -10771,13 +10771,13 @@
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11279296875,
+ "grad_norm": 0.08544921875,
"learning_rate": 0.0009904341095677226,
"loss": 0.03,
"macro_f1": 0.29333335161209106,
"num_tokens": 1830103.0,
"repeat_count": 2.0,
- "routers_loss": 0.2376353144645691,
+ "routers_loss": 0.2376193106174469,
"skip_count": 4.0,
"step": 1134,
"text_loss": 0.19212862849235535
@@ -10790,13 +10790,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10888671875,
+ "grad_norm": 0.119140625,
"learning_rate": 0.0009903737611268919,
- "loss": 0.0446,
+ "loss": 0.0445,
"macro_f1": 0.3333333432674408,
"num_tokens": 1833201.0,
"repeat_count": 0.0,
- "routers_loss": 0.004978097043931484,
+ "routers_loss": 0.005253395065665245,
"skip_count": 0.0,
"step": 1136,
"text_loss": 0.6773360371589661
@@ -10809,13 +10809,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009903132247739107,
- "loss": 0.0309,
+ "loss": 0.0305,
"macro_f1": 0.3076923191547394,
"num_tokens": 1836045.0,
"repeat_count": 1.0,
- "routers_loss": 0.14195409417152405,
+ "routers_loss": 0.14382585883140564,
"skip_count": 3.0,
"step": 1138,
"text_loss": 0.2882297933101654
@@ -10828,13 +10828,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.15234375,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009902525005319766,
- "loss": 0.0403,
+ "loss": 0.04,
"macro_f1": 0.5427350401878357,
"num_tokens": 1839721.0,
"repeat_count": 1.0,
- "routers_loss": 0.04005253314971924,
+ "routers_loss": 0.04033960774540901,
"skip_count": 2.0,
"step": 1140,
"text_loss": 0.7172559499740601
@@ -10847,13 +10847,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.12109375,
"learning_rate": 0.0009901915884243597,
- "loss": 0.0353,
+ "loss": 0.0351,
"macro_f1": 0.6666666865348816,
"num_tokens": 1842614.0,
"repeat_count": 1.0,
- "routers_loss": 0.006839688867330551,
+ "routers_loss": 0.005162308923900127,
"skip_count": 0.0,
"step": 1142,
"text_loss": 0.42892804741859436
@@ -10866,13 +10866,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.0009901304884744014,
- "loss": 0.0396,
+ "loss": 0.0386,
"macro_f1": 0.3144654333591461,
"num_tokens": 1845444.0,
"repeat_count": 1.0,
- "routers_loss": 0.10174567997455597,
+ "routers_loss": 0.10117656737565994,
"skip_count": 2.0,
"step": 1144,
"text_loss": 0.20806430280208588
@@ -10885,13 +10885,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009900692007055152,
- "loss": 0.0365,
+ "loss": 0.0357,
"macro_f1": 0.3333333432674408,
"num_tokens": 1848558.0,
"repeat_count": 0.0,
- "routers_loss": 0.014655748382210732,
+ "routers_loss": 0.014107038266956806,
"skip_count": 0.0,
"step": 1146,
"text_loss": 0.5355974435806274
@@ -10904,13 +10904,13 @@
"f1_execute": 0.9166666865348816,
"f1_repeat": 0.4000000059604645,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.158203125,
+ "grad_norm": 0.16015625,
"learning_rate": 0.000990007725141187,
- "loss": 0.0467,
+ "loss": 0.0449,
"macro_f1": 0.6611111164093018,
"num_tokens": 1852723.0,
"repeat_count": 4.0,
- "routers_loss": 0.16960746049880981,
+ "routers_loss": 0.15537866950035095,
"skip_count": 2.0,
"step": 1148,
"text_loss": 0.6388513445854187
@@ -10923,32 +10923,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1220703125,
+ "grad_norm": 0.1181640625,
"learning_rate": 0.0009899460618049741,
- "loss": 0.0399,
+ "loss": 0.0397,
"macro_f1": 0.3333333432674408,
"num_tokens": 1856181.0,
"repeat_count": 0.0,
- "routers_loss": 0.011591178365051746,
+ "routers_loss": 0.011800912208855152,
"skip_count": 0.0,
"step": 1150,
"text_loss": 0.6113069653511047
},
{
- "acc_repeat": 0.5,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 30.0,
"epoch": 5.408570589961843,
- "f1_execute": 0.9811320900917053,
- "f1_repeat": 0.6666666865348816,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09912109375,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.000989884210720506,
- "loss": 0.0332,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0331,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 1859685.0,
"repeat_count": 2.0,
- "routers_loss": 0.04036068916320801,
+ "routers_loss": 0.022900646552443504,
"skip_count": 0.0,
"step": 1152,
"text_loss": 0.25718021392822266
@@ -10961,13 +10961,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009898221719114844,
- "loss": 0.0366,
+ "loss": 0.0354,
"macro_f1": 0.3272727429866791,
"num_tokens": 1862505.0,
"repeat_count": 0.0,
- "routers_loss": 0.030165785923600197,
+ "routers_loss": 0.026814989745616913,
"skip_count": 1.0,
"step": 1154,
"text_loss": 0.5426549911499023
@@ -10980,13 +10980,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009897599454016823,
- "loss": 0.0421,
+ "loss": 0.0401,
"macro_f1": 0.3333333432674408,
"num_tokens": 1866266.0,
"repeat_count": 0.0,
- "routers_loss": 0.003615695284679532,
+ "routers_loss": 0.0032623792067170143,
"skip_count": 0.0,
"step": 1156,
"text_loss": 0.37752896547317505
@@ -10999,13 +10999,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.07080078125,
"learning_rate": 0.0009896975312149454,
- "loss": 0.0377,
+ "loss": 0.0369,
"macro_f1": 0.3333333432674408,
"num_tokens": 1870216.0,
"repeat_count": 0.0,
- "routers_loss": 0.01679840311408043,
+ "routers_loss": 0.015617577359080315,
"skip_count": 0.0,
"step": 1158,
"text_loss": 0.18207129836082458
@@ -11018,13 +11018,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009896349293751906,
- "loss": 0.0422,
+ "loss": 0.0423,
"macro_f1": 0.3272727429866791,
"num_tokens": 1873338.0,
"repeat_count": 0.0,
- "routers_loss": 0.024936161935329437,
+ "routers_loss": 0.02250153198838234,
"skip_count": 1.0,
"step": 1160,
"text_loss": 0.548884391784668
@@ -11037,13 +11037,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009895721399064072,
- "loss": 0.0407,
+ "loss": 0.0388,
"macro_f1": 0.32098764181137085,
"num_tokens": 1876470.0,
"repeat_count": 1.0,
- "routers_loss": 0.06472968310117722,
+ "routers_loss": 0.055204521864652634,
"skip_count": 1.0,
"step": 1162,
"text_loss": 0.48052409291267395
@@ -11056,13 +11056,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009895091628326564,
- "loss": 0.031,
+ "loss": 0.0293,
"macro_f1": 0.3333333432674408,
"num_tokens": 1879354.0,
"repeat_count": 0.0,
- "routers_loss": 0.009633494541049004,
+ "routers_loss": 0.009093789383769035,
"skip_count": 0.0,
"step": 1164,
"text_loss": 0.3908069431781769
@@ -11075,13 +11075,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.140625,
"learning_rate": 0.000989445998178071,
"loss": 0.0323,
"macro_f1": 0.3272727429866791,
"num_tokens": 1881941.0,
"repeat_count": 0.0,
- "routers_loss": 0.01458993274718523,
+ "routers_loss": 0.015086972154676914,
"skip_count": 1.0,
"step": 1166,
"text_loss": 0.4884725511074066
@@ -11094,13 +11094,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009893826459668558,
- "loss": 0.0389,
+ "loss": 0.0386,
"macro_f1": 0.3144654333591461,
"num_tokens": 1885374.0,
"repeat_count": 0.0,
- "routers_loss": 0.06636982411146164,
+ "routers_loss": 0.06587666273117065,
"skip_count": 3.0,
"step": 1168,
"text_loss": 0.12760137021541595
@@ -11113,13 +11113,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0009893191062232873,
- "loss": 0.0325,
+ "loss": 0.0322,
"macro_f1": 0.3333333432674408,
"num_tokens": 1888612.0,
"repeat_count": 0.0,
- "routers_loss": 0.005644182674586773,
+ "routers_loss": 0.006088624242693186,
"skip_count": 0.0,
"step": 1170,
"text_loss": 0.4821319580078125
@@ -11132,13 +11132,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.0009892553789717143,
- "loss": 0.0402,
+ "loss": 0.0389,
"macro_f1": 0.3333333432674408,
"num_tokens": 1891463.0,
"repeat_count": 0.0,
- "routers_loss": 0.010273848660290241,
+ "routers_loss": 0.010113578289747238,
"skip_count": 0.0,
"step": 1172,
"text_loss": 0.3613642454147339
@@ -11151,13 +11151,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009891914642365573,
- "loss": 0.0415,
+ "loss": 0.0404,
"macro_f1": 0.3333333432674408,
"num_tokens": 1894230.0,
"repeat_count": 0.0,
- "routers_loss": 0.004529652185738087,
+ "routers_loss": 0.004947459790855646,
"skip_count": 0.0,
"step": 1174,
"text_loss": 0.5037549138069153
@@ -11170,13 +11170,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.0009891273620423083,
- "loss": 0.045,
+ "loss": 0.0428,
"macro_f1": 0.3272727429866791,
"num_tokens": 1897294.0,
"repeat_count": 1.0,
- "routers_loss": 0.024671228602528572,
+ "routers_loss": 0.026075217872858047,
"skip_count": 0.0,
"step": 1176,
"text_loss": 0.32558977603912354
@@ -11189,13 +11189,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009890630724135314,
- "loss": 0.0354,
+ "loss": 0.0351,
"macro_f1": 0.3272727429866791,
"num_tokens": 1901553.0,
"repeat_count": 0.0,
- "routers_loss": 0.06466450542211533,
+ "routers_loss": 0.06650999188423157,
"skip_count": 1.0,
"step": 1178,
"text_loss": 0.23473620414733887
@@ -11208,13 +11208,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1767578125,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009889985953748625,
- "loss": 0.0278,
+ "loss": 0.0268,
"macro_f1": 0.6666666865348816,
"num_tokens": 1904556.0,
"repeat_count": 0.0,
- "routers_loss": 0.010566026903688908,
+ "routers_loss": 0.010361116379499435,
"skip_count": 1.0,
"step": 1180,
"text_loss": 0.6927042007446289
@@ -11227,13 +11227,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.103515625,
"learning_rate": 0.0009889339309510094,
- "loss": 0.037,
+ "loss": 0.0351,
"macro_f1": 0.3333333432674408,
"num_tokens": 1908053.0,
"repeat_count": 0.0,
- "routers_loss": 0.013842248357832432,
+ "routers_loss": 0.013286533765494823,
"skip_count": 0.0,
"step": 1182,
"text_loss": 0.19977325201034546
@@ -11246,13 +11246,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.5,
- "grad_norm": 0.07373046875,
+ "grad_norm": 0.058837890625,
"learning_rate": 0.0009888690791667518,
- "loss": 0.0215,
+ "loss": 0.0204,
"macro_f1": 0.7018141150474548,
"num_tokens": 1911754.0,
"repeat_count": 2.0,
- "routers_loss": 0.122759610414505,
+ "routers_loss": 0.11920545995235443,
"skip_count": 3.0,
"step": 1184,
"text_loss": 0.4072858691215515
@@ -11265,32 +11265,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009888040400469408,
- "loss": 0.0402,
+ "loss": 0.0391,
"macro_f1": 0.3272727429866791,
"num_tokens": 1914862.0,
"repeat_count": 0.0,
- "routers_loss": 0.035315629094839096,
+ "routers_loss": 0.03652849420905113,
"skip_count": 1.0,
"step": 1186,
"text_loss": 0.2654043138027191
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.577634282359847,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.0009887388136164996,
- "loss": 0.034,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0336,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1918542.0,
"repeat_count": 0.0,
- "routers_loss": 0.040048226714134216,
+ "routers_loss": 0.03991910070180893,
"skip_count": 2.0,
"step": 1188,
"text_loss": 0.21130657196044922
@@ -11298,18 +11298,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 5.587026709715292,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.09521484375,
"learning_rate": 0.000988673399900423,
- "loss": 0.044,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0429,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1921589.0,
"repeat_count": 0.0,
- "routers_loss": 0.012814820744097233,
+ "routers_loss": 0.014900135807693005,
"skip_count": 0.0,
"step": 1190,
"text_loss": 0.5519335865974426
@@ -11322,13 +11322,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009886077989237777,
- "loss": 0.0407,
+ "loss": 0.0405,
"macro_f1": 0.3272727429866791,
"num_tokens": 1924320.0,
"repeat_count": 0.0,
- "routers_loss": 0.05977959558367729,
+ "routers_loss": 0.06271552294492722,
"skip_count": 1.0,
"step": 1192,
"text_loss": 0.213813915848732
@@ -11341,13 +11341,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.0,
"f1_skip": 0.888888955116272,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.1875,
"learning_rate": 0.000988542010711702,
- "loss": 0.0334,
+ "loss": 0.0342,
"macro_f1": 0.6225374937057495,
"num_tokens": 1927178.0,
"repeat_count": 0.0,
- "routers_loss": 0.031448643654584885,
+ "routers_loss": 0.03081391751766205,
"skip_count": 5.0,
"step": 1194,
"text_loss": 0.7524349093437195
@@ -11360,13 +11360,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.265625,
+ "grad_norm": 0.255859375,
"learning_rate": 0.0009884760352894064,
- "loss": 0.0523,
+ "loss": 0.0518,
"macro_f1": 0.3333333432674408,
"num_tokens": 1930216.0,
"repeat_count": 0.0,
- "routers_loss": 0.008164947852492332,
+ "routers_loss": 0.008556773886084557,
"skip_count": 0.0,
"step": 1196,
"text_loss": 0.28230375051498413
@@ -11379,32 +11379,32 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.5,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.1064453125,
"learning_rate": 0.0009884098726821726,
- "loss": 0.0478,
+ "loss": 0.0472,
"macro_f1": 0.4871794879436493,
"num_tokens": 1933312.0,
"repeat_count": 3.0,
- "routers_loss": 0.04045635461807251,
+ "routers_loss": 0.05344727262854576,
"skip_count": 0.0,
"step": 1198,
"text_loss": 0.5509607195854187
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6666666865348816,
- "avg_layers": 26.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 5.633988846492516,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
- "f1_skip": 0.800000011920929,
- "grad_norm": 0.1240234375,
+ "f1_skip": 0.5,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.000988343522915354,
- "loss": 0.0447,
- "macro_f1": 0.5866667032241821,
+ "loss": 0.0441,
+ "macro_f1": 0.480392187833786,
"num_tokens": 1936160.0,
"repeat_count": 1.0,
- "routers_loss": 0.06872973591089249,
+ "routers_loss": 0.07324771583080292,
"skip_count": 3.0,
"step": 1200,
"text_loss": 0.30565372109413147
@@ -11412,18 +11412,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
- "avg_layers": 24.0,
+ "avg_layers": 25.0,
"epoch": 5.64338127384796,
- "f1_execute": 0.8695651888847351,
+ "f1_execute": 0.8936169743537903,
"f1_repeat": 0.0,
- "f1_skip": 0.4000000059604645,
- "grad_norm": 0.25390625,
+ "f1_skip": 0.444444477558136,
+ "grad_norm": 0.2470703125,
"learning_rate": 0.0009882769860143764,
- "loss": 0.0331,
- "macro_f1": 0.4231884181499481,
+ "loss": 0.0317,
+ "macro_f1": 0.4460204839706421,
"num_tokens": 1939266.0,
"repeat_count": 0.0,
- "routers_loss": 0.20964151620864868,
+ "routers_loss": 0.18620699644088745,
"skip_count": 6.0,
"step": 1202,
"text_loss": 0.976121723651886
@@ -11442,26 +11442,26 @@
"macro_f1": 0.6666666865348816,
"num_tokens": 1942173.0,
"repeat_count": 0.0,
- "routers_loss": 0.00690250750631094,
+ "routers_loss": 0.007703613489866257,
"skip_count": 1.0,
"step": 1204,
"text_loss": 0.5647401809692383
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.66216612855885,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009881433509120036,
- "loss": 0.0372,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0376,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1945071.0,
"repeat_count": 0.0,
- "routers_loss": 0.022315658628940582,
+ "routers_loss": 0.02162683941423893,
"skip_count": 2.0,
"step": 1206,
"text_loss": 0.24229218065738678
@@ -11474,13 +11474,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1083984375,
+ "grad_norm": 0.0966796875,
"learning_rate": 0.0009880762527618176,
- "loss": 0.0388,
+ "loss": 0.0383,
"macro_f1": 0.3333333432674408,
"num_tokens": 1949060.0,
"repeat_count": 0.0,
- "routers_loss": 0.017015069723129272,
+ "routers_loss": 0.017667081207036972,
"skip_count": 0.0,
"step": 1208,
"text_loss": 0.4035970866680145
@@ -11493,13 +11493,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009880089675798908,
- "loss": 0.0372,
+ "loss": 0.0367,
"macro_f1": 0.3333333432674408,
"num_tokens": 1951698.0,
"repeat_count": 0.0,
- "routers_loss": 0.006532609928399324,
+ "routers_loss": 0.006405784282833338,
"skip_count": 0.0,
"step": 1210,
"text_loss": 0.5319879055023193
@@ -11512,13 +11512,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009879414953920071,
- "loss": 0.0301,
+ "loss": 0.0294,
"macro_f1": 0.3333333432674408,
"num_tokens": 1955266.0,
"repeat_count": 0.0,
- "routers_loss": 0.009720963425934315,
+ "routers_loss": 0.009859707206487656,
"skip_count": 0.0,
"step": 1212,
"text_loss": 0.6687407493591309
@@ -11531,32 +11531,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009878738362240219,
- "loss": 0.046,
+ "loss": 0.045,
"macro_f1": 0.5492662787437439,
"num_tokens": 1958538.0,
"repeat_count": 0.0,
- "routers_loss": 0.03176085278391838,
+ "routers_loss": 0.030890554189682007,
"skip_count": 2.0,
"step": 1214,
"text_loss": 0.20820017158985138
},
{
"acc_repeat": 0.5,
- "acc_skip": 0.5,
- "avg_layers": 29.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
"epoch": 5.709128265336073,
- "f1_execute": 0.9387754797935486,
+ "f1_execute": 0.9200000166893005,
"f1_repeat": 0.5,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.2021484375,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.000987805990101862,
- "loss": 0.0323,
- "macro_f1": 0.7018141150474548,
+ "loss": 0.0317,
+ "macro_f1": 0.47333335876464844,
"num_tokens": 1961419.0,
"repeat_count": 2.0,
- "routers_loss": 0.08626245707273483,
+ "routers_loss": 0.10383198410272598,
"skip_count": 2.0,
"step": 1216,
"text_loss": 0.8664976358413696
@@ -11569,13 +11569,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009877379570515268,
- "loss": 0.0374,
+ "loss": 0.0366,
"macro_f1": 0.3333333432674408,
"num_tokens": 1964836.0,
"repeat_count": 0.0,
- "routers_loss": 0.012099343352019787,
+ "routers_loss": 0.013376163318753242,
"skip_count": 0.0,
"step": 1218,
"text_loss": 0.4223395884037018
@@ -11588,13 +11588,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.0859375,
"learning_rate": 0.0009876697370990865,
- "loss": 0.0342,
+ "loss": 0.0343,
"macro_f1": 0.3333333432674408,
"num_tokens": 1967620.0,
"repeat_count": 0.0,
- "routers_loss": 0.007713846862316132,
+ "routers_loss": 0.008577900938689709,
"skip_count": 0.0,
"step": 1220,
"text_loss": 0.4789901375770569
@@ -11607,13 +11607,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009876013302706828,
- "loss": 0.0499,
+ "loss": 0.049,
"macro_f1": 0.3333333432674408,
"num_tokens": 1971100.0,
"repeat_count": 0.0,
- "routers_loss": 0.004629489034414291,
+ "routers_loss": 0.004730266984552145,
"skip_count": 0.0,
"step": 1222,
"text_loss": 0.6799837946891785
@@ -11626,13 +11626,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009875327365925295,
- "loss": 0.035,
+ "loss": 0.0341,
"macro_f1": 0.3333333432674408,
"num_tokens": 1974408.0,
"repeat_count": 0.0,
- "routers_loss": 0.010654795914888382,
+ "routers_loss": 0.010849526152014732,
"skip_count": 0.0,
"step": 1224,
"text_loss": 0.18967926502227783
@@ -11640,18 +11640,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 5.756090402113296,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009874639560909118,
- "loss": 0.0516,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.0498,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 1977046.0,
"repeat_count": 0.0,
- "routers_loss": 0.05963074415922165,
+ "routers_loss": 0.04841252416372299,
"skip_count": 1.0,
"step": 1226,
"text_loss": 0.6133310198783875
@@ -11664,13 +11664,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1328125,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.0009873949887921867,
- "loss": 0.04,
+ "loss": 0.0402,
"macro_f1": 0.3272727429866791,
"num_tokens": 1980330.0,
"repeat_count": 0.0,
- "routers_loss": 0.028920643031597137,
+ "routers_loss": 0.029638588428497314,
"skip_count": 1.0,
"step": 1228,
"text_loss": 0.15649555623531342
@@ -11678,18 +11678,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 5.774875256824186,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10595703125,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009873258347227823,
- "loss": 0.0327,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0331,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1983173.0,
"repeat_count": 0.0,
- "routers_loss": 0.006852717138826847,
+ "routers_loss": 0.009955910965800285,
"skip_count": 0.0,
"step": 1230,
"text_loss": 0.4741005599498749
@@ -11702,13 +11702,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009872564939091989,
- "loss": 0.0346,
+ "loss": 0.0342,
"macro_f1": 0.3333333432674408,
"num_tokens": 1986825.0,
"repeat_count": 0.0,
- "routers_loss": 0.010968753136694431,
+ "routers_loss": 0.010205300524830818,
"skip_count": 0.0,
"step": 1232,
"text_loss": 0.5315462350845337
@@ -11721,13 +11721,13 @@
"f1_execute": 0.9302325248718262,
"f1_repeat": 1.0,
"f1_skip": 0.7272727489471436,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.11865234375,
"learning_rate": 0.0009871869663780077,
- "loss": 0.0344,
+ "loss": 0.0336,
"macro_f1": 0.8858351111412048,
"num_tokens": 1990448.0,
"repeat_count": 1.0,
- "routers_loss": 0.0906950980424881,
+ "routers_loss": 0.09120134264230728,
"skip_count": 7.0,
"step": 1234,
"text_loss": 0.6187508702278137
@@ -11740,13 +11740,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.125,
"learning_rate": 0.0009871172521558522,
- "loss": 0.0484,
+ "loss": 0.0475,
"macro_f1": 0.6666666865348816,
"num_tokens": 1993474.0,
"repeat_count": 0.0,
- "routers_loss": 0.016306072473526,
+ "routers_loss": 0.016188839450478554,
"skip_count": 1.0,
"step": 1236,
"text_loss": 0.20783066749572754
@@ -11759,13 +11759,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.208984375,
+ "grad_norm": 0.216796875,
"learning_rate": 0.0009870473512694465,
- "loss": 0.038,
+ "loss": 0.0373,
"macro_f1": 0.5934640765190125,
"num_tokens": 1996536.0,
"repeat_count": 0.0,
- "routers_loss": 0.05804471671581268,
+ "routers_loss": 0.05046704784035683,
"skip_count": 3.0,
"step": 1238,
"text_loss": 0.247748002409935
@@ -11773,18 +11773,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 5.821837393601409,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.091796875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.0009869772637455772,
- "loss": 0.0256,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0251,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 1999530.0,
"repeat_count": 0.0,
- "routers_loss": 0.045395996421575546,
+ "routers_loss": 0.044926248490810394,
"skip_count": 2.0,
"step": 1240,
"text_loss": 0.26001980900764465
@@ -11797,13 +11797,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11767578125,
+ "grad_norm": 0.1513671875,
"learning_rate": 0.000986906989611102,
- "loss": 0.0438,
+ "loss": 0.0446,
"macro_f1": 0.3272727429866791,
"num_tokens": 2002782.0,
"repeat_count": 0.0,
- "routers_loss": 0.020834850147366524,
+ "routers_loss": 0.025911526754498482,
"skip_count": 0.0,
"step": 1242,
"text_loss": 0.9009982943534851
@@ -11816,13 +11816,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009868365288929492,
- "loss": 0.0377,
+ "loss": 0.0371,
"macro_f1": 0.3333333432674408,
"num_tokens": 2005331.0,
"repeat_count": 0.0,
- "routers_loss": 0.005241698585450649,
+ "routers_loss": 0.0043760035187006,
"skip_count": 0.0,
"step": 1244,
"text_loss": 0.5547386407852173
@@ -11835,13 +11835,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.0009867658816181206,
- "loss": 0.038,
+ "loss": 0.0374,
"macro_f1": 0.3333333432674408,
"num_tokens": 2008115.0,
"repeat_count": 0.0,
- "routers_loss": 0.008387803100049496,
+ "routers_loss": 0.009227181784808636,
"skip_count": 0.0,
"step": 1246,
"text_loss": 1.0067731142044067
@@ -11854,13 +11854,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.126953125,
"learning_rate": 0.000986695047813688,
- "loss": 0.0256,
+ "loss": 0.0261,
"macro_f1": 0.3272727429866791,
"num_tokens": 2011137.0,
"repeat_count": 1.0,
- "routers_loss": 0.02261745184659958,
+ "routers_loss": 0.023822437971830368,
"skip_count": 0.0,
"step": 1248,
"text_loss": 0.30058956146240234
@@ -11873,32 +11873,32 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009866240275067948,
- "loss": 0.0435,
+ "loss": 0.044,
"macro_f1": 0.47333335876464844,
"num_tokens": 2014159.0,
"repeat_count": 2.0,
- "routers_loss": 0.21678555011749268,
+ "routers_loss": 0.21523773670196533,
"skip_count": 3.0,
"step": 1250,
"text_loss": 0.39072203636169434
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.878191957734077,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009865528207246563,
- "loss": 0.0358,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0351,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 2017731.0,
"repeat_count": 0.0,
- "routers_loss": 0.06554054468870163,
+ "routers_loss": 0.06184682995080948,
"skip_count": 2.0,
"step": 1252,
"text_loss": 0.35751575231552124
@@ -11911,13 +11911,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.166015625,
"learning_rate": 0.000986481427494559,
- "loss": 0.0337,
+ "loss": 0.0336,
"macro_f1": 0.3333333432674408,
"num_tokens": 2020485.0,
"repeat_count": 0.0,
- "routers_loss": 0.007237187586724758,
+ "routers_loss": 0.007573372684419155,
"skip_count": 0.0,
"step": 1254,
"text_loss": 0.4061077833175659
@@ -11930,13 +11930,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1845703125,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.000986409847843861,
- "loss": 0.0387,
+ "loss": 0.0382,
"macro_f1": 0.3272727429866791,
"num_tokens": 2024149.0,
"repeat_count": 1.0,
- "routers_loss": 0.08003793656826019,
+ "routers_loss": 0.07447971403598785,
"skip_count": 0.0,
"step": 1256,
"text_loss": 0.41876497864723206
@@ -11949,13 +11949,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000986338081799992,
- "loss": 0.0341,
+ "loss": 0.0351,
"macro_f1": 0.3333333432674408,
"num_tokens": 2026545.0,
"repeat_count": 0.0,
- "routers_loss": 0.006424390245229006,
+ "routers_loss": 0.006609147880226374,
"skip_count": 0.0,
"step": 1258,
"text_loss": 0.4673794209957123
@@ -11968,13 +11968,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009862661293904523,
- "loss": 0.0482,
+ "loss": 0.0498,
"macro_f1": 0.32098764181137085,
"num_tokens": 2029581.0,
"repeat_count": 0.0,
- "routers_loss": 0.10797854512929916,
+ "routers_loss": 0.10624702274799347,
"skip_count": 2.0,
"step": 1260,
"text_loss": 0.3483233153820038
@@ -11987,13 +11987,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.111328125,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009861939906428145,
- "loss": 0.053,
+ "loss": 0.0525,
"macro_f1": 0.3333333432674408,
"num_tokens": 2033936.0,
"repeat_count": 0.0,
- "routers_loss": 0.006734046153724194,
+ "routers_loss": 0.007944886572659016,
"skip_count": 0.0,
"step": 1262,
"text_loss": 0.16362667083740234
@@ -12006,13 +12006,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009861216655847225,
- "loss": 0.0373,
+ "loss": 0.0376,
"macro_f1": 0.6666666865348816,
"num_tokens": 2037876.0,
"repeat_count": 1.0,
- "routers_loss": 0.00564212491735816,
+ "routers_loss": 0.007004092447459698,
"skip_count": 0.0,
"step": 1264,
"text_loss": 0.43228110671043396
@@ -12025,13 +12025,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1044921875,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.0009860491542438912,
- "loss": 0.0472,
+ "loss": 0.047,
"macro_f1": 0.3272727429866791,
"num_tokens": 2040842.0,
"repeat_count": 0.0,
- "routers_loss": 0.026137735694646835,
+ "routers_loss": 0.026916226372122765,
"skip_count": 1.0,
"step": 1266,
"text_loss": 0.5901188850402832
@@ -12044,13 +12044,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.000985976456648107,
- "loss": 0.0343,
+ "loss": 0.0353,
"macro_f1": 0.3333333432674408,
"num_tokens": 2043890.0,
"repeat_count": 0.0,
- "routers_loss": 0.0069669694639742374,
+ "routers_loss": 0.007325216196477413,
"skip_count": 0.0,
"step": 1268,
"text_loss": 0.8780109882354736
@@ -12063,13 +12063,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.000985903572825228,
- "loss": 0.0323,
+ "loss": 0.0306,
"macro_f1": 0.4871794879436493,
"num_tokens": 2048848.0,
"repeat_count": 0.0,
- "routers_loss": 0.05618409812450409,
+ "routers_loss": 0.05007527023553848,
"skip_count": 2.0,
"step": 1270,
"text_loss": 0.5863722562789917
@@ -12084,11 +12084,11 @@
"f1_skip": 0.0,
"grad_norm": 0.173828125,
"learning_rate": 0.000985830502803183,
- "loss": 0.0391,
+ "loss": 0.0396,
"macro_f1": 0.3272727429866791,
"num_tokens": 2051561.0,
"repeat_count": 0.0,
- "routers_loss": 0.025900620967149734,
+ "routers_loss": 0.023995524272322655,
"skip_count": 0.0,
"step": 1272,
"text_loss": 0.7460709810256958
@@ -12101,13 +12101,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.0009857572466099732,
- "loss": 0.0426,
+ "loss": 0.0431,
"macro_f1": 0.3333333432674408,
"num_tokens": 2054752.0,
"repeat_count": 0.0,
- "routers_loss": 0.006236737594008446,
+ "routers_loss": 0.006928362417966127,
"skip_count": 0.0,
"step": 1274,
"text_loss": 0.5130293369293213
@@ -12120,13 +12120,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.171875,
+ "grad_norm": 0.162109375,
"learning_rate": 0.0009856838042736698,
- "loss": 0.0503,
+ "loss": 0.0501,
"macro_f1": 0.3333333432674408,
"num_tokens": 2058151.0,
"repeat_count": 0.0,
- "routers_loss": 0.006367063149809837,
+ "routers_loss": 0.006969396956264973,
"skip_count": 0.0,
"step": 1276,
"text_loss": 0.5911393761634827
@@ -12139,13 +12139,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.0009856101758224166,
- "loss": 0.0442,
+ "loss": 0.0441,
"macro_f1": 0.3333333432674408,
"num_tokens": 2061012.0,
"repeat_count": 0.0,
- "routers_loss": 0.003392914542928338,
+ "routers_loss": 0.003499418031424284,
"skip_count": 0.0,
"step": 1278,
"text_loss": 0.25347545742988586
@@ -12158,13 +12158,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.000985536361284428,
- "loss": 0.0231,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2064597.0,
"repeat_count": 0.0,
- "routers_loss": 0.007376343477517366,
+ "routers_loss": 0.007856054231524467,
"skip_count": 0.0,
"step": 1280,
"text_loss": 0.7476963400840759
@@ -12177,13 +12177,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009854623606879898,
- "loss": 0.0243,
+ "loss": 0.0245,
"macro_f1": 0.3272727429866791,
"num_tokens": 2067972.0,
"repeat_count": 0.0,
- "routers_loss": 0.02773376554250717,
+ "routers_loss": 0.02617792971432209,
"skip_count": 1.0,
"step": 1282,
"text_loss": 0.5775872468948364
@@ -12196,13 +12196,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.000985388174061459,
- "loss": 0.0363,
+ "loss": 0.0356,
"macro_f1": 0.32098767161369324,
"num_tokens": 2071812.0,
"repeat_count": 0.0,
- "routers_loss": 0.03535797819495201,
+ "routers_loss": 0.035979997366666794,
"skip_count": 1.0,
"step": 1284,
"text_loss": 0.2933400869369507
@@ -12215,13 +12215,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08447265625,
"learning_rate": 0.0009853138014332646,
- "loss": 0.0269,
+ "loss": 0.0273,
"macro_f1": 0.3333333432674408,
"num_tokens": 2074868.0,
"repeat_count": 0.0,
- "routers_loss": 0.004910993855446577,
+ "routers_loss": 0.005142854526638985,
"skip_count": 0.0,
"step": 1286,
"text_loss": 0.29085102677345276
@@ -12234,13 +12234,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.0009852392428319058,
- "loss": 0.0301,
+ "loss": 0.0306,
"macro_f1": 0.3333333432674408,
"num_tokens": 2078225.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032444109674543142,
+ "routers_loss": 0.0032799106556922197,
"skip_count": 0.0,
"step": 1288,
"text_loss": 0.7293626070022583
@@ -12253,13 +12253,13 @@
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009851644982859537,
- "loss": 0.0272,
+ "loss": 0.0273,
"macro_f1": 0.480392187833786,
"num_tokens": 2081495.0,
"repeat_count": 1.0,
- "routers_loss": 0.12451831251382828,
+ "routers_loss": 0.12224318832159042,
"skip_count": 3.0,
"step": 1290,
"text_loss": 0.26125892996788025
@@ -12272,13 +12272,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009850895678240508,
- "loss": 0.0289,
+ "loss": 0.0283,
"macro_f1": 0.6666666865348816,
"num_tokens": 2084390.0,
"repeat_count": 1.0,
- "routers_loss": 0.011074979789555073,
+ "routers_loss": 0.010662888176739216,
"skip_count": 0.0,
"step": 1292,
"text_loss": 0.3510764539241791
@@ -12291,13 +12291,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.0009850144514749104,
- "loss": 0.0336,
+ "loss": 0.0332,
"macro_f1": 0.5492662787437439,
"num_tokens": 2087210.0,
"repeat_count": 0.0,
- "routers_loss": 0.01774786226451397,
+ "routers_loss": 0.01979079470038414,
"skip_count": 2.0,
"step": 1294,
"text_loss": 0.40202176570892334
@@ -12310,13 +12310,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.000984939149267317,
- "loss": 0.0251,
+ "loss": 0.0253,
"macro_f1": 0.6666666865348816,
"num_tokens": 2090777.0,
"repeat_count": 0.0,
- "routers_loss": 0.0052874404937028885,
+ "routers_loss": 0.005172552540898323,
"skip_count": 1.0,
"step": 1296,
"text_loss": 0.5275651216506958
@@ -12329,13 +12329,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10107421875,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009848636612301272,
- "loss": 0.031,
+ "loss": 0.0299,
"macro_f1": 0.3333333432674408,
"num_tokens": 2094248.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034106262028217316,
+ "routers_loss": 0.0029599082190543413,
"skip_count": 0.0,
"step": 1298,
"text_loss": 0.4517653286457062
@@ -12348,13 +12348,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2177734375,
+ "grad_norm": 0.23046875,
"learning_rate": 0.0009847879873922675,
"loss": 0.0357,
"macro_f1": 0.3333333432674408,
"num_tokens": 2097139.0,
"repeat_count": 0.0,
- "routers_loss": 0.010383229702711105,
+ "routers_loss": 0.011455860920250416,
"skip_count": 0.0,
"step": 1300,
"text_loss": 0.16888445615768433
@@ -12367,13 +12367,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.09619140625,
"learning_rate": 0.0009847121277827366,
- "loss": 0.0304,
+ "loss": 0.0301,
"macro_f1": 0.3333333432674408,
"num_tokens": 2100415.0,
"repeat_count": 0.0,
- "routers_loss": 0.0076674893498420715,
+ "routers_loss": 0.008091195486485958,
"skip_count": 0.0,
"step": 1302,
"text_loss": 0.40061676502227783
@@ -12386,13 +12386,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.109375,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.000984636082430604,
- "loss": 0.0287,
+ "loss": 0.0285,
"macro_f1": 0.3333333432674408,
"num_tokens": 2103285.0,
"repeat_count": 0.0,
- "routers_loss": 0.010486516170203686,
+ "routers_loss": 0.009593960829079151,
"skip_count": 0.0,
"step": 1304,
"text_loss": 0.7211073637008667
@@ -12405,13 +12405,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.107421875,
"learning_rate": 0.0009845598513650103,
- "loss": 0.0237,
+ "loss": 0.0231,
"macro_f1": 0.3333333432674408,
"num_tokens": 2106255.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023783023934811354,
+ "routers_loss": 0.0023068038281053305,
"skip_count": 0.0,
"step": 1306,
"text_loss": 0.7077119946479797
@@ -12424,13 +12424,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009844834346151674,
- "loss": 0.044,
+ "loss": 0.043,
"macro_f1": 0.3333333432674408,
"num_tokens": 2109305.0,
"repeat_count": 0.0,
- "routers_loss": 0.006714595016092062,
+ "routers_loss": 0.007703019306063652,
"skip_count": 0.0,
"step": 1308,
"text_loss": 0.3534316122531891
@@ -12443,13 +12443,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009844068322103585,
- "loss": 0.0281,
+ "loss": 0.0287,
"macro_f1": 0.3272727429866791,
"num_tokens": 2112216.0,
"repeat_count": 0.0,
- "routers_loss": 0.022373953834176064,
+ "routers_loss": 0.023549847304821014,
"skip_count": 1.0,
"step": 1310,
"text_loss": 0.6792599558830261
@@ -12462,13 +12462,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009843300441799378,
- "loss": 0.0205,
+ "loss": 0.0211,
"macro_f1": 0.3333333432674408,
"num_tokens": 2114925.0,
"repeat_count": 0.0,
- "routers_loss": 0.007452849764376879,
+ "routers_loss": 0.007605871185660362,
"skip_count": 0.0,
"step": 1312,
"text_loss": 0.1571389138698578
@@ -12481,13 +12481,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009842530705533304,
- "loss": 0.0251,
+ "loss": 0.0253,
"macro_f1": 0.3272727429866791,
"num_tokens": 2117744.0,
"repeat_count": 0.0,
- "routers_loss": 0.016413308680057526,
+ "routers_loss": 0.014964760281145573,
"skip_count": 0.0,
"step": 1314,
"text_loss": 0.7840361595153809
@@ -12500,13 +12500,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.000984175911360033,
- "loss": 0.0243,
+ "loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 2120848.0,
"repeat_count": 0.0,
- "routers_loss": 0.004676427226513624,
+ "routers_loss": 0.004663798492401838,
"skip_count": 0.0,
"step": 1316,
"text_loss": 0.536246120929718
@@ -12519,13 +12519,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.000984098566629613,
- "loss": 0.0284,
+ "loss": 0.0288,
"macro_f1": 0.5492662787437439,
"num_tokens": 2123651.0,
"repeat_count": 0.0,
- "routers_loss": 0.024454625323414803,
+ "routers_loss": 0.022852955386042595,
"skip_count": 2.0,
"step": 1318,
"text_loss": 0.43372172117233276
@@ -12538,13 +12538,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0009840210363917087,
- "loss": 0.022,
+ "loss": 0.0216,
"macro_f1": 0.3333333432674408,
"num_tokens": 2128011.0,
"repeat_count": 0.0,
- "routers_loss": 0.013495884835720062,
+ "routers_loss": 0.012578422203660011,
"skip_count": 0.0,
"step": 1320,
"text_loss": 0.28190380334854126
@@ -12557,13 +12557,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.0009839433206760306,
- "loss": 0.0213,
+ "loss": 0.0204,
"macro_f1": 0.3333333432674408,
"num_tokens": 2131035.0,
"repeat_count": 0.0,
- "routers_loss": 0.006397814955562353,
+ "routers_loss": 0.006863643880933523,
"skip_count": 0.0,
"step": 1322,
"text_loss": 0.6340444087982178
@@ -12576,13 +12576,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1796875,
"learning_rate": 0.0009838654195123589,
- "loss": 0.0246,
+ "loss": 0.0243,
"macro_f1": 0.3333333432674408,
"num_tokens": 2133856.0,
"repeat_count": 0.0,
- "routers_loss": 0.00503434706479311,
+ "routers_loss": 0.00468854233622551,
"skip_count": 0.0,
"step": 1324,
"text_loss": 0.5138425827026367
@@ -12595,13 +12595,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009837873329305458,
- "loss": 0.0402,
+ "loss": 0.0396,
"macro_f1": 0.6666666865348816,
"num_tokens": 2136451.0,
"repeat_count": 1.0,
- "routers_loss": 0.005150494631379843,
+ "routers_loss": 0.005731126759201288,
"skip_count": 0.0,
"step": 1326,
"text_loss": 0.742124617099762
@@ -12614,13 +12614,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000983709060960514,
- "loss": 0.041,
+ "loss": 0.0416,
"macro_f1": 0.3333333432674408,
"num_tokens": 2139496.0,
"repeat_count": 0.0,
- "routers_loss": 0.004570818971842527,
+ "routers_loss": 0.0056343949399888515,
"skip_count": 0.0,
"step": 1328,
"text_loss": 0.7317464351654053
@@ -12633,13 +12633,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09326171875,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.0009836306036322576,
- "loss": 0.0314,
+ "loss": 0.0312,
"macro_f1": 0.3333333432674408,
"num_tokens": 2143120.0,
"repeat_count": 0.0,
- "routers_loss": 0.005299333017319441,
+ "routers_loss": 0.005127966403961182,
"skip_count": 0.0,
"step": 1330,
"text_loss": 0.538652241230011
@@ -12652,13 +12652,13 @@
"f1_execute": 0.9130434989929199,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.111328125,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009835519609758415,
- "loss": 0.0303,
+ "loss": 0.0301,
"macro_f1": 0.590062141418457,
"num_tokens": 2145807.0,
"repeat_count": 3.0,
- "routers_loss": 0.168672576546669,
+ "routers_loss": 0.1673707216978073,
"skip_count": 4.0,
"step": 1332,
"text_loss": 0.3498198091983795
@@ -12671,32 +12671,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009834731330214017,
- "loss": 0.0302,
+ "loss": 0.0293,
"macro_f1": 0.3272727429866791,
"num_tokens": 2148397.0,
"repeat_count": 1.0,
- "routers_loss": 0.05187409743666649,
+ "routers_loss": 0.04026653990149498,
"skip_count": 0.0,
"step": 1334,
"text_loss": 0.8153424859046936
},
{
"acc_repeat": 1.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 27.0,
"epoch": 6.272380393307896,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.8999999761581421,
"f1_repeat": 0.6666666865348816,
- "f1_skip": 0.9090909361839294,
- "grad_norm": 0.1669921875,
+ "f1_skip": 0.8000000715255737,
+ "grad_norm": 0.16015625,
"learning_rate": 0.0009833941197991455,
- "loss": 0.0339,
- "macro_f1": 0.8329448699951172,
+ "loss": 0.0329,
+ "macro_f1": 0.7888889312744141,
"num_tokens": 2152226.0,
"repeat_count": 2.0,
- "routers_loss": 0.05786697566509247,
+ "routers_loss": 0.05481519177556038,
"skip_count": 5.0,
"step": 1336,
"text_loss": 0.7802760004997253
@@ -12709,13 +12709,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009833149213393506,
- "loss": 0.0315,
+ "loss": 0.0304,
"macro_f1": 0.3272727429866791,
"num_tokens": 2156023.0,
"repeat_count": 0.0,
- "routers_loss": 0.017055779695510864,
+ "routers_loss": 0.01760484278202057,
"skip_count": 0.0,
"step": 1338,
"text_loss": 0.19721226394176483
@@ -12728,13 +12728,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.000983235537672366,
- "loss": 0.0249,
+ "loss": 0.0256,
"macro_f1": 0.3333333432674408,
"num_tokens": 2160037.0,
"repeat_count": 0.0,
- "routers_loss": 0.011614206247031689,
+ "routers_loss": 0.013206037692725658,
"skip_count": 0.0,
"step": 1340,
"text_loss": 0.5003817081451416
@@ -12747,13 +12747,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.000983155968828612,
- "loss": 0.033,
+ "loss": 0.0315,
"macro_f1": 0.6666666865348816,
"num_tokens": 2163910.0,
"repeat_count": 1.0,
- "routers_loss": 0.012611300684511662,
+ "routers_loss": 0.01256406120955944,
"skip_count": 0.0,
"step": 1342,
"text_loss": 0.5996923446655273
@@ -12766,13 +12766,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.11962890625,
"learning_rate": 0.0009830762148385793,
- "loss": 0.0315,
+ "loss": 0.0313,
"macro_f1": 0.3272727429866791,
"num_tokens": 2166921.0,
"repeat_count": 0.0,
- "routers_loss": 0.018757276237010956,
+ "routers_loss": 0.015086234547197819,
"skip_count": 1.0,
"step": 1344,
"text_loss": 0.45356282591819763
@@ -12785,13 +12785,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08447265625,
"learning_rate": 0.0009829962757328297,
- "loss": 0.0229,
+ "loss": 0.0223,
"macro_f1": 0.32098764181137085,
"num_tokens": 2170135.0,
"repeat_count": 0.0,
- "routers_loss": 0.08197146654129028,
+ "routers_loss": 0.07909081131219864,
"skip_count": 2.0,
"step": 1346,
"text_loss": 0.2874644994735718
@@ -12804,13 +12804,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0009829161515419959,
- "loss": 0.0256,
+ "loss": 0.0246,
"macro_f1": 0.6666666865348816,
"num_tokens": 2173029.0,
"repeat_count": 0.0,
- "routers_loss": 0.014122758992016315,
+ "routers_loss": 0.013569854199886322,
"skip_count": 2.0,
"step": 1348,
"text_loss": 0.25533875823020935
@@ -12823,13 +12823,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06005859375,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0009828358422967823,
- "loss": 0.0221,
+ "loss": 0.0226,
"macro_f1": 0.32098764181137085,
"num_tokens": 2176605.0,
"repeat_count": 1.0,
- "routers_loss": 0.08215996623039246,
+ "routers_loss": 0.08111091703176498,
"skip_count": 1.0,
"step": 1350,
"text_loss": 0.32827726006507874
@@ -12842,13 +12842,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09375,
+ "grad_norm": 0.091796875,
"learning_rate": 0.0009827553480279627,
- "loss": 0.0312,
+ "loss": 0.03,
"macro_f1": 0.5427350401878357,
"num_tokens": 2179406.0,
"repeat_count": 0.0,
- "routers_loss": 0.026304977014660835,
+ "routers_loss": 0.026550088077783585,
"skip_count": 2.0,
"step": 1352,
"text_loss": 0.2966301143169403
@@ -12861,13 +12861,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009826746687663832,
- "loss": 0.0302,
+ "loss": 0.0301,
"macro_f1": 0.3333333432674408,
"num_tokens": 2182353.0,
"repeat_count": 0.0,
- "routers_loss": 0.003616038942709565,
+ "routers_loss": 0.003914554137736559,
"skip_count": 0.0,
"step": 1354,
"text_loss": 0.7596251964569092
@@ -12880,13 +12880,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.0849609375,
+ "grad_norm": 0.0859375,
"learning_rate": 0.0009825938045429602,
- "loss": 0.0323,
+ "loss": 0.0324,
"macro_f1": 0.5866667032241821,
"num_tokens": 2185786.0,
"repeat_count": 1.0,
- "routers_loss": 0.060399893671274185,
+ "routers_loss": 0.059612665325403214,
"skip_count": 3.0,
"step": 1356,
"text_loss": 0.12325898557901382
@@ -12899,13 +12899,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.10009765625,
"learning_rate": 0.0009825127553886807,
- "loss": 0.0384,
+ "loss": 0.0375,
"macro_f1": 0.3333333432674408,
"num_tokens": 2190157.0,
"repeat_count": 0.0,
- "routers_loss": 0.007164204493165016,
+ "routers_loss": 0.0071132429875433445,
"skip_count": 0.0,
"step": 1358,
"text_loss": 0.9287898540496826
@@ -12918,13 +12918,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009824315213346033,
- "loss": 0.0343,
+ "loss": 0.0348,
"macro_f1": 0.3333333432674408,
"num_tokens": 2193077.0,
"repeat_count": 0.0,
- "routers_loss": 0.010965060442686081,
+ "routers_loss": 0.009611099027097225,
"skip_count": 0.0,
"step": 1360,
"text_loss": 0.20427259802818298
@@ -12937,13 +12937,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009823501024118569,
- "loss": 0.0276,
+ "loss": 0.0285,
"macro_f1": 0.3333333432674408,
"num_tokens": 2196494.0,
"repeat_count": 0.0,
- "routers_loss": 0.00784136913716793,
+ "routers_loss": 0.006913455203175545,
"skip_count": 0.0,
"step": 1362,
"text_loss": 0.574759840965271
@@ -12956,13 +12956,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009822684986516411,
- "loss": 0.0251,
+ "loss": 0.0245,
"macro_f1": 0.3333333432674408,
"num_tokens": 2199839.0,
"repeat_count": 0.0,
- "routers_loss": 0.009101065807044506,
+ "routers_loss": 0.009208920411765575,
"skip_count": 0.0,
"step": 1364,
"text_loss": 0.42422571778297424
@@ -12970,37 +12970,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 6.413266803639566,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.000982186710085227,
- "loss": 0.0206,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.0208,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 2203212.0,
"repeat_count": 1.0,
- "routers_loss": 0.05967295169830322,
+ "routers_loss": 0.059975091367959976,
"skip_count": 1.0,
"step": 1366,
"text_loss": 0.29213017225265503
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 26.0,
+ "acc_skip": 0.25,
+ "avg_layers": 27.0,
"epoch": 6.42265923099501,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.1875,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.181640625,
"learning_rate": 0.0009821047367439561,
- "loss": 0.0356,
- "macro_f1": 0.542222261428833,
+ "loss": 0.0358,
+ "macro_f1": 0.44705885648727417,
"num_tokens": 2206240.0,
"repeat_count": 0.0,
- "routers_loss": 0.05016552656888962,
+ "routers_loss": 0.048244867473840714,
"skip_count": 4.0,
"step": 1368,
"text_loss": 0.3072395324707031
@@ -13013,13 +13013,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009820225786592405,
- "loss": 0.038,
+ "loss": 0.0375,
"macro_f1": 0.3272727429866791,
"num_tokens": 2209903.0,
"repeat_count": 1.0,
- "routers_loss": 0.02483060024678707,
+ "routers_loss": 0.026068156585097313,
"skip_count": 0.0,
"step": 1370,
"text_loss": 0.5961400270462036
@@ -13032,13 +13032,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.109375,
"learning_rate": 0.0009819402358625634,
- "loss": 0.0373,
+ "loss": 0.0366,
"macro_f1": 0.3272727429866791,
"num_tokens": 2213439.0,
"repeat_count": 0.0,
- "routers_loss": 0.01982821337878704,
+ "routers_loss": 0.022615568712353706,
"skip_count": 1.0,
"step": 1372,
"text_loss": 0.19375644624233246
@@ -13051,13 +13051,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.000981857708385479,
- "loss": 0.0353,
+ "loss": 0.0346,
"macro_f1": 0.3333333432674408,
"num_tokens": 2216457.0,
"repeat_count": 0.0,
- "routers_loss": 0.004753436427563429,
+ "routers_loss": 0.005855285096913576,
"skip_count": 0.0,
"step": 1374,
"text_loss": 0.5123368501663208
@@ -13070,13 +13070,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09912109375,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009817749962596114,
- "loss": 0.0246,
+ "loss": 0.0249,
"macro_f1": 0.3272727429866791,
"num_tokens": 2219975.0,
"repeat_count": 1.0,
- "routers_loss": 0.06541594862937927,
+ "routers_loss": 0.0651634931564331,
"skip_count": 0.0,
"step": 1376,
"text_loss": 0.5999220609664917
@@ -13089,13 +13089,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009816920995166568,
- "loss": 0.0376,
+ "loss": 0.0371,
"macro_f1": 0.6666666865348816,
"num_tokens": 2222833.0,
"repeat_count": 1.0,
- "routers_loss": 0.01156456395983696,
+ "routers_loss": 0.011408994905650616,
"skip_count": 0.0,
"step": 1378,
"text_loss": 0.5323230624198914
@@ -13108,13 +13108,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2392578125,
+ "grad_norm": 0.205078125,
"learning_rate": 0.0009816090181883807,
- "loss": 0.033,
+ "loss": 0.0313,
"macro_f1": 0.32098764181137085,
"num_tokens": 2225842.0,
"repeat_count": 0.0,
- "routers_loss": 0.05175521597266197,
+ "routers_loss": 0.039720915257930756,
"skip_count": 2.0,
"step": 1380,
"text_loss": 0.23363439738750458
@@ -13127,13 +13127,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009815257523066204,
- "loss": 0.0251,
+ "loss": 0.0249,
"macro_f1": 0.3333333432674408,
"num_tokens": 2229430.0,
"repeat_count": 0.0,
- "routers_loss": 0.002684591803699732,
+ "routers_loss": 0.002765297656878829,
"skip_count": 0.0,
"step": 1382,
"text_loss": 0.718977689743042
@@ -13146,13 +13146,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.12890625,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009814423019032835,
- "loss": 0.0397,
+ "loss": 0.0396,
"macro_f1": 0.5492662787437439,
"num_tokens": 2232594.0,
"repeat_count": 2.0,
- "routers_loss": 0.054509978741407394,
+ "routers_loss": 0.05362323671579361,
"skip_count": 0.0,
"step": 1384,
"text_loss": 0.6392166614532471
@@ -13165,13 +13165,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009813586670103483,
"loss": 0.0426,
"macro_f1": 0.6603773832321167,
"num_tokens": 2236327.0,
"repeat_count": 1.0,
- "routers_loss": 0.04031623527407646,
+ "routers_loss": 0.031728316098451614,
"skip_count": 1.0,
"step": 1386,
"text_loss": 0.5951619148254395
@@ -13184,13 +13184,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.126953125,
"learning_rate": 0.0009812748476598638,
- "loss": 0.0308,
+ "loss": 0.031,
"macro_f1": 0.5492662787437439,
"num_tokens": 2239746.0,
"repeat_count": 0.0,
- "routers_loss": 0.039687711745500565,
+ "routers_loss": 0.03981253132224083,
"skip_count": 2.0,
"step": 1388,
"text_loss": 0.22756551206111908
@@ -13203,13 +13203,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.12451171875,
"learning_rate": 0.0009811908438839498,
- "loss": 0.0329,
+ "loss": 0.0331,
"macro_f1": 0.5492662787437439,
"num_tokens": 2242786.0,
"repeat_count": 0.0,
- "routers_loss": 0.04785723611712456,
+ "routers_loss": 0.04617162421345711,
"skip_count": 2.0,
"step": 1390,
"text_loss": 0.3233799934387207
@@ -13222,13 +13222,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.154296875,
"learning_rate": 0.000981106655714797,
- "loss": 0.0359,
+ "loss": 0.0358,
"macro_f1": 0.3272727429866791,
"num_tokens": 2245696.0,
"repeat_count": 0.0,
- "routers_loss": 0.046765491366386414,
+ "routers_loss": 0.046828847378492355,
"skip_count": 1.0,
"step": 1392,
"text_loss": 0.24273279309272766
@@ -13241,13 +13241,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009810222831846656,
- "loss": 0.0303,
+ "loss": 0.0307,
"macro_f1": 0.5492662787437439,
"num_tokens": 2249326.0,
"repeat_count": 0.0,
- "routers_loss": 0.015151665546000004,
+ "routers_loss": 0.010921589098870754,
"skip_count": 2.0,
"step": 1394,
"text_loss": 0.3921460807323456
@@ -13260,13 +13260,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009809377263258882,
- "loss": 0.0321,
+ "loss": 0.0315,
"macro_f1": 0.32098767161369324,
"num_tokens": 2253393.0,
"repeat_count": 0.0,
- "routers_loss": 0.04431106895208359,
+ "routers_loss": 0.04564022272825241,
"skip_count": 1.0,
"step": 1396,
"text_loss": 0.582602858543396
@@ -13279,13 +13279,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.103515625,
"learning_rate": 0.000980852985170867,
- "loss": 0.0317,
+ "loss": 0.0328,
"macro_f1": 0.3272727429866791,
"num_tokens": 2256626.0,
"repeat_count": 0.0,
- "routers_loss": 0.012700649909675121,
+ "routers_loss": 0.013289985246956348,
"skip_count": 0.0,
"step": 1398,
"text_loss": 0.41031694412231445
@@ -13298,13 +13298,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009807680597520745,
- "loss": 0.0256,
+ "loss": 0.0264,
"macro_f1": 0.3333333432674408,
"num_tokens": 2259326.0,
"repeat_count": 0.0,
- "routers_loss": 0.005919010378420353,
+ "routers_loss": 0.0065213534981012344,
"skip_count": 0.0,
"step": 1400,
"text_loss": 0.2888098657131195
@@ -13317,13 +13317,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.23046875,
"learning_rate": 0.0009806829501020546,
- "loss": 0.0372,
+ "loss": 0.0358,
"macro_f1": 0.3272727429866791,
"num_tokens": 2262344.0,
"repeat_count": 0.0,
- "routers_loss": 0.04717765748500824,
+ "routers_loss": 0.04199840500950813,
"skip_count": 1.0,
"step": 1402,
"text_loss": 0.31973034143447876
@@ -13336,13 +13336,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009805976562534215,
"loss": 0.0317,
"macro_f1": 0.6603773832321167,
"num_tokens": 2266354.0,
"repeat_count": 1.0,
- "routers_loss": 0.015415813773870468,
+ "routers_loss": 0.015434930101037025,
"skip_count": 1.0,
"step": 1404,
"text_loss": 0.508630633354187
@@ -13355,13 +13355,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009805121782388599,
"loss": 0.0339,
"macro_f1": 0.6533333659172058,
"num_tokens": 2269660.0,
"repeat_count": 2.0,
- "routers_loss": 0.06812979280948639,
+ "routers_loss": 0.0720924660563469,
"skip_count": 2.0,
"step": 1406,
"text_loss": 0.40927737951278687
@@ -13374,13 +13374,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05908203125,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009804265160911253,
- "loss": 0.0265,
+ "loss": 0.0266,
"macro_f1": 0.5492662787437439,
"num_tokens": 2273335.0,
"repeat_count": 0.0,
- "routers_loss": 0.025383235886693,
+ "routers_loss": 0.02400495670735836,
"skip_count": 2.0,
"step": 1408,
"text_loss": 0.1777762621641159
@@ -13393,13 +13393,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.2314453125,
"learning_rate": 0.0009803406698430433,
- "loss": 0.0367,
+ "loss": 0.0371,
"macro_f1": 0.3272727429866791,
"num_tokens": 2277107.0,
"repeat_count": 0.0,
- "routers_loss": 0.026493225246667862,
+ "routers_loss": 0.02560107782483101,
"skip_count": 1.0,
"step": 1410,
"text_loss": 0.17955881357192993
@@ -13412,13 +13412,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009802546395275104,
- "loss": 0.0342,
+ "loss": 0.0349,
"macro_f1": 0.3333333432674408,
"num_tokens": 2281638.0,
"repeat_count": 0.0,
- "routers_loss": 0.006616846192628145,
+ "routers_loss": 0.006655813194811344,
"skip_count": 0.0,
"step": 1412,
"text_loss": 0.20882295072078705
@@ -13431,32 +13431,32 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.000980168425177494,
- "loss": 0.0328,
+ "loss": 0.0342,
"macro_f1": 0.8200000524520874,
"num_tokens": 2284876.0,
"repeat_count": 1.0,
- "routers_loss": 0.060631848871707916,
+ "routers_loss": 0.06325097382068634,
"skip_count": 3.0,
"step": 1414,
"text_loss": 0.26035264134407043
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 6.648077487525683,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.138671875,
"learning_rate": 0.000980082026826031,
- "loss": 0.0317,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0315,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2288938.0,
"repeat_count": 1.0,
- "routers_loss": 0.011199389584362507,
+ "routers_loss": 0.013436575420200825,
"skip_count": 0.0,
"step": 1416,
"text_loss": 0.5502325892448425
@@ -13469,13 +13469,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0009799954445062296,
- "loss": 0.0192,
+ "loss": 0.0193,
"macro_f1": 0.6603773832321167,
"num_tokens": 2292317.0,
"repeat_count": 1.0,
- "routers_loss": 0.01120354700833559,
+ "routers_loss": 0.011264479719102383,
"skip_count": 1.0,
"step": 1418,
"text_loss": 0.48075684905052185
@@ -13488,13 +13488,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009799086782512686,
- "loss": 0.0294,
+ "loss": 0.0292,
"macro_f1": 0.5492662787437439,
"num_tokens": 2295935.0,
"repeat_count": 0.0,
- "routers_loss": 0.030204148963093758,
+ "routers_loss": 0.02833271212875843,
"skip_count": 2.0,
"step": 1420,
"text_loss": 0.18221206963062286
@@ -13507,13 +13507,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.09375,
"learning_rate": 0.0009798217280943967,
- "loss": 0.0348,
+ "loss": 0.0356,
"macro_f1": 0.6666666865348816,
"num_tokens": 2298927.0,
"repeat_count": 0.0,
- "routers_loss": 0.008244800381362438,
+ "routers_loss": 0.009208574891090393,
"skip_count": 1.0,
"step": 1422,
"text_loss": 0.48686322569847107
@@ -13526,32 +13526,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009797345940689335,
- "loss": 0.0269,
+ "loss": 0.0267,
"macro_f1": 0.3272727429866791,
"num_tokens": 2301541.0,
"repeat_count": 0.0,
- "routers_loss": 0.015340043231844902,
+ "routers_loss": 0.015011847950518131,
"skip_count": 0.0,
"step": 1424,
"text_loss": 0.49446266889572144
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6000000238418579,
- "avg_layers": 25.0,
+ "acc_skip": 0.4000000059604645,
+ "avg_layers": 26.0,
"epoch": 6.695039624302906,
- "f1_execute": 0.9583333134651184,
+ "f1_execute": 0.9387754797935486,
"f1_repeat": 0.0,
- "f1_skip": 0.75,
- "grad_norm": 0.1318359375,
+ "f1_skip": 0.5714285969734192,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.0009796472762082687,
- "loss": 0.0341,
- "macro_f1": 0.5694444179534912,
+ "loss": 0.0338,
+ "macro_f1": 0.5034013986587524,
"num_tokens": 2304589.0,
"repeat_count": 0.0,
- "routers_loss": 0.058681465685367584,
+ "routers_loss": 0.05912091210484505,
"skip_count": 5.0,
"step": 1426,
"text_loss": 0.23945684731006622
@@ -13564,32 +13564,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.09765625,
"learning_rate": 0.000979559774545863,
- "loss": 0.0423,
+ "loss": 0.0405,
"macro_f1": 0.3272727429866791,
"num_tokens": 2307860.0,
"repeat_count": 0.0,
- "routers_loss": 0.020810559391975403,
+ "routers_loss": 0.021242303773760796,
"skip_count": 1.0,
"step": 1428,
"text_loss": 0.531273365020752
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 6.713824479013795,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.09033203125,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.000979472089115247,
- "loss": 0.0268,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0276,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 2311581.0,
"repeat_count": 0.0,
- "routers_loss": 0.030001837760210037,
+ "routers_loss": 0.02768544852733612,
"skip_count": 2.0,
"step": 1430,
"text_loss": 0.2497459501028061
@@ -13602,13 +13602,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1318359375,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.000979384219950022,
- "loss": 0.034,
+ "loss": 0.0346,
"macro_f1": 0.3333333432674408,
"num_tokens": 2314639.0,
"repeat_count": 0.0,
- "routers_loss": 0.010381575673818588,
+ "routers_loss": 0.008678150363266468,
"skip_count": 0.0,
"step": 1432,
"text_loss": 0.6579355001449585
@@ -13621,32 +13621,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08056640625,
"learning_rate": 0.0009792961670838595,
- "loss": 0.0365,
+ "loss": 0.0362,
"macro_f1": 0.3272727429866791,
"num_tokens": 2317927.0,
"repeat_count": 1.0,
- "routers_loss": 0.03234704211354256,
+ "routers_loss": 0.03325597569346428,
"skip_count": 0.0,
"step": 1434,
"text_loss": 0.5209436416625977
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 6.742001761080129,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009792079305505016,
- "loss": 0.0303,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0306,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2321065.0,
"repeat_count": 1.0,
- "routers_loss": 0.015481291338801384,
+ "routers_loss": 0.019228918477892876,
"skip_count": 0.0,
"step": 1436,
"text_loss": 0.41087067127227783
@@ -13659,13 +13659,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.000979119510383761,
- "loss": 0.0366,
+ "loss": 0.0371,
"macro_f1": 0.3333333432674408,
"num_tokens": 2323714.0,
"repeat_count": 0.0,
- "routers_loss": 0.018170451745390892,
+ "routers_loss": 0.017071325331926346,
"skip_count": 0.0,
"step": 1438,
"text_loss": 0.21490029990673065
@@ -13678,13 +13678,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.2060546875,
"learning_rate": 0.00097903090661752,
- "loss": 0.0306,
+ "loss": 0.0309,
"macro_f1": 0.3333333432674408,
"num_tokens": 2326454.0,
"repeat_count": 0.0,
- "routers_loss": 0.010385681875050068,
+ "routers_loss": 0.00991755723953247,
"skip_count": 0.0,
"step": 1440,
"text_loss": 0.23847346007823944
@@ -13697,13 +13697,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.232421875,
"learning_rate": 0.000978942119285732,
- "loss": 0.0407,
+ "loss": 0.0404,
"macro_f1": 0.3272727429866791,
"num_tokens": 2329462.0,
"repeat_count": 0.0,
- "routers_loss": 0.04976538568735123,
+ "routers_loss": 0.04908733069896698,
"skip_count": 1.0,
"step": 1442,
"text_loss": 0.23343028128147125
@@ -13716,13 +13716,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009788531484224204,
- "loss": 0.0255,
+ "loss": 0.0264,
"macro_f1": 0.3333333432674408,
"num_tokens": 2332146.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030266831163316965,
+ "routers_loss": 0.0032628148328512907,
"skip_count": 0.0,
"step": 1444,
"text_loss": 0.47423800826072693
@@ -13730,18 +13730,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.3333333432674408,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 6.788963897857353,
- "f1_execute": 0.9600000381469727,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 0.5,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009787639940616788,
- "loss": 0.0411,
- "macro_f1": 0.8200000524520874,
+ "loss": 0.0405,
+ "macro_f1": 0.7018141150474548,
"num_tokens": 2335738.0,
"repeat_count": 1.0,
- "routers_loss": 0.13420957326889038,
+ "routers_loss": 0.14336998760700226,
"skip_count": 3.0,
"step": 1446,
"text_loss": 0.21837592124938965
@@ -13754,13 +13754,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.189453125,
"learning_rate": 0.0009786746562376717,
- "loss": 0.0251,
+ "loss": 0.0241,
"macro_f1": 0.6666666865348816,
"num_tokens": 2338488.0,
"repeat_count": 0.0,
- "routers_loss": 0.012779864482581615,
+ "routers_loss": 0.010542908683419228,
"skip_count": 1.0,
"step": 1448,
"text_loss": 1.0614757537841797
@@ -13773,13 +13773,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009785851349846334,
- "loss": 0.0266,
+ "loss": 0.0268,
"macro_f1": 0.3333333432674408,
"num_tokens": 2342074.0,
"repeat_count": 0.0,
- "routers_loss": 0.005545398220419884,
+ "routers_loss": 0.005998016335070133,
"skip_count": 0.0,
"step": 1450,
"text_loss": 0.4269719421863556
@@ -13792,13 +13792,13 @@
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009784954303368686,
- "loss": 0.0395,
+ "loss": 0.0384,
"macro_f1": 0.44705885648727417,
"num_tokens": 2345838.0,
"repeat_count": 0.0,
- "routers_loss": 0.0899835154414177,
+ "routers_loss": 0.0959126204252243,
"skip_count": 3.0,
"step": 1452,
"text_loss": 0.3315916955471039
@@ -13811,13 +13811,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.0009784055423287521,
"loss": 0.0218,
"macro_f1": 0.3333333432674408,
"num_tokens": 2348939.0,
"repeat_count": 0.0,
- "routers_loss": 0.002738836221396923,
+ "routers_loss": 0.0025467623490840197,
"skip_count": 0.0,
"step": 1454,
"text_loss": 0.6162732839584351
@@ -13830,13 +13830,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009783154709947293,
- "loss": 0.0266,
+ "loss": 0.0256,
"macro_f1": 0.3272727429866791,
"num_tokens": 2352232.0,
"repeat_count": 0.0,
- "routers_loss": 0.020522192120552063,
+ "routers_loss": 0.01860538125038147,
"skip_count": 1.0,
"step": 1456,
"text_loss": 0.23928768932819366
@@ -13844,18 +13844,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 6.84531846199002,
- "f1_execute": 0.9629629850387573,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009782252163693158,
- "loss": 0.0197,
- "macro_f1": 0.32098767161369324,
+ "loss": 0.0201,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2355159.0,
"repeat_count": 0.0,
- "routers_loss": 0.04245268926024437,
+ "routers_loss": 0.04412713274359703,
"skip_count": 1.0,
"step": 1458,
"text_loss": 0.3371323347091675
@@ -13868,13 +13868,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.224609375,
+ "grad_norm": 0.21484375,
"learning_rate": 0.0009781347784870973,
- "loss": 0.0376,
+ "loss": 0.0379,
"macro_f1": 0.3333333432674408,
"num_tokens": 2358175.0,
"repeat_count": 0.0,
- "routers_loss": 0.009142685681581497,
+ "routers_loss": 0.006809141952544451,
"skip_count": 0.0,
"step": 1460,
"text_loss": 0.547267735004425
@@ -13887,13 +13887,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009780441573827296,
- "loss": 0.0295,
+ "loss": 0.03,
"macro_f1": 0.3076923191547394,
"num_tokens": 2360991.0,
"repeat_count": 0.0,
- "routers_loss": 0.08038893342018127,
+ "routers_loss": 0.08924390375614166,
"skip_count": 4.0,
"step": 1462,
"text_loss": 0.7026563882827759
@@ -13906,13 +13906,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.1865234375,
"learning_rate": 0.000977953353090939,
- "loss": 0.027,
+ "loss": 0.0272,
"macro_f1": 0.3333333432674408,
"num_tokens": 2363894.0,
"repeat_count": 0.0,
- "routers_loss": 0.02107175625860691,
+ "routers_loss": 0.021858472377061844,
"skip_count": 0.0,
"step": 1464,
"text_loss": 0.2718065083026886
@@ -13925,13 +13925,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0009778623656465219,
- "loss": 0.0349,
+ "loss": 0.0338,
"macro_f1": 0.32098764181137085,
"num_tokens": 2367265.0,
"repeat_count": 0.0,
- "routers_loss": 0.042030055075883865,
+ "routers_loss": 0.044781096279621124,
"skip_count": 0.0,
"step": 1466,
"text_loss": 0.5008095502853394
@@ -13944,13 +13944,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009777711950843448,
- "loss": 0.022,
+ "loss": 0.0212,
"macro_f1": 0.3333333432674408,
"num_tokens": 2370186.0,
"repeat_count": 0.0,
- "routers_loss": 0.004230673424899578,
+ "routers_loss": 0.0040459707379341125,
"skip_count": 0.0,
"step": 1468,
"text_loss": 0.5242461562156677
@@ -13963,13 +13963,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009776798414393446,
- "loss": 0.0284,
+ "loss": 0.0279,
"macro_f1": 0.6598639488220215,
"num_tokens": 2373314.0,
"repeat_count": 1.0,
- "routers_loss": 0.06986775249242783,
+ "routers_loss": 0.0708528608083725,
"skip_count": 3.0,
"step": 1470,
"text_loss": 0.2821732461452484
@@ -13982,13 +13982,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.142578125,
+ "grad_norm": 0.1328125,
"learning_rate": 0.0009775883047465279,
- "loss": 0.0431,
+ "loss": 0.0414,
"macro_f1": 0.31446540355682373,
"num_tokens": 2376435.0,
"repeat_count": 1.0,
- "routers_loss": 0.0439564548432827,
+ "routers_loss": 0.0290578193962574,
"skip_count": 1.0,
"step": 1472,
"text_loss": 0.8438440561294556
@@ -14001,13 +14001,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.10546875,
"learning_rate": 0.000977496585040972,
- "loss": 0.0376,
+ "loss": 0.0373,
"macro_f1": 0.3333333432674408,
"num_tokens": 2380244.0,
"repeat_count": 0.0,
- "routers_loss": 0.011889892630279064,
+ "routers_loss": 0.010360375046730042,
"skip_count": 0.0,
"step": 1474,
"text_loss": 0.4356135427951813
@@ -14020,13 +14020,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.000977404682357824,
- "loss": 0.0295,
+ "loss": 0.0294,
"macro_f1": 0.3272727429866791,
"num_tokens": 2383498.0,
"repeat_count": 0.0,
- "routers_loss": 0.022536326199769974,
+ "routers_loss": 0.023518972098827362,
"skip_count": 0.0,
"step": 1476,
"text_loss": 0.25195425748825073
@@ -14039,13 +14039,13 @@
"f1_execute": 0.9743589162826538,
"f1_repeat": 0.888888955116272,
"f1_skip": 1.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.000977312596732301,
- "loss": 0.0388,
+ "loss": 0.0375,
"macro_f1": 0.9544159770011902,
"num_tokens": 2386414.0,
"repeat_count": 5.0,
- "routers_loss": 0.07959948480129242,
+ "routers_loss": 0.08190606534481049,
"skip_count": 4.0,
"step": 1478,
"text_loss": 0.6586798429489136
@@ -14058,13 +14058,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.10546875,
"learning_rate": 0.0009772203281996905,
- "loss": 0.0341,
+ "loss": 0.0336,
"macro_f1": 1.0,
"num_tokens": 2389399.0,
"repeat_count": 1.0,
- "routers_loss": 0.019112225621938705,
+ "routers_loss": 0.016441475600004196,
"skip_count": 2.0,
"step": 1480,
"text_loss": 0.3671986758708954
@@ -14077,13 +14077,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009771278767953502,
- "loss": 0.0345,
+ "loss": 0.0357,
"macro_f1": 0.3333333432674408,
"num_tokens": 2392400.0,
"repeat_count": 0.0,
- "routers_loss": 0.018750866875052452,
+ "routers_loss": 0.019211363047361374,
"skip_count": 0.0,
"step": 1482,
"text_loss": 0.27418580651283264
@@ -14096,32 +14096,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09228515625,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009770352425547072,
- "loss": 0.0291,
+ "loss": 0.0292,
"macro_f1": 0.3333333432674408,
"num_tokens": 2395123.0,
"repeat_count": 0.0,
- "routers_loss": 0.015407348051667213,
+ "routers_loss": 0.015800386667251587,
"skip_count": 0.0,
"step": 1484,
"text_loss": 0.19896622002124786
},
{
- "acc_repeat": 0.6666666865348816,
+ "acc_repeat": 0.3333333432674408,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 29.0,
"epoch": 6.976812444966246,
- "f1_execute": 0.9803921580314636,
- "f1_repeat": 0.800000011920929,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.5,
"f1_skip": 0.0,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009769424255132596,
- "loss": 0.0258,
- "macro_f1": 0.5934640765190125,
+ "loss": 0.0256,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 2397359.0,
"repeat_count": 3.0,
- "routers_loss": 0.06514479219913483,
+ "routers_loss": 0.06670158356428146,
"skip_count": 0.0,
"step": 1486,
"text_loss": 0.4229799509048462
@@ -14134,13 +14134,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.111328125,
+ "grad_norm": 0.1162109375,
"learning_rate": 0.0009768494257065747,
- "loss": 0.0217,
+ "loss": 0.0218,
"macro_f1": 0.3272727429866791,
"num_tokens": 2400387.0,
"repeat_count": 0.0,
- "routers_loss": 0.013567833229899406,
+ "routers_loss": 0.011144762858748436,
"skip_count": 1.0,
"step": 1488,
"text_loss": 0.4264226257801056
@@ -14153,13 +14153,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12255859375,
+ "grad_norm": 0.12353515625,
"learning_rate": 0.0009767562431702904,
- "loss": 0.0389,
+ "loss": 0.0387,
"macro_f1": 0.3006536364555359,
"num_tokens": 2403241.0,
"repeat_count": 2.0,
- "routers_loss": 0.13762018084526062,
+ "routers_loss": 0.12339717149734497,
"skip_count": 3.0,
"step": 1490,
"text_loss": 0.2850193977355957
@@ -14172,13 +14172,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0009766628779401142,
- "loss": 0.0214,
+ "loss": 0.0215,
"macro_f1": 0.6666666865348816,
"num_tokens": 2406087.0,
"repeat_count": 0.0,
- "routers_loss": 0.008640666492283344,
+ "routers_loss": 0.008174685761332512,
"skip_count": 1.0,
"step": 1492,
"text_loss": 0.6756544709205627
@@ -14191,13 +14191,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05712890625,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.000976569330051824,
- "loss": 0.0182,
+ "loss": 0.0186,
"macro_f1": 0.3333333432674408,
"num_tokens": 2409312.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018257038900628686,
+ "routers_loss": 0.0021256296895444393,
"skip_count": 0.0,
"step": 1494,
"text_loss": 0.4789894223213196
@@ -14210,13 +14210,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0009764755995412677,
"loss": 0.0193,
"macro_f1": 0.3333333432674408,
"num_tokens": 2412758.0,
"repeat_count": 0.0,
- "routers_loss": 0.003656312357634306,
+ "routers_loss": 0.003944927826523781,
"skip_count": 0.0,
"step": 1496,
"text_loss": 0.5157490968704224
@@ -14229,13 +14229,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009763816864443627,
- "loss": 0.0246,
+ "loss": 0.0239,
"macro_f1": 0.3272727429866791,
"num_tokens": 2416079.0,
"repeat_count": 1.0,
- "routers_loss": 0.044268425554037094,
+ "routers_loss": 0.03893325850367546,
"skip_count": 0.0,
"step": 1498,
"text_loss": 0.28045418858528137
@@ -14248,13 +14248,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.0009762875907970968,
- "loss": 0.0207,
+ "loss": 0.0199,
"macro_f1": 0.3333333432674408,
"num_tokens": 2420340.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018966116476804018,
+ "routers_loss": 0.0017725443467497826,
"skip_count": 0.0,
"step": 1500,
"text_loss": 0.35550856590270996
@@ -14267,32 +14267,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0009761933126355277,
- "loss": 0.0249,
+ "loss": 0.0245,
"macro_f1": 0.3272727429866791,
"num_tokens": 2424735.0,
"repeat_count": 0.0,
- "routers_loss": 0.01729201152920723,
+ "routers_loss": 0.01393749937415123,
"skip_count": 1.0,
"step": 1502,
"text_loss": 0.38840189576148987
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 7.06105077781039,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.11962890625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009760988519957828,
- "loss": 0.0248,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0249,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 2428132.0,
"repeat_count": 0.0,
- "routers_loss": 0.01693531684577465,
+ "routers_loss": 0.01687910407781601,
"skip_count": 2.0,
"step": 1504,
"text_loss": 0.3031681478023529
@@ -14305,13 +14305,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0009760042089140598,
- "loss": 0.0197,
+ "loss": 0.0193,
"macro_f1": 0.3144654333591461,
"num_tokens": 2431592.0,
"repeat_count": 1.0,
- "routers_loss": 0.04939094930887222,
+ "routers_loss": 0.04704280197620392,
"skip_count": 2.0,
"step": 1506,
"text_loss": 0.16355200111865997
@@ -14324,13 +14324,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009759093834266259,
- "loss": 0.0213,
+ "loss": 0.0206,
"macro_f1": 0.3333333432674408,
"num_tokens": 2434236.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016892930725589395,
+ "routers_loss": 0.0016075772000476718,
"skip_count": 0.0,
"step": 1508,
"text_loss": 0.6080073118209839
@@ -14343,13 +14343,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009758143755698186,
- "loss": 0.0147,
+ "loss": 0.015,
"macro_f1": 0.3333333432674408,
"num_tokens": 2437170.0,
"repeat_count": 0.0,
- "routers_loss": 0.008671467192471027,
+ "routers_loss": 0.008451299741864204,
"skip_count": 0.0,
"step": 1510,
"text_loss": 0.22100484371185303
@@ -14362,13 +14362,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009757191853800449,
- "loss": 0.0228,
+ "loss": 0.0227,
"macro_f1": 0.5866667032241821,
"num_tokens": 2441187.0,
"repeat_count": 1.0,
- "routers_loss": 0.042682576924562454,
+ "routers_loss": 0.046565692871809006,
"skip_count": 3.0,
"step": 1512,
"text_loss": 0.25098952651023865
@@ -14381,13 +14381,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.11279296875,
"learning_rate": 0.000975623812893782,
- "loss": 0.028,
+ "loss": 0.0276,
"macro_f1": 0.3272727429866791,
"num_tokens": 2444664.0,
"repeat_count": 0.0,
- "routers_loss": 0.02905822917819023,
+ "routers_loss": 0.02872578240931034,
"skip_count": 1.0,
"step": 1514,
"text_loss": 0.4952253997325897
@@ -14400,13 +14400,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09326171875,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.0009755282581475768,
- "loss": 0.0223,
+ "loss": 0.0233,
"macro_f1": 0.3333333432674408,
"num_tokens": 2447748.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018810008186846972,
+ "routers_loss": 0.002055214950814843,
"skip_count": 0.0,
"step": 1516,
"text_loss": 0.7465500831604004
@@ -14419,13 +14419,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.10302734375,
"learning_rate": 0.000975432521178046,
- "loss": 0.0219,
+ "loss": 0.0216,
"macro_f1": 0.3272727429866791,
"num_tokens": 2450834.0,
"repeat_count": 1.0,
- "routers_loss": 0.04308714717626572,
+ "routers_loss": 0.04498551785945892,
"skip_count": 0.0,
"step": 1518,
"text_loss": 0.28144413232803345
@@ -14438,13 +14438,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009753366020218763,
- "loss": 0.0232,
+ "loss": 0.0234,
"macro_f1": 0.3333333432674408,
"num_tokens": 2454233.0,
"repeat_count": 0.0,
- "routers_loss": 0.003754811594262719,
+ "routers_loss": 0.003669742727652192,
"skip_count": 0.0,
"step": 1520,
"text_loss": 0.5667551755905151
@@ -14457,32 +14457,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009752405007158238,
- "loss": 0.0246,
+ "loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 2457331.0,
"repeat_count": 0.0,
- "routers_loss": 0.010853761807084084,
+ "routers_loss": 0.010455607436597347,
"skip_count": 0.0,
"step": 1522,
"text_loss": 0.19575810432434082
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.5,
"acc_skip": 1.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 7.154975051364837,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0009751442172967151,
- "loss": 0.0196,
- "macro_f1": 1.0,
+ "loss": 0.0193,
+ "macro_f1": 0.8823530077934265,
"num_tokens": 2459935.0,
"repeat_count": 2.0,
- "routers_loss": 0.015100379474461079,
+ "routers_loss": 0.025189083069562912,
"skip_count": 1.0,
"step": 1524,
"text_loss": 0.45453405380249023
@@ -14495,13 +14495,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.000975047751801446,
- "loss": 0.0189,
+ "loss": 0.0187,
"macro_f1": 0.3272727429866791,
"num_tokens": 2463008.0,
"repeat_count": 0.0,
- "routers_loss": 0.011991916224360466,
+ "routers_loss": 0.012297490611672401,
"skip_count": 0.0,
"step": 1526,
"text_loss": 0.31437572836875916
@@ -14514,32 +14514,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009749511042669823,
- "loss": 0.0226,
+ "loss": 0.0233,
"macro_f1": 0.3333333432674408,
"num_tokens": 2466475.0,
"repeat_count": 0.0,
- "routers_loss": 0.008201062679290771,
+ "routers_loss": 0.011026266030967236,
"skip_count": 0.0,
"step": 1528,
"text_loss": 0.46604859828948975
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 7.183152333431171,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.1181640625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1376953125,
"learning_rate": 0.0009748542747303595,
- "loss": 0.0174,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0182,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2469320.0,
"repeat_count": 0.0,
- "routers_loss": 0.008513177745044231,
+ "routers_loss": 0.011934996582567692,
"skip_count": 1.0,
"step": 1530,
"text_loss": 0.7764923572540283
@@ -14552,13 +14552,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.0966796875,
"learning_rate": 0.0009747572632286827,
- "loss": 0.02,
+ "loss": 0.0203,
"macro_f1": 0.3333333432674408,
"num_tokens": 2472468.0,
"repeat_count": 0.0,
- "routers_loss": 0.004850955214351416,
+ "routers_loss": 0.005786920432001352,
"skip_count": 0.0,
"step": 1532,
"text_loss": 0.3555782437324524
@@ -14571,32 +14571,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.087890625,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009746600697991271,
- "loss": 0.0206,
+ "loss": 0.02,
"macro_f1": 0.6666666865348816,
"num_tokens": 2475736.0,
"repeat_count": 1.0,
- "routers_loss": 0.0027650354895740747,
+ "routers_loss": 0.0026990731712430716,
"skip_count": 0.0,
"step": 1534,
"text_loss": 0.49561792612075806
},
{
"acc_repeat": 1.0,
- "acc_skip": 0.0,
- "avg_layers": 29.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
"epoch": 7.2113296154975055,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
- "f1_skip": 0.0,
- "grad_norm": 0.0615234375,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0556640625,
"learning_rate": 0.0009745626944789375,
- "loss": 0.0209,
- "macro_f1": 0.6538461446762085,
+ "loss": 0.0204,
+ "macro_f1": 0.8823530077934265,
"num_tokens": 2478887.0,
"repeat_count": 1.0,
- "routers_loss": 0.023268593475222588,
+ "routers_loss": 0.020221207290887833,
"skip_count": 2.0,
"step": 1536,
"text_loss": 0.5375416278839111
@@ -14609,13 +14609,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11669921875,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009744651373054279,
"loss": 0.0286,
"macro_f1": 0.3272727429866791,
"num_tokens": 2481293.0,
"repeat_count": 0.0,
- "routers_loss": 0.031235001981258392,
+ "routers_loss": 0.03131086751818657,
"skip_count": 1.0,
"step": 1538,
"text_loss": 0.5241039395332336
@@ -14628,13 +14628,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009743673983159828,
- "loss": 0.023,
+ "loss": 0.0241,
"macro_f1": 0.6122449040412903,
"num_tokens": 2484403.0,
"repeat_count": 0.0,
- "routers_loss": 0.042398080229759216,
+ "routers_loss": 0.04448170214891434,
"skip_count": 4.0,
"step": 1540,
"text_loss": 0.7465724349021912
@@ -14647,13 +14647,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009742694775480557,
- "loss": 0.0268,
+ "loss": 0.0265,
"macro_f1": 0.6666666865348816,
"num_tokens": 2487952.0,
"repeat_count": 0.0,
- "routers_loss": 0.007361465133726597,
+ "routers_loss": 0.007171491626650095,
"skip_count": 1.0,
"step": 1542,
"text_loss": 0.2877117097377777
@@ -14666,13 +14666,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009741713750391703,
- "loss": 0.0166,
+ "loss": 0.0171,
"macro_f1": 0.6666666865348816,
"num_tokens": 2490815.0,
"repeat_count": 1.0,
- "routers_loss": 0.0052334014326334,
+ "routers_loss": 0.004559285007417202,
"skip_count": 0.0,
"step": 1544,
"text_loss": 0.6097800135612488
@@ -14685,13 +14685,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.06787109375,
"learning_rate": 0.0009740730908269193,
"loss": 0.0174,
"macro_f1": 0.3333333432674408,
"num_tokens": 2494727.0,
"repeat_count": 0.0,
- "routers_loss": 0.004993532784283161,
+ "routers_loss": 0.005271553061902523,
"skip_count": 0.0,
"step": 1546,
"text_loss": 0.5431114435195923
@@ -14704,13 +14704,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009739746249489658,
- "loss": 0.0248,
+ "loss": 0.0239,
"macro_f1": 0.3333333432674408,
"num_tokens": 2499266.0,
"repeat_count": 0.0,
- "routers_loss": 0.001611889572814107,
+ "routers_loss": 0.0015409323386847973,
"skip_count": 0.0,
"step": 1548,
"text_loss": 0.4702678322792053
@@ -14723,13 +14723,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.1171875,
"learning_rate": 0.0009738759774430417,
- "loss": 0.0209,
+ "loss": 0.0216,
"macro_f1": 0.32098764181137085,
"num_tokens": 2502273.0,
"repeat_count": 1.0,
- "routers_loss": 0.03059260919690132,
+ "routers_loss": 0.030183158814907074,
"skip_count": 1.0,
"step": 1550,
"text_loss": 0.3239189088344574
@@ -14742,32 +14742,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056396484375,
+ "grad_norm": 0.0498046875,
"learning_rate": 0.0009737771483469493,
- "loss": 0.0195,
+ "loss": 0.0196,
"macro_f1": 0.3333333432674408,
"num_tokens": 2507624.0,
"repeat_count": 0.0,
- "routers_loss": 0.00508903618901968,
+ "routers_loss": 0.005410848651081324,
"skip_count": 0.0,
"step": 1552,
"text_loss": 0.4014642834663391
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 7.295861461696507,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
+ "f1_skip": 1.0,
"grad_norm": 0.07763671875,
"learning_rate": 0.0009736781376985598,
- "loss": 0.0174,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0168,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 2510366.0,
"repeat_count": 0.0,
- "routers_loss": 0.007860450074076653,
+ "routers_loss": 0.0066976165398955345,
"skip_count": 1.0,
"step": 1554,
"text_loss": 0.5924848914146423
@@ -14780,13 +14780,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11669921875,
+ "grad_norm": 0.13671875,
"learning_rate": 0.0009735789455358144,
- "loss": 0.0217,
+ "loss": 0.022,
"macro_f1": 0.3333333432674408,
"num_tokens": 2513317.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027370608877390623,
+ "routers_loss": 0.002763477386906743,
"skip_count": 0.0,
"step": 1556,
"text_loss": 0.3222943842411041
@@ -14799,13 +14799,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.11767578125,
"learning_rate": 0.0009734795718967237,
- "loss": 0.0276,
+ "loss": 0.0283,
"macro_f1": 0.32098764181137085,
"num_tokens": 2516628.0,
"repeat_count": 0.0,
- "routers_loss": 0.061584725975990295,
+ "routers_loss": 0.061566028743982315,
"skip_count": 2.0,
"step": 1558,
"text_loss": 0.3249334692955017
@@ -14818,13 +14818,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009733800168193679,
"loss": 0.0228,
"macro_f1": 1.0,
"num_tokens": 2519424.0,
"repeat_count": 2.0,
- "routers_loss": 0.01694316789507866,
+ "routers_loss": 0.017976421862840652,
"skip_count": 4.0,
"step": 1560,
"text_loss": 0.3341919481754303
@@ -14837,13 +14837,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1826171875,
"learning_rate": 0.0009732802803418966,
- "loss": 0.0234,
+ "loss": 0.023,
"macro_f1": 0.3333333432674408,
"num_tokens": 2522922.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023331891279667616,
+ "routers_loss": 0.002525332849472761,
"skip_count": 0.0,
"step": 1562,
"text_loss": 0.3176332712173462
@@ -14856,13 +14856,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.07861328125,
"learning_rate": 0.0009731803625025292,
- "loss": 0.0203,
+ "loss": 0.0196,
"macro_f1": 0.3272727429866791,
"num_tokens": 2525811.0,
"repeat_count": 0.0,
- "routers_loss": 0.021300682798027992,
+ "routers_loss": 0.015524424612522125,
"skip_count": 1.0,
"step": 1564,
"text_loss": 0.532774031162262
@@ -14875,13 +14875,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.0009730802633395541,
- "loss": 0.026,
+ "loss": 0.0257,
"macro_f1": 0.6603773832321167,
"num_tokens": 2529157.0,
"repeat_count": 1.0,
- "routers_loss": 0.08335043489933014,
+ "routers_loss": 0.08138631284236908,
"skip_count": 1.0,
"step": 1566,
"text_loss": 0.529487133026123
@@ -14894,13 +14894,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009729799828913298,
- "loss": 0.0224,
+ "loss": 0.0223,
"macro_f1": 0.3333333432674408,
"num_tokens": 2532249.0,
"repeat_count": 0.0,
- "routers_loss": 0.003535634372383356,
+ "routers_loss": 0.0035867292899638414,
"skip_count": 0.0,
"step": 1568,
"text_loss": 0.503160297870636
@@ -14913,13 +14913,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009728795211962838,
"loss": 0.0259,
"macro_f1": 0.5492662787437439,
"num_tokens": 2535904.0,
"repeat_count": 0.0,
- "routers_loss": 0.025729363784193993,
+ "routers_loss": 0.02987455204129219,
"skip_count": 2.0,
"step": 1570,
"text_loss": 0.9170270562171936
@@ -14932,13 +14932,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.11865234375,
"learning_rate": 0.0009727788782929131,
- "loss": 0.0287,
+ "loss": 0.0273,
"macro_f1": 0.3272727429866791,
"num_tokens": 2538943.0,
"repeat_count": 1.0,
- "routers_loss": 0.059166863560676575,
+ "routers_loss": 0.04676021635532379,
"skip_count": 0.0,
"step": 1572,
"text_loss": 0.29146310687065125
@@ -14951,13 +14951,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0009726780542197844,
- "loss": 0.0173,
+ "loss": 0.0169,
"macro_f1": 0.3333333432674408,
"num_tokens": 2541805.0,
"repeat_count": 0.0,
- "routers_loss": 0.002580022206529975,
+ "routers_loss": 0.002127803163602948,
"skip_count": 0.0,
"step": 1574,
"text_loss": 1.0126502513885498
@@ -14970,13 +14970,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009725770490155338,
- "loss": 0.0257,
+ "loss": 0.0262,
"macro_f1": 0.3333333432674408,
"num_tokens": 2546213.0,
"repeat_count": 0.0,
- "routers_loss": 0.007746981456875801,
+ "routers_loss": 0.007609677035361528,
"skip_count": 0.0,
"step": 1576,
"text_loss": 0.190168559551239
@@ -14989,13 +14989,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.083984375,
"learning_rate": 0.0009724758627188665,
- "loss": 0.0344,
+ "loss": 0.0356,
"macro_f1": 0.3272727429866791,
"num_tokens": 2549554.0,
"repeat_count": 0.0,
- "routers_loss": 0.027308562770485878,
+ "routers_loss": 0.033554721623659134,
"skip_count": 1.0,
"step": 1578,
"text_loss": 0.2977406084537506
@@ -15008,13 +15008,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009723744953685572,
- "loss": 0.0277,
+ "loss": 0.028,
"macro_f1": 0.3272727429866791,
"num_tokens": 2552785.0,
"repeat_count": 1.0,
- "routers_loss": 0.029863199219107628,
+ "routers_loss": 0.027864238247275352,
"skip_count": 0.0,
"step": 1580,
"text_loss": 0.2700682580471039
@@ -15027,13 +15027,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.19921875,
"learning_rate": 0.0009722729470034503,
- "loss": 0.0218,
+ "loss": 0.0224,
"macro_f1": 0.3333333432674408,
"num_tokens": 2556550.0,
"repeat_count": 0.0,
- "routers_loss": 0.004019706044346094,
+ "routers_loss": 0.004798175301402807,
"skip_count": 0.0,
"step": 1582,
"text_loss": 0.6559903025627136
@@ -15046,32 +15046,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07177734375,
+ "grad_norm": 0.078125,
"learning_rate": 0.0009721712176624591,
- "loss": 0.0239,
+ "loss": 0.0242,
"macro_f1": 0.3333333432674408,
"num_tokens": 2559862.0,
"repeat_count": 0.0,
- "routers_loss": 0.014162382110953331,
+ "routers_loss": 0.013764148578047752,
"skip_count": 0.0,
"step": 1584,
"text_loss": 0.2257535308599472
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 7.446140299383622,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.0009720693073845667,
- "loss": 0.0338,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.032,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 2562766.0,
"repeat_count": 0.0,
- "routers_loss": 0.023485012352466583,
+ "routers_loss": 0.01937069371342659,
"skip_count": 2.0,
"step": 1586,
"text_loss": 0.178413525223732
@@ -15079,37 +15079,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 7.455532726739067,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009719672162088252,
- "loss": 0.0308,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0306,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 2566583.0,
"repeat_count": 1.0,
- "routers_loss": 0.05822715163230896,
+ "routers_loss": 0.06224144622683525,
"skip_count": 0.0,
"step": 1588,
"text_loss": 0.3992367684841156
},
{
- "acc_repeat": 0.5,
- "acc_skip": 0.5,
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
"avg_layers": 27.0,
"epoch": 7.464925154094511,
- "f1_execute": 0.936170220375061,
- "f1_repeat": 0.6666666865348816,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.189453125,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.185546875,
"learning_rate": 0.0009718649441743559,
- "loss": 0.0243,
- "macro_f1": 0.7565011978149414,
+ "loss": 0.0239,
+ "macro_f1": 0.9449735879898071,
"num_tokens": 2569516.0,
"repeat_count": 2.0,
- "routers_loss": 0.07448136061429977,
+ "routers_loss": 0.06937911361455917,
"skip_count": 4.0,
"step": 1590,
"text_loss": 0.1945122629404068
@@ -15122,13 +15122,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.00097176249132035,
- "loss": 0.0228,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2572418.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038424162194132805,
+ "routers_loss": 0.0034326619934290648,
"skip_count": 0.0,
"step": 1592,
"text_loss": 0.6259906888008118
@@ -15141,13 +15141,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.08642578125,
"learning_rate": 0.0009716598576860676,
- "loss": 0.0277,
+ "loss": 0.0278,
"macro_f1": 0.6666666865348816,
"num_tokens": 2575235.0,
"repeat_count": 1.0,
- "routers_loss": 0.005674343090504408,
+ "routers_loss": 0.004557516425848007,
"skip_count": 0.0,
"step": 1594,
"text_loss": 0.6638736724853516
@@ -15160,13 +15160,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.193359375,
"learning_rate": 0.0009715570433108378,
- "loss": 0.0209,
+ "loss": 0.0198,
"macro_f1": 1.0,
"num_tokens": 2578157.0,
"repeat_count": 1.0,
- "routers_loss": 0.015544800087809563,
+ "routers_loss": 0.015363055281341076,
"skip_count": 1.0,
"step": 1596,
"text_loss": 0.6530464887619019
@@ -15179,13 +15179,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009714540482340595,
- "loss": 0.0279,
+ "loss": 0.0268,
"macro_f1": 0.6666666865348816,
"num_tokens": 2581801.0,
"repeat_count": 1.0,
- "routers_loss": 0.013199405744671822,
+ "routers_loss": 0.01257144846022129,
"skip_count": 0.0,
"step": 1598,
"text_loss": 0.5916110277175903
@@ -15198,13 +15198,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059326171875,
+ "grad_norm": 0.058837890625,
"learning_rate": 0.0009713508724952006,
- "loss": 0.0178,
+ "loss": 0.0177,
"macro_f1": 0.3333333432674408,
"num_tokens": 2585204.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032487998250871897,
+ "routers_loss": 0.003175645601004362,
"skip_count": 0.0,
"step": 1600,
"text_loss": 0.27901601791381836
@@ -15217,13 +15217,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12255859375,
+ "grad_norm": 0.12353515625,
"learning_rate": 0.0009712475161337981,
- "loss": 0.0253,
+ "loss": 0.0261,
"macro_f1": 0.3333333432674408,
"num_tokens": 2588286.0,
"repeat_count": 0.0,
- "routers_loss": 0.0041928659193217754,
+ "routers_loss": 0.004122321493923664,
"skip_count": 0.0,
"step": 1602,
"text_loss": 0.42420244216918945
@@ -15236,13 +15236,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009711439791894585,
- "loss": 0.0343,
+ "loss": 0.0341,
"macro_f1": 0.6666666865348816,
"num_tokens": 2591476.0,
"repeat_count": 0.0,
- "routers_loss": 0.011576149612665176,
+ "routers_loss": 0.011215819045901299,
"skip_count": 1.0,
"step": 1604,
"text_loss": 0.5549933910369873
@@ -15255,13 +15255,13 @@
"f1_execute": 0.9599999785423279,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009710402617018574,
- "loss": 0.0179,
+ "loss": 0.0172,
"macro_f1": 0.8200000524520874,
"num_tokens": 2594336.0,
"repeat_count": 1.0,
- "routers_loss": 0.03026912547647953,
+ "routers_loss": 0.02916567400097847,
"skip_count": 2.0,
"step": 1606,
"text_loss": 0.3263779282569885
@@ -15276,11 +15276,11 @@
"f1_skip": 1.0,
"grad_norm": 0.068359375,
"learning_rate": 0.0009709363637107393,
- "loss": 0.021,
+ "loss": 0.0209,
"macro_f1": 0.6666666865348816,
"num_tokens": 2597462.0,
"repeat_count": 0.0,
- "routers_loss": 0.014957098290324211,
+ "routers_loss": 0.015897957608103752,
"skip_count": 1.0,
"step": 1608,
"text_loss": 0.20917139947414398
@@ -15293,13 +15293,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009708322852559184,
- "loss": 0.0226,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2601543.0,
"repeat_count": 0.0,
- "routers_loss": 0.00254683755338192,
+ "routers_loss": 0.002211357234045863,
"skip_count": 0.0,
"step": 1610,
"text_loss": 0.450550377368927
@@ -15312,13 +15312,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1748046875,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009707280263772776,
- "loss": 0.0286,
+ "loss": 0.0277,
"macro_f1": 0.6666666865348816,
"num_tokens": 2604462.0,
"repeat_count": 0.0,
- "routers_loss": 0.018759876489639282,
+ "routers_loss": 0.01615734025835991,
"skip_count": 2.0,
"step": 1612,
"text_loss": 0.6908381581306458
@@ -15337,7 +15337,7 @@
"macro_f1": 0.5492662787437439,
"num_tokens": 2607484.0,
"repeat_count": 0.0,
- "routers_loss": 0.022694367915391922,
+ "routers_loss": 0.022048067301511765,
"skip_count": 2.0,
"step": 1614,
"text_loss": 0.36691340804100037
@@ -15350,13 +15350,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.10546875,
"learning_rate": 0.0009705189675084138,
- "loss": 0.0181,
+ "loss": 0.0176,
"macro_f1": 0.6666666865348816,
"num_tokens": 2610204.0,
"repeat_count": 0.0,
- "routers_loss": 0.010102321393787861,
+ "routers_loss": 0.008503952994942665,
"skip_count": 1.0,
"step": 1616,
"text_loss": 0.5226598381996155
@@ -15369,13 +15369,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08984375,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009704141675983029,
- "loss": 0.0252,
+ "loss": 0.0248,
"macro_f1": 0.3333333432674408,
"num_tokens": 2613128.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020994991064071655,
+ "routers_loss": 0.0019020626787096262,
"skip_count": 0.0,
"step": 1618,
"text_loss": 0.6465088725090027
@@ -15388,13 +15388,13 @@
"f1_execute": 0.9333333373069763,
"f1_repeat": 0.0,
"f1_skip": 0.7272727489471436,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.107421875,
"learning_rate": 0.0009703091874245956,
- "loss": 0.0323,
+ "loss": 0.032,
"macro_f1": 0.5535354018211365,
"num_tokens": 2616360.0,
"repeat_count": 0.0,
- "routers_loss": 0.11748704314231873,
+ "routers_loss": 0.11837691068649292,
"skip_count": 7.0,
"step": 1620,
"text_loss": 0.2987039089202881
@@ -15407,32 +15407,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009702040270275204,
- "loss": 0.018,
+ "loss": 0.0181,
"macro_f1": 0.3333333432674408,
"num_tokens": 2619606.0,
"repeat_count": 0.0,
- "routers_loss": 0.007642311509698629,
+ "routers_loss": 0.0065958453342318535,
"skip_count": 0.0,
"step": 1622,
"text_loss": 0.6262096166610718
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 7.62459641913707,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.10595703125,
+ "f1_skip": 1.0,
+ "grad_norm": 0.103515625,
"learning_rate": 0.000970098686447375,
- "loss": 0.0258,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0257,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 2622499.0,
"repeat_count": 0.0,
- "routers_loss": 0.016890225932002068,
+ "routers_loss": 0.013632026500999928,
"skip_count": 1.0,
"step": 1624,
"text_loss": 0.2392602562904358
@@ -15445,13 +15445,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.125,
"learning_rate": 0.0009699931657245264,
- "loss": 0.0242,
+ "loss": 0.0245,
"macro_f1": 0.5492662787437439,
"num_tokens": 2626002.0,
"repeat_count": 0.0,
- "routers_loss": 0.010900186374783516,
+ "routers_loss": 0.012147823348641396,
"skip_count": 2.0,
"step": 1626,
"text_loss": 0.4742976129055023
@@ -15464,13 +15464,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0810546875,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009698874648994098,
- "loss": 0.0279,
+ "loss": 0.0285,
"macro_f1": 1.0,
"num_tokens": 2629847.0,
"repeat_count": 1.0,
- "routers_loss": 0.011229799129068851,
+ "routers_loss": 0.010692884214222431,
"skip_count": 3.0,
"step": 1628,
"text_loss": 0.5090685486793518
@@ -15483,13 +15483,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.0009697815840125304,
- "loss": 0.0275,
+ "loss": 0.0265,
"macro_f1": 0.3333333432674408,
"num_tokens": 2633529.0,
"repeat_count": 0.0,
- "routers_loss": 0.0105878422036767,
+ "routers_loss": 0.011442207731306553,
"skip_count": 0.0,
"step": 1630,
"text_loss": 0.1874329298734665
@@ -15502,13 +15502,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.2119140625,
"learning_rate": 0.0009696755231044618,
- "loss": 0.0209,
+ "loss": 0.0207,
"macro_f1": 0.3333333432674408,
"num_tokens": 2636321.0,
"repeat_count": 0.0,
- "routers_loss": 0.002953991526737809,
+ "routers_loss": 0.0026681360322982073,
"skip_count": 0.0,
"step": 1632,
"text_loss": 0.7650400400161743
@@ -15521,13 +15521,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10888671875,
+ "grad_norm": 0.10498046875,
"learning_rate": 0.0009695692822158466,
- "loss": 0.0241,
+ "loss": 0.0242,
"macro_f1": 0.3272727429866791,
"num_tokens": 2638840.0,
"repeat_count": 1.0,
- "routers_loss": 0.04717390984296799,
+ "routers_loss": 0.033965807408094406,
"skip_count": 0.0,
"step": 1634,
"text_loss": 0.6175784468650818
@@ -15540,13 +15540,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0009694628613873968,
- "loss": 0.0179,
+ "loss": 0.018,
"macro_f1": 0.3333333432674408,
"num_tokens": 2641886.0,
"repeat_count": 0.0,
- "routers_loss": 0.0073657832108438015,
+ "routers_loss": 0.007568214554339647,
"skip_count": 0.0,
"step": 1636,
"text_loss": 0.43139931559562683
@@ -15559,13 +15559,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.193359375,
"learning_rate": 0.0009693562606598929,
- "loss": 0.0259,
+ "loss": 0.025,
"macro_f1": 0.3333333432674408,
"num_tokens": 2645028.0,
"repeat_count": 0.0,
- "routers_loss": 0.005212752148509026,
+ "routers_loss": 0.004973865579813719,
"skip_count": 0.0,
"step": 1638,
"text_loss": 0.6430339217185974
@@ -15578,13 +15578,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0009692494800741844,
- "loss": 0.0304,
+ "loss": 0.0313,
"macro_f1": 0.3272727429866791,
"num_tokens": 2648209.0,
"repeat_count": 1.0,
- "routers_loss": 0.04311618581414223,
+ "routers_loss": 0.049863800406455994,
"skip_count": 0.0,
"step": 1640,
"text_loss": 0.28138160705566406
@@ -15597,13 +15597,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08251953125,
+ "grad_norm": 0.08544921875,
"learning_rate": 0.0009691425196711901,
- "loss": 0.039,
+ "loss": 0.0398,
"macro_f1": 0.3272727429866791,
"num_tokens": 2651171.0,
"repeat_count": 0.0,
- "routers_loss": 0.02027471922338009,
+ "routers_loss": 0.02112230286002159,
"skip_count": 0.0,
"step": 1642,
"text_loss": 0.3745322525501251
@@ -15616,13 +15616,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009690353794918971,
- "loss": 0.0279,
+ "loss": 0.0275,
"macro_f1": 0.3333333432674408,
"num_tokens": 2654093.0,
"repeat_count": 0.0,
- "routers_loss": 0.003074956126511097,
+ "routers_loss": 0.0024304776452481747,
"skip_count": 0.0,
"step": 1644,
"text_loss": 0.4275154173374176
@@ -15635,13 +15635,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.0771484375,
"learning_rate": 0.000968928059577362,
- "loss": 0.0241,
+ "loss": 0.0244,
"macro_f1": 0.6666666865348816,
"num_tokens": 2657079.0,
"repeat_count": 0.0,
- "routers_loss": 0.009374706074595451,
+ "routers_loss": 0.009320619516074657,
"skip_count": 1.0,
"step": 1646,
"text_loss": 0.46650025248527527
@@ -15654,13 +15654,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1162109375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009688205599687099,
- "loss": 0.0218,
+ "loss": 0.0209,
"macro_f1": 0.3272727429866791,
"num_tokens": 2660951.0,
"repeat_count": 0.0,
- "routers_loss": 0.01204691268503666,
+ "routers_loss": 0.011913162656128407,
"skip_count": 0.0,
"step": 1648,
"text_loss": 0.46644100546836853
@@ -15673,13 +15673,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009687128807071347,
"loss": 0.0284,
"macro_f1": 0.3333333432674408,
"num_tokens": 2663823.0,
"repeat_count": 0.0,
- "routers_loss": 0.01376053225249052,
+ "routers_loss": 0.013754756189882755,
"skip_count": 0.0,
"step": 1650,
"text_loss": 0.40808847546577454
@@ -15692,13 +15692,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.103515625,
"learning_rate": 0.0009686050218338996,
- "loss": 0.0285,
+ "loss": 0.0286,
"macro_f1": 0.3333333432674408,
"num_tokens": 2667079.0,
"repeat_count": 0.0,
- "routers_loss": 0.009346984326839447,
+ "routers_loss": 0.009099726565182209,
"skip_count": 0.0,
"step": 1652,
"text_loss": 0.2389989197254181
@@ -15711,13 +15711,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.08837890625,
"learning_rate": 0.0009684969833903359,
- "loss": 0.0291,
+ "loss": 0.0283,
"macro_f1": 0.6666666865348816,
"num_tokens": 2670162.0,
"repeat_count": 0.0,
- "routers_loss": 0.002724624238908291,
+ "routers_loss": 0.0034928603563457727,
"skip_count": 1.0,
"step": 1654,
"text_loss": 0.6930749416351318
@@ -15730,13 +15730,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009683887654178445,
- "loss": 0.0271,
+ "loss": 0.0261,
"macro_f1": 0.6666666865348816,
"num_tokens": 2673031.0,
"repeat_count": 0.0,
- "routers_loss": 0.00823777075856924,
+ "routers_loss": 0.008340462110936642,
"skip_count": 1.0,
"step": 1656,
"text_loss": 0.277752548456192
@@ -15749,32 +15749,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07373046875,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009682803679578947,
- "loss": 0.0262,
+ "loss": 0.0259,
"macro_f1": 0.3333333432674408,
"num_tokens": 2676092.0,
"repeat_count": 0.0,
- "routers_loss": 0.004393119364976883,
+ "routers_loss": 0.004337446764111519,
"skip_count": 0.0,
"step": 1658,
"text_loss": 0.5176776051521301
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 7.7936601115350745,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.1513671875,
+ "f1_skip": 0.0,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009681717910520244,
- "loss": 0.024,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0242,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 2679479.0,
"repeat_count": 0.0,
- "routers_loss": 0.031827569007873535,
+ "routers_loss": 0.034611742943525314,
"skip_count": 2.0,
"step": 1660,
"text_loss": 0.21485982835292816
@@ -15789,11 +15789,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.07958984375,
"learning_rate": 0.0009680630347418406,
- "loss": 0.0216,
+ "loss": 0.022,
"macro_f1": 0.5492662787437439,
"num_tokens": 2683289.0,
"repeat_count": 0.0,
- "routers_loss": 0.03329647704958916,
+ "routers_loss": 0.03297121450304985,
"skip_count": 2.0,
"step": 1662,
"text_loss": 0.33801013231277466
@@ -15806,13 +15806,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.000967954099069019,
- "loss": 0.0415,
+ "loss": 0.0411,
"macro_f1": 0.32098764181137085,
"num_tokens": 2685879.0,
"repeat_count": 1.0,
- "routers_loss": 0.047317031770944595,
+ "routers_loss": 0.04551183059811592,
"skip_count": 1.0,
"step": 1664,
"text_loss": 0.41123488545417786
@@ -15827,11 +15827,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1240234375,
"learning_rate": 0.0009678449840753038,
- "loss": 0.0325,
+ "loss": 0.0324,
"macro_f1": 0.32098764181137085,
"num_tokens": 2688910.0,
"repeat_count": 0.0,
- "routers_loss": 0.05649980902671814,
+ "routers_loss": 0.05866450071334839,
"skip_count": 2.0,
"step": 1666,
"text_loss": 0.1740892380475998
@@ -15844,13 +15844,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009677356898025082,
- "loss": 0.0229,
+ "loss": 0.023,
"macro_f1": 0.3333333432674408,
"num_tokens": 2691680.0,
"repeat_count": 0.0,
- "routers_loss": 0.01004624180495739,
+ "routers_loss": 0.009243223816156387,
"skip_count": 0.0,
"step": 1668,
"text_loss": 0.2512350380420685
@@ -15863,13 +15863,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.09619140625,
"learning_rate": 0.000967626216292514,
- "loss": 0.0194,
+ "loss": 0.0195,
"macro_f1": 0.3333333432674408,
"num_tokens": 2694895.0,
"repeat_count": 0.0,
- "routers_loss": 0.0054973396472632885,
+ "routers_loss": 0.005576452240347862,
"skip_count": 0.0,
"step": 1670,
"text_loss": 0.43294376134872437
@@ -15882,13 +15882,13 @@
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.09130859375,
"learning_rate": 0.0009675165635872715,
- "loss": 0.031,
+ "loss": 0.0306,
"macro_f1": 0.44705885648727417,
"num_tokens": 2697806.0,
"repeat_count": 0.0,
- "routers_loss": 0.05615650862455368,
+ "routers_loss": 0.05372785031795502,
"skip_count": 3.0,
"step": 1672,
"text_loss": 0.1614082306623459
@@ -15901,13 +15901,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009674067317288,
- "loss": 0.0301,
+ "loss": 0.0296,
"macro_f1": 0.6666666865348816,
"num_tokens": 2700529.0,
"repeat_count": 1.0,
- "routers_loss": 0.012819192372262478,
+ "routers_loss": 0.018131591379642487,
"skip_count": 0.0,
"step": 1674,
"text_loss": 0.2093173861503601
@@ -15920,13 +15920,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.08203125,
"learning_rate": 0.0009672967207591869,
- "loss": 0.0253,
+ "loss": 0.0257,
"macro_f1": 0.3272727429866791,
"num_tokens": 2703650.0,
"repeat_count": 0.0,
- "routers_loss": 0.07059332728385925,
+ "routers_loss": 0.0673515796661377,
"skip_count": 1.0,
"step": 1676,
"text_loss": 0.3029400110244751
@@ -15939,13 +15939,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009671865307205892,
- "loss": 0.0198,
+ "loss": 0.021,
"macro_f1": 0.32098767161369324,
"num_tokens": 2707615.0,
"repeat_count": 0.0,
- "routers_loss": 0.029778441414237022,
+ "routers_loss": 0.03821169584989548,
"skip_count": 1.0,
"step": 1678,
"text_loss": 0.2262786477804184
@@ -15958,13 +15958,13 @@
"f1_execute": 0.9756097793579102,
"f1_repeat": 1.0,
"f1_skip": 0.9090909361839294,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0009670761616552315,
- "loss": 0.0474,
+ "loss": 0.0465,
"macro_f1": 0.9615669250488281,
"num_tokens": 2710894.0,
"repeat_count": 2.0,
- "routers_loss": 0.04371272772550583,
+ "routers_loss": 0.042625464498996735,
"skip_count": 6.0,
"step": 1680,
"text_loss": 0.29623574018478394
@@ -15977,13 +15977,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009669656136054074,
- "loss": 0.0293,
+ "loss": 0.0289,
"macro_f1": 0.3333333432674408,
"num_tokens": 2714330.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033591394312679768,
+ "routers_loss": 0.0037571541033685207,
"skip_count": 0.0,
"step": 1682,
"text_loss": 0.7510389089584351
@@ -15996,13 +15996,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.07421875,
"learning_rate": 0.0009668548866134795,
- "loss": 0.0259,
+ "loss": 0.0256,
"macro_f1": 0.3333333432674408,
"num_tokens": 2717176.0,
"repeat_count": 0.0,
- "routers_loss": 0.005085585173219442,
+ "routers_loss": 0.004142968449741602,
"skip_count": 0.0,
"step": 1684,
"text_loss": 0.3273485600948334
@@ -16015,13 +16015,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0712890625,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009667439807218783,
- "loss": 0.0243,
+ "loss": 0.0233,
"macro_f1": 0.6666666865348816,
"num_tokens": 2720628.0,
"repeat_count": 0.0,
- "routers_loss": 0.008569681085646152,
+ "routers_loss": 0.008753842674195766,
"skip_count": 2.0,
"step": 1686,
"text_loss": 0.4314708709716797
@@ -16034,32 +16034,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009666328959731033,
- "loss": 0.022,
+ "loss": 0.0211,
"macro_f1": 0.6603773832321167,
"num_tokens": 2723739.0,
"repeat_count": 1.0,
- "routers_loss": 0.024587804451584816,
+ "routers_loss": 0.022674910724163055,
"skip_count": 1.0,
"step": 1688,
"text_loss": 0.25734150409698486
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.3333333432674408,
- "avg_layers": 27.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
"epoch": 7.934546521866745,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
- "f1_skip": 0.5,
- "grad_norm": 0.169921875,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009665216324097222,
- "loss": 0.0332,
- "macro_f1": 0.4871794879436493,
+ "loss": 0.0324,
+ "macro_f1": 0.5934640765190125,
"num_tokens": 2726644.0,
"repeat_count": 0.0,
- "routers_loss": 0.037516288459300995,
+ "routers_loss": 0.03932750225067139,
"skip_count": 3.0,
"step": 1690,
"text_loss": 0.24511034786701202
@@ -16072,13 +16072,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.09765625,
"learning_rate": 0.0009664101900743714,
- "loss": 0.0262,
+ "loss": 0.0255,
"macro_f1": 0.3272727429866791,
"num_tokens": 2729662.0,
"repeat_count": 0.0,
- "routers_loss": 0.01287431176751852,
+ "routers_loss": 0.012672754004597664,
"skip_count": 1.0,
"step": 1692,
"text_loss": 0.39431414008140564
@@ -16091,13 +16091,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.07763671875,
+ "grad_norm": 0.076171875,
"learning_rate": 0.000966298569009756,
- "loss": 0.0227,
+ "loss": 0.0231,
"macro_f1": 0.5492662787437439,
"num_tokens": 2732578.0,
"repeat_count": 0.0,
- "routers_loss": 0.015499880537390709,
+ "routers_loss": 0.01548632513731718,
"skip_count": 2.0,
"step": 1694,
"text_loss": 0.12439999729394913
@@ -16110,13 +16110,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009661867692586494,
- "loss": 0.0144,
+ "loss": 0.0153,
"macro_f1": 0.32098764181137085,
"num_tokens": 2735887.0,
"repeat_count": 0.0,
- "routers_loss": 0.049878787249326706,
+ "routers_loss": 0.05622401833534241,
"skip_count": 2.0,
"step": 1696,
"text_loss": 0.29024389386177063
@@ -16129,13 +16129,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.087890625,
"learning_rate": 0.0009660747908638933,
- "loss": 0.0206,
+ "loss": 0.0205,
"macro_f1": 0.3272727429866791,
"num_tokens": 2739293.0,
"repeat_count": 0.0,
- "routers_loss": 0.04108169302344322,
+ "routers_loss": 0.041060201823711395,
"skip_count": 1.0,
"step": 1698,
"text_loss": 0.39461007714271545
@@ -16148,13 +16148,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.1767578125,
"learning_rate": 0.0009659626338683981,
- "loss": 0.0367,
+ "loss": 0.0369,
"macro_f1": 0.3333333432674408,
"num_tokens": 2742468.0,
"repeat_count": 0.0,
- "routers_loss": 0.007651917636394501,
+ "routers_loss": 0.007251353468745947,
"skip_count": 0.0,
"step": 1700,
"text_loss": 0.2751767635345459
@@ -16167,13 +16167,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.07763671875,
"learning_rate": 0.0009658502983151427,
- "loss": 0.0182,
+ "loss": 0.0186,
"macro_f1": 0.3272727429866791,
"num_tokens": 2745123.0,
"repeat_count": 0.0,
- "routers_loss": 0.015448091551661491,
+ "routers_loss": 0.012847424484789371,
"skip_count": 1.0,
"step": 1702,
"text_loss": 0.4756404757499695
@@ -16186,13 +16186,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.11767578125,
"learning_rate": 0.0009657377842471742,
- "loss": 0.0324,
+ "loss": 0.0313,
"macro_f1": 0.6666666865348816,
"num_tokens": 2748016.0,
"repeat_count": 0.0,
- "routers_loss": 0.009139287285506725,
+ "routers_loss": 0.007060411386191845,
"skip_count": 1.0,
"step": 1704,
"text_loss": 0.9571210145950317
@@ -16205,13 +16205,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.10009765625,
"learning_rate": 0.0009656250917076081,
- "loss": 0.0191,
+ "loss": 0.0188,
"macro_f1": 0.5492662787437439,
"num_tokens": 2750717.0,
"repeat_count": 0.0,
- "routers_loss": 0.015412120148539543,
+ "routers_loss": 0.016748681664466858,
"skip_count": 2.0,
"step": 1706,
"text_loss": 0.14542843401432037
@@ -16224,13 +16224,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.060302734375,
"learning_rate": 0.0009655122207396285,
- "loss": 0.0175,
+ "loss": 0.017,
"macro_f1": 0.3333333432674408,
"num_tokens": 2753635.0,
"repeat_count": 0.0,
- "routers_loss": 0.012735052965581417,
+ "routers_loss": 0.013607042841613293,
"skip_count": 0.0,
"step": 1708,
"text_loss": 0.21836471557617188
@@ -16243,13 +16243,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07177734375,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009653991713864878,
- "loss": 0.0192,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2756643.0,
"repeat_count": 0.0,
- "routers_loss": 0.00114025070797652,
+ "routers_loss": 0.0012097888393327594,
"skip_count": 0.0,
"step": 1710,
"text_loss": 0.635187029838562
@@ -16262,13 +16262,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1171875,
"learning_rate": 0.0009652859436915066,
- "loss": 0.0243,
+ "loss": 0.0231,
"macro_f1": 0.3333333432674408,
"num_tokens": 2759432.0,
"repeat_count": 0.0,
- "routers_loss": 0.006401443853974342,
+ "routers_loss": 0.006196760106831789,
"skip_count": 0.0,
"step": 1712,
"text_loss": 0.5629420876502991
@@ -16281,13 +16281,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0009651725376980743,
- "loss": 0.0185,
+ "loss": 0.0177,
"macro_f1": 0.3333333432674408,
"num_tokens": 2762538.0,
"repeat_count": 0.0,
- "routers_loss": 0.004316259175539017,
+ "routers_loss": 0.0042513771913945675,
"skip_count": 0.0,
"step": 1714,
"text_loss": 0.39522525668144226
@@ -16300,13 +16300,13 @@
"f1_execute": 0.9583333134651184,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.125,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009650589534496479,
- "loss": 0.0201,
+ "loss": 0.0194,
"macro_f1": 0.8194444179534912,
"num_tokens": 2765571.0,
"repeat_count": 2.0,
- "routers_loss": 0.043461959809064865,
+ "routers_loss": 0.03596706688404083,
"skip_count": 3.0,
"step": 1716,
"text_loss": 0.6252416968345642
@@ -16319,13 +16319,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.04833984375,
"learning_rate": 0.0009649451909897532,
"loss": 0.0178,
"macro_f1": 0.3333333432674408,
"num_tokens": 2769206.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024530428927391768,
+ "routers_loss": 0.0025788163766264915,
"skip_count": 0.0,
"step": 1718,
"text_loss": 0.8851634860038757
@@ -16338,13 +16338,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.0009648312503619843,
- "loss": 0.026,
+ "loss": 0.0265,
"macro_f1": 0.3333333432674408,
"num_tokens": 2772488.0,
"repeat_count": 0.0,
- "routers_loss": 0.0046626063995063305,
+ "routers_loss": 0.004443451762199402,
"skip_count": 0.0,
"step": 1720,
"text_loss": 0.8568580746650696
@@ -16357,13 +16357,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009647171316100034,
- "loss": 0.0257,
+ "loss": 0.0265,
"macro_f1": 0.9265305995941162,
"num_tokens": 2776482.0,
"repeat_count": 1.0,
- "routers_loss": 0.02480102889239788,
+ "routers_loss": 0.022948263213038445,
"skip_count": 3.0,
"step": 1722,
"text_loss": 0.13431036472320557
@@ -16376,13 +16376,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.0009646028347775409,
- "loss": 0.02,
+ "loss": 0.0204,
"macro_f1": 0.6666666865348816,
"num_tokens": 2778966.0,
"repeat_count": 0.0,
- "routers_loss": 0.012629947625100613,
+ "routers_loss": 0.011328035034239292,
"skip_count": 1.0,
"step": 1724,
"text_loss": 0.2085491120815277
@@ -16395,13 +16395,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08447265625,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009644883599083958,
"loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 2781968.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024127380456775427,
+ "routers_loss": 0.002208018908277154,
"skip_count": 0.0,
"step": 1726,
"text_loss": 0.4948323965072632
@@ -16414,13 +16414,13 @@
"f1_execute": 0.9411764740943909,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.054443359375,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009643737070464349,
- "loss": 0.0162,
+ "loss": 0.0158,
"macro_f1": 0.6470588445663452,
"num_tokens": 2784666.0,
"repeat_count": 1.0,
- "routers_loss": 0.0415453165769577,
+ "routers_loss": 0.04391832649707794,
"skip_count": 2.0,
"step": 1728,
"text_loss": 0.39060094952583313
@@ -16433,13 +16433,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.046630859375,
"learning_rate": 0.0009642588762355935,
- "loss": 0.0211,
+ "loss": 0.0212,
"macro_f1": 0.6666666865348816,
"num_tokens": 2787558.0,
"repeat_count": 0.0,
- "routers_loss": 0.0056681083515286446,
+ "routers_loss": 0.004497280344367027,
"skip_count": 1.0,
"step": 1730,
"text_loss": 0.34908708930015564
@@ -16452,13 +16452,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009641438675198748,
- "loss": 0.0189,
+ "loss": 0.0175,
"macro_f1": 0.3333333432674408,
"num_tokens": 2790474.0,
"repeat_count": 0.0,
- "routers_loss": 0.006391602102667093,
+ "routers_loss": 0.00583475548774004,
"skip_count": 0.0,
"step": 1732,
"text_loss": 0.5720033049583435
@@ -16471,13 +16471,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0595703125,
+ "grad_norm": 0.08154296875,
"learning_rate": 0.0009640286809433508,
- "loss": 0.0229,
+ "loss": 0.0235,
"macro_f1": 0.3333333432674408,
"num_tokens": 2793272.0,
"repeat_count": 0.0,
- "routers_loss": 0.007466991897672415,
+ "routers_loss": 0.007826375775039196,
"skip_count": 0.0,
"step": 1734,
"text_loss": 0.32181721925735474
@@ -16490,13 +16490,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0009639133165501606,
- "loss": 0.0197,
+ "loss": 0.0192,
"macro_f1": 0.3333333432674408,
"num_tokens": 2797726.0,
"repeat_count": 0.0,
- "routers_loss": 0.001953453291207552,
+ "routers_loss": 0.0019055595621466637,
"skip_count": 0.0,
"step": 1736,
"text_loss": 0.620936393737793
@@ -16509,13 +16509,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009637977743845124,
- "loss": 0.0223,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2800706.0,
"repeat_count": 0.0,
- "routers_loss": 0.003612719476222992,
+ "routers_loss": 0.0028302327264100313,
"skip_count": 0.0,
"step": 1738,
"text_loss": 0.6473138332366943
@@ -16528,13 +16528,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.049072265625,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009636820544906823,
- "loss": 0.0145,
+ "loss": 0.0146,
"macro_f1": 1.0,
"num_tokens": 2803847.0,
"repeat_count": 1.0,
- "routers_loss": 0.009977150708436966,
+ "routers_loss": 0.01105099730193615,
"skip_count": 2.0,
"step": 1740,
"text_loss": 0.4401201903820038
@@ -16547,13 +16547,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009635661569130141,
"loss": 0.0195,
"macro_f1": 0.5934640765190125,
"num_tokens": 2807235.0,
"repeat_count": 0.0,
- "routers_loss": 0.026468059048056602,
+ "routers_loss": 0.02619045600295067,
"skip_count": 3.0,
"step": 1742,
"text_loss": 0.459264874458313
@@ -16566,13 +16566,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0009634500816959202,
- "loss": 0.0165,
+ "loss": 0.0162,
"macro_f1": 0.6666666865348816,
"num_tokens": 2810396.0,
"repeat_count": 0.0,
- "routers_loss": 0.00849854201078415,
+ "routers_loss": 0.007915694266557693,
"skip_count": 2.0,
"step": 1744,
"text_loss": 0.5084020495414734
@@ -16585,13 +16585,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009633338288838805,
- "loss": 0.0275,
+ "loss": 0.0271,
"macro_f1": 0.5492662787437439,
"num_tokens": 2813215.0,
"repeat_count": 2.0,
- "routers_loss": 0.08082596957683563,
+ "routers_loss": 0.08364596217870712,
"skip_count": 0.0,
"step": 1746,
"text_loss": 0.27681824564933777
@@ -16604,13 +16604,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.046142578125,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009632173985214438,
- "loss": 0.015,
+ "loss": 0.0156,
"macro_f1": 0.8817967176437378,
"num_tokens": 2816452.0,
"repeat_count": 3.0,
- "routers_loss": 0.029500717297196388,
+ "routers_loss": 0.028805451467633247,
"skip_count": 2.0,
"step": 1748,
"text_loss": 0.4678419530391693
@@ -16623,13 +16623,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.0625,
"learning_rate": 0.000963100790653226,
- "loss": 0.0183,
+ "loss": 0.0188,
"macro_f1": 0.3272727429866791,
"num_tokens": 2819364.0,
"repeat_count": 0.0,
- "routers_loss": 0.025238536298274994,
+ "routers_loss": 0.03056817688047886,
"skip_count": 1.0,
"step": 1750,
"text_loss": 0.3078109920024872
@@ -16642,13 +16642,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0703125,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009629840053239116,
- "loss": 0.0204,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2823469.0,
"repeat_count": 0.0,
- "routers_loss": 0.002069319598376751,
+ "routers_loss": 0.0019477814203128219,
"skip_count": 0.0,
"step": 1752,
"text_loss": 0.45501336455345154
@@ -16661,13 +16661,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.057373046875,
"learning_rate": 0.000962867042578253,
- "loss": 0.0169,
+ "loss": 0.0173,
"macro_f1": 0.3333333432674408,
"num_tokens": 2826716.0,
"repeat_count": 0.0,
- "routers_loss": 0.002853946527466178,
+ "routers_loss": 0.0032963966950774193,
"skip_count": 0.0,
"step": 1754,
"text_loss": 0.49234694242477417
@@ -16680,13 +16680,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009627499024610707,
- "loss": 0.0236,
+ "loss": 0.0239,
"macro_f1": 0.3272727429866791,
"num_tokens": 2829733.0,
"repeat_count": 0.0,
- "routers_loss": 0.0100983502343297,
+ "routers_loss": 0.010289114899933338,
"skip_count": 1.0,
"step": 1756,
"text_loss": 0.22335539758205414
@@ -16699,13 +16699,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09228515625,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009626325850172527,
- "loss": 0.0173,
+ "loss": 0.0174,
"macro_f1": 0.3272727429866791,
"num_tokens": 2833350.0,
"repeat_count": 0.0,
- "routers_loss": 0.031218983232975006,
+ "routers_loss": 0.03249066323041916,
"skip_count": 1.0,
"step": 1758,
"text_loss": 0.6581931114196777
@@ -16718,13 +16718,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009625150902917555,
- "loss": 0.019,
+ "loss": 0.0185,
"macro_f1": 0.3333333432674408,
"num_tokens": 2836558.0,
"repeat_count": 0.0,
- "routers_loss": 0.010347879491746426,
+ "routers_loss": 0.00870000571012497,
"skip_count": 0.0,
"step": 1760,
"text_loss": 0.22938725352287292
@@ -16737,13 +16737,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1455078125,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009623974183296031,
- "loss": 0.0193,
+ "loss": 0.0192,
"macro_f1": 0.3333333432674408,
"num_tokens": 2840560.0,
"repeat_count": 0.0,
- "routers_loss": 0.007768871728330851,
+ "routers_loss": 0.007767196744680405,
"skip_count": 0.0,
"step": 1762,
"text_loss": 0.24473799765110016
@@ -16756,13 +16756,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009622795691758876,
- "loss": 0.0253,
+ "loss": 0.0244,
"macro_f1": 0.3333333432674408,
"num_tokens": 2843548.0,
"repeat_count": 0.0,
- "routers_loss": 0.002887974726036191,
+ "routers_loss": 0.0021693643648177385,
"skip_count": 0.0,
"step": 1764,
"text_loss": 0.3084608018398285
@@ -16777,11 +16777,11 @@
"f1_skip": 0.0,
"grad_norm": 0.0498046875,
"learning_rate": 0.0009621615428757693,
- "loss": 0.0147,
+ "loss": 0.0149,
"macro_f1": 0.3333333432674408,
"num_tokens": 2847076.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027294005267322063,
+ "routers_loss": 0.0024727333802729845,
"skip_count": 0.0,
"step": 1766,
"text_loss": 0.5251734852790833
@@ -16794,13 +16794,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.000962043339474476,
- "loss": 0.0193,
+ "loss": 0.0194,
"macro_f1": 0.3333333432674408,
"num_tokens": 2849751.0,
"repeat_count": 0.0,
- "routers_loss": 0.00543541694059968,
+ "routers_loss": 0.005174890160560608,
"skip_count": 0.0,
"step": 1768,
"text_loss": 0.4410129189491272
@@ -16813,13 +16813,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.06103515625,
"learning_rate": 0.0009619249590173032,
- "loss": 0.0167,
+ "loss": 0.016,
"macro_f1": 0.6666666865348816,
"num_tokens": 2853916.0,
"repeat_count": 0.0,
- "routers_loss": 0.006514009553939104,
+ "routers_loss": 0.006785830482840538,
"skip_count": 2.0,
"step": 1770,
"text_loss": 0.550076425075531
@@ -16832,13 +16832,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.06591796875,
"learning_rate": 0.0009618064015496149,
- "loss": 0.019,
+ "loss": 0.0192,
"macro_f1": 0.5934640765190125,
"num_tokens": 2857372.0,
"repeat_count": 0.0,
- "routers_loss": 0.02333846502006054,
+ "routers_loss": 0.021370256319642067,
"skip_count": 3.0,
"step": 1772,
"text_loss": 0.1988629847764969
@@ -16851,13 +16851,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.072265625,
"learning_rate": 0.0009616876671168423,
- "loss": 0.0165,
+ "loss": 0.0162,
"macro_f1": 0.6666666865348816,
"num_tokens": 2861028.0,
"repeat_count": 0.0,
- "routers_loss": 0.004471905063837767,
+ "routers_loss": 0.004313841462135315,
"skip_count": 1.0,
"step": 1774,
"text_loss": 0.42581331729888916
@@ -16870,13 +16870,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009615687557644847,
- "loss": 0.0261,
+ "loss": 0.0268,
"macro_f1": 0.3333333432674408,
"num_tokens": 2864847.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024362702388316393,
+ "routers_loss": 0.0025742491707205772,
"skip_count": 0.0,
"step": 1776,
"text_loss": 0.46510905027389526
@@ -16889,13 +16889,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009614496675381093,
- "loss": 0.0116,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 2867392.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021166049409657717,
+ "routers_loss": 0.0016813480760902166,
"skip_count": 0.0,
"step": 1778,
"text_loss": 0.5922174453735352
@@ -16908,13 +16908,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0712890625,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.0009613304024833507,
"loss": 0.0166,
"macro_f1": 0.3333333432674408,
"num_tokens": 2871273.0,
"repeat_count": 0.0,
- "routers_loss": 0.004722296260297298,
+ "routers_loss": 0.004948933608829975,
"skip_count": 0.0,
"step": 1780,
"text_loss": 0.6776977777481079
@@ -16929,11 +16929,11 @@
"f1_skip": 1.0,
"grad_norm": 0.07470703125,
"learning_rate": 0.0009612109606459117,
- "loss": 0.0199,
+ "loss": 0.0186,
"macro_f1": 1.0,
"num_tokens": 2874172.0,
"repeat_count": 1.0,
- "routers_loss": 0.014188882894814014,
+ "routers_loss": 0.016950147226452827,
"skip_count": 2.0,
"step": 1782,
"text_loss": 0.48758944869041443
@@ -16946,13 +16946,13 @@
"f1_execute": 0.9599999785423279,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.076171875,
+ "grad_norm": 0.08251953125,
"learning_rate": 0.0009610913420715623,
- "loss": 0.0241,
+ "loss": 0.0237,
"macro_f1": 0.7644444704055786,
"num_tokens": 2877528.0,
"repeat_count": 2.0,
- "routers_loss": 0.04599560424685478,
+ "routers_loss": 0.04880943149328232,
"skip_count": 1.0,
"step": 1784,
"text_loss": 0.4404778480529785
@@ -16965,13 +16965,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.06201171875,
"learning_rate": 0.0009609715468061411,
- "loss": 0.0216,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2880627.0,
"repeat_count": 0.0,
- "routers_loss": 0.004942454397678375,
+ "routers_loss": 0.004678630735725164,
"skip_count": 0.0,
"step": 1786,
"text_loss": 0.7295402884483337
@@ -16984,13 +16984,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08349609375,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0009608515748955535,
- "loss": 0.021,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2883333.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020542226266115904,
+ "routers_loss": 0.0026695074047893286,
"skip_count": 0.0,
"step": 1788,
"text_loss": 0.9697831273078918
@@ -17003,13 +17003,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.1171875,
+ "grad_norm": 0.107421875,
"learning_rate": 0.000960731426385773,
- "loss": 0.0155,
+ "loss": 0.0157,
"macro_f1": 0.4871794879436493,
"num_tokens": 2887444.0,
"repeat_count": 0.0,
- "routers_loss": 0.0397041030228138,
+ "routers_loss": 0.029743613675236702,
"skip_count": 2.0,
"step": 1790,
"text_loss": 0.4737568199634552
@@ -17022,13 +17022,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.10107421875,
"learning_rate": 0.0009606111013228407,
- "loss": 0.0204,
+ "loss": 0.0207,
"macro_f1": 0.3333333432674408,
"num_tokens": 2890221.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017490010941401124,
+ "routers_loss": 0.0016153788892552257,
"skip_count": 0.0,
"step": 1792,
"text_loss": 0.6693558096885681
@@ -17041,13 +17041,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08251953125,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009604905997528655,
- "loss": 0.021,
+ "loss": 0.02,
"macro_f1": 0.3272727429866791,
"num_tokens": 2893262.0,
"repeat_count": 0.0,
- "routers_loss": 0.023590171709656715,
+ "routers_loss": 0.01965433731675148,
"skip_count": 1.0,
"step": 1794,
"text_loss": 0.45227760076522827
@@ -17060,13 +17060,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.08642578125,
"learning_rate": 0.0009603699217220239,
- "loss": 0.0125,
+ "loss": 0.0117,
"macro_f1": 0.6601307392120361,
"num_tokens": 2896823.0,
"repeat_count": 1.0,
- "routers_loss": 0.02458076737821102,
+ "routers_loss": 0.024017298594117165,
"skip_count": 2.0,
"step": 1796,
"text_loss": 0.48865509033203125
@@ -17079,13 +17079,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.08837890625,
"learning_rate": 0.0009602490672765597,
- "loss": 0.019,
+ "loss": 0.0182,
"macro_f1": 0.3333333432674408,
"num_tokens": 2899707.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014341498026624322,
+ "routers_loss": 0.0012420224957168102,
"skip_count": 0.0,
"step": 1798,
"text_loss": 0.43292415142059326
@@ -17098,13 +17098,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08056640625,
+ "grad_norm": 0.07861328125,
"learning_rate": 0.0009601280364627848,
- "loss": 0.02,
+ "loss": 0.0196,
"macro_f1": 0.3333333432674408,
"num_tokens": 2902795.0,
"repeat_count": 0.0,
- "routers_loss": 0.00213223067112267,
+ "routers_loss": 0.0020389219280332327,
"skip_count": 0.0,
"step": 1800,
"text_loss": 0.41021591424942017
@@ -17117,13 +17117,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07275390625,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009600068293270783,
- "loss": 0.0147,
+ "loss": 0.0142,
"macro_f1": 0.3333333432674408,
"num_tokens": 2905769.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027340995147824287,
+ "routers_loss": 0.002006303984671831,
"skip_count": 0.0,
"step": 1802,
"text_loss": 0.46892106533050537
@@ -17136,32 +17136,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.000959885445915887,
- "loss": 0.0172,
+ "loss": 0.017,
"macro_f1": 0.3333333432674408,
"num_tokens": 2909475.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035587961319833994,
+ "routers_loss": 0.003734810510650277,
"skip_count": 0.0,
"step": 1804,
"text_loss": 0.45364710688591003
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.5,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 8.479013795127678,
- "f1_execute": 0.9615384340286255,
- "f1_repeat": 0.0,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009597638862757254,
- "loss": 0.0187,
- "macro_f1": 0.5427350401878357,
+ "loss": 0.0182,
+ "macro_f1": 0.8823530077934265,
"num_tokens": 2914348.0,
"repeat_count": 1.0,
- "routers_loss": 0.04446055367588997,
+ "routers_loss": 0.038971323519945145,
"skip_count": 2.0,
"step": 1806,
"text_loss": 0.42913779616355896
@@ -17174,13 +17174,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08447265625,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009596421504531751,
- "loss": 0.0244,
+ "loss": 0.0249,
"macro_f1": 0.3272727429866791,
"num_tokens": 2917467.0,
"repeat_count": 1.0,
- "routers_loss": 0.05095123499631882,
+ "routers_loss": 0.04800829663872719,
"skip_count": 0.0,
"step": 1808,
"text_loss": 0.17332297563552856
@@ -17193,13 +17193,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009595202384948858,
- "loss": 0.0232,
+ "loss": 0.0227,
"macro_f1": 0.6666666865348816,
"num_tokens": 2920223.0,
"repeat_count": 1.0,
- "routers_loss": 0.008440068922936916,
+ "routers_loss": 0.009164143353700638,
"skip_count": 0.0,
"step": 1810,
"text_loss": 0.33740702271461487
@@ -17212,13 +17212,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0927734375,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009593981504475742,
- "loss": 0.0273,
+ "loss": 0.0275,
"macro_f1": 0.6666666865348816,
"num_tokens": 2923780.0,
"repeat_count": 0.0,
- "routers_loss": 0.012230116873979568,
+ "routers_loss": 0.011236993595957756,
"skip_count": 2.0,
"step": 1812,
"text_loss": 0.1609916388988495
@@ -17231,13 +17231,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009592758863580248,
- "loss": 0.026,
+ "loss": 0.0259,
"macro_f1": 0.5492662787437439,
"num_tokens": 2926259.0,
"repeat_count": 0.0,
- "routers_loss": 0.017307188361883163,
+ "routers_loss": 0.019026532769203186,
"skip_count": 2.0,
"step": 1814,
"text_loss": 0.6460903882980347
@@ -17250,13 +17250,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009591534462730894,
- "loss": 0.0215,
+ "loss": 0.0206,
"macro_f1": 0.5492662787437439,
"num_tokens": 2929173.0,
"repeat_count": 2.0,
- "routers_loss": 0.07191162556409836,
+ "routers_loss": 0.0608333982527256,
"skip_count": 0.0,
"step": 1816,
"text_loss": 0.476126492023468
@@ -17269,13 +17269,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.06640625,
"learning_rate": 0.000959030830239687,
- "loss": 0.0182,
+ "loss": 0.0175,
"macro_f1": 0.3333333432674408,
"num_tokens": 2932703.0,
"repeat_count": 0.0,
- "routers_loss": 0.008753604255616665,
+ "routers_loss": 0.0093300249427557,
"skip_count": 0.0,
"step": 1818,
"text_loss": 0.5471875667572021
@@ -17288,13 +17288,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19921875,
+ "grad_norm": 0.2001953125,
"learning_rate": 0.0009589080383048048,
- "loss": 0.0233,
+ "loss": 0.0235,
"macro_f1": 0.3333333432674408,
"num_tokens": 2936195.0,
"repeat_count": 0.0,
- "routers_loss": 0.008390828967094421,
+ "routers_loss": 0.010434109717607498,
"skip_count": 0.0,
"step": 1820,
"text_loss": 0.5068115592002869
@@ -17307,13 +17307,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009587850705154964,
"loss": 0.0291,
"macro_f1": 0.3333333432674408,
"num_tokens": 2939412.0,
"repeat_count": 0.0,
- "routers_loss": 0.005617359187453985,
+ "routers_loss": 0.004347751382738352,
"skip_count": 0.0,
"step": 1822,
"text_loss": 0.4241984784603119
@@ -17326,13 +17326,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.0859375,
"learning_rate": 0.0009586619269188836,
- "loss": 0.0227,
+ "loss": 0.0224,
"macro_f1": 0.32098767161369324,
"num_tokens": 2942318.0,
"repeat_count": 0.0,
- "routers_loss": 0.0346846878528595,
+ "routers_loss": 0.034238871186971664,
"skip_count": 1.0,
"step": 1824,
"text_loss": 0.2328975349664688
@@ -17345,32 +17345,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009585386075621553,
"loss": 0.027,
"macro_f1": 0.3333333432674408,
"num_tokens": 2945731.0,
"repeat_count": 0.0,
- "routers_loss": 0.006601692643016577,
+ "routers_loss": 0.006097695790231228,
"skip_count": 0.0,
"step": 1826,
"text_loss": 0.22816994786262512
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 8.582330496037569,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.08837890625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009584151124925676,
- "loss": 0.0207,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0208,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2948944.0,
"repeat_count": 0.0,
- "routers_loss": 0.0065619745291769505,
+ "routers_loss": 0.007790776435285807,
"skip_count": 1.0,
"step": 1828,
"text_loss": 0.5009413361549377
@@ -17383,13 +17383,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009582914417574438,
- "loss": 0.0149,
+ "loss": 0.0145,
"macro_f1": 0.6666666865348816,
"num_tokens": 2951723.0,
"repeat_count": 0.0,
- "routers_loss": 0.011109639890491962,
+ "routers_loss": 0.009144559502601624,
"skip_count": 2.0,
"step": 1830,
"text_loss": 0.1402502954006195
@@ -17404,11 +17404,11 @@
"f1_skip": 0.0,
"grad_norm": 0.06201171875,
"learning_rate": 0.0009581675954041751,
- "loss": 0.0167,
+ "loss": 0.0166,
"macro_f1": 0.6666666865348816,
"num_tokens": 2954726.0,
"repeat_count": 1.0,
- "routers_loss": 0.008432094007730484,
+ "routers_loss": 0.006593191530555487,
"skip_count": 0.0,
"step": 1832,
"text_loss": 0.4871736466884613
@@ -17421,13 +17421,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0859375,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0009580435734802196,
- "loss": 0.0208,
+ "loss": 0.0206,
"macro_f1": 0.3333333432674408,
"num_tokens": 2957853.0,
"repeat_count": 0.0,
- "routers_loss": 0.011518111452460289,
+ "routers_loss": 0.01241068821400404,
"skip_count": 0.0,
"step": 1834,
"text_loss": 0.30100154876708984
@@ -17440,13 +17440,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009579193760331027,
- "loss": 0.0211,
+ "loss": 0.022,
"macro_f1": 0.3333333432674408,
"num_tokens": 2960783.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026744187343865633,
+ "routers_loss": 0.002219218760728836,
"skip_count": 0.0,
"step": 1836,
"text_loss": 0.4961516559123993
@@ -17459,13 +17459,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009577950031104169,
- "loss": 0.0165,
+ "loss": 0.0166,
"macro_f1": 0.6601307392120361,
"num_tokens": 2963328.0,
"repeat_count": 1.0,
- "routers_loss": 0.028107430785894394,
+ "routers_loss": 0.029363535344600677,
"skip_count": 2.0,
"step": 1838,
"text_loss": 0.42814353108406067
@@ -17478,13 +17478,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 1.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009576704547598226,
- "loss": 0.0263,
+ "loss": 0.0257,
"macro_f1": 0.7795917987823486,
"num_tokens": 2966108.0,
"repeat_count": 1.0,
- "routers_loss": 0.060007549822330475,
+ "routers_loss": 0.0579402856528759,
"skip_count": 4.0,
"step": 1840,
"text_loss": 0.20523512363433838
@@ -17497,13 +17497,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009575457310290463,
"loss": 0.0121,
"macro_f1": 0.3272727429866791,
"num_tokens": 2969137.0,
"repeat_count": 0.0,
- "routers_loss": 0.01074182614684105,
+ "routers_loss": 0.008810589089989662,
"skip_count": 0.0,
"step": 1842,
"text_loss": 0.6199528574943542
@@ -17516,13 +17516,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009574208319658831,
- "loss": 0.0213,
+ "loss": 0.0208,
"macro_f1": 0.6666666865348816,
"num_tokens": 2972407.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019638657104223967,
+ "routers_loss": 0.0012295129708945751,
"skip_count": 1.0,
"step": 1844,
"text_loss": 0.66938316822052
@@ -17535,13 +17535,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.000957295757618194,
- "loss": 0.0156,
+ "loss": 0.0152,
"macro_f1": 0.4871794879436493,
"num_tokens": 2976045.0,
"repeat_count": 0.0,
- "routers_loss": 0.06953249871730804,
+ "routers_loss": 0.06162935495376587,
"skip_count": 2.0,
"step": 1846,
"text_loss": 0.5381782650947571
@@ -17554,13 +17554,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009571705080339079,
- "loss": 0.0154,
+ "loss": 0.0144,
"macro_f1": 0.3333333432674408,
"num_tokens": 2979025.0,
"repeat_count": 0.0,
- "routers_loss": 0.003563052974641323,
+ "routers_loss": 0.003950524143874645,
"skip_count": 0.0,
"step": 1848,
"text_loss": 0.5831671357154846
@@ -17573,13 +17573,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.11376953125,
"learning_rate": 0.0009570450832610208,
- "loss": 0.0216,
+ "loss": 0.0209,
"macro_f1": 0.3333333432674408,
"num_tokens": 2982276.0,
"repeat_count": 0.0,
- "routers_loss": 0.010409255512058735,
+ "routers_loss": 0.010354886762797832,
"skip_count": 0.0,
"step": 1850,
"text_loss": 0.27448201179504395
@@ -17592,13 +17592,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0009569194833475956,
- "loss": 0.0195,
+ "loss": 0.0199,
"macro_f1": 0.3272727429866791,
"num_tokens": 2985691.0,
"repeat_count": 0.0,
- "routers_loss": 0.009769548662006855,
+ "routers_loss": 0.010167439468204975,
"skip_count": 0.0,
"step": 1852,
"text_loss": 0.5264663696289062
@@ -17611,13 +17611,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1181640625,
+ "grad_norm": 0.1328125,
"learning_rate": 0.0009567937083417624,
- "loss": 0.0184,
+ "loss": 0.0194,
"macro_f1": 0.3272727429866791,
"num_tokens": 2989126.0,
"repeat_count": 0.0,
- "routers_loss": 0.036616452038288116,
+ "routers_loss": 0.0371871180832386,
"skip_count": 1.0,
"step": 1854,
"text_loss": 0.2008018046617508
@@ -17630,13 +17630,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.0009566677582917185,
- "loss": 0.0192,
+ "loss": 0.0184,
"macro_f1": 0.3333333432674408,
"num_tokens": 2992814.0,
"repeat_count": 0.0,
- "routers_loss": 0.009581349790096283,
+ "routers_loss": 0.010190588422119617,
"skip_count": 0.0,
"step": 1856,
"text_loss": 0.749717116355896
@@ -17649,13 +17649,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009565416332457282,
- "loss": 0.0138,
+ "loss": 0.0132,
"macro_f1": 0.6538461446762085,
"num_tokens": 2995729.0,
"repeat_count": 1.0,
- "routers_loss": 0.02330300398170948,
+ "routers_loss": 0.022285036742687225,
"skip_count": 1.0,
"step": 1858,
"text_loss": 0.5870219469070435
@@ -17668,13 +17668,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009564153332521228,
- "loss": 0.0226,
+ "loss": 0.0224,
"macro_f1": 0.3272727429866791,
"num_tokens": 2998812.0,
"repeat_count": 0.0,
- "routers_loss": 0.011985735036432743,
+ "routers_loss": 0.011050296947360039,
"skip_count": 1.0,
"step": 1860,
"text_loss": 0.8444408774375916
@@ -17687,13 +17687,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.06005859375,
"learning_rate": 0.0009562888583593005,
- "loss": 0.0162,
+ "loss": 0.0163,
"macro_f1": 0.3333333432674408,
"num_tokens": 3001799.0,
"repeat_count": 0.0,
- "routers_loss": 0.005997250322252512,
+ "routers_loss": 0.007125461008399725,
"skip_count": 0.0,
"step": 1862,
"text_loss": 0.41510361433029175
@@ -17706,13 +17706,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009561622086157272,
- "loss": 0.0243,
+ "loss": 0.0236,
"macro_f1": 0.3333333432674408,
"num_tokens": 3005088.0,
"repeat_count": 0.0,
- "routers_loss": 0.004814761225134134,
+ "routers_loss": 0.0049054501578211784,
"skip_count": 0.0,
"step": 1864,
"text_loss": 0.3801248073577881
@@ -17725,13 +17725,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.056884765625,
+ "grad_norm": 0.054443359375,
"learning_rate": 0.000956035384069935,
- "loss": 0.0242,
+ "loss": 0.0238,
"macro_f1": 1.0,
"num_tokens": 3008178.0,
"repeat_count": 1.0,
- "routers_loss": 0.004750931169837713,
+ "routers_loss": 0.005162427201867104,
"skip_count": 1.0,
"step": 1866,
"text_loss": 0.2687684893608093
@@ -17744,13 +17744,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.10400390625,
"learning_rate": 0.0009559083847705233,
- "loss": 0.0216,
+ "loss": 0.0214,
"macro_f1": 0.3272727429866791,
"num_tokens": 3010923.0,
"repeat_count": 0.0,
- "routers_loss": 0.038251202553510666,
+ "routers_loss": 0.028984658420085907,
"skip_count": 1.0,
"step": 1868,
"text_loss": 0.6277349591255188
@@ -17763,13 +17763,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009557812107661584,
- "loss": 0.0204,
+ "loss": 0.0208,
"macro_f1": 1.0,
"num_tokens": 3015030.0,
"repeat_count": 1.0,
- "routers_loss": 0.010951942764222622,
+ "routers_loss": 0.012200530618429184,
"skip_count": 1.0,
"step": 1870,
"text_loss": 0.6293368339538574
@@ -17782,13 +17782,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.11962890625,
"learning_rate": 0.0009556538621055739,
- "loss": 0.0265,
+ "loss": 0.0268,
"macro_f1": 0.3272727429866791,
"num_tokens": 3019067.0,
"repeat_count": 0.0,
- "routers_loss": 0.06582094728946686,
+ "routers_loss": 0.06365182995796204,
"skip_count": 1.0,
"step": 1872,
"text_loss": 0.39046618342399597
@@ -17796,18 +17796,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 8.798356325212797,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.12353515625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009555263388375699,
- "loss": 0.0143,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.014,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3022166.0,
"repeat_count": 0.0,
- "routers_loss": 0.008920271880924702,
+ "routers_loss": 0.0041703456081449986,
"skip_count": 1.0,
"step": 1874,
"text_loss": 0.42232340574264526
@@ -17820,13 +17820,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1220703125,
+ "grad_norm": 0.11572265625,
"learning_rate": 0.0009553986410110134,
"loss": 0.016,
"macro_f1": 0.3333333432674408,
"num_tokens": 3025865.0,
"repeat_count": 0.0,
- "routers_loss": 0.006444344762712717,
+ "routers_loss": 0.005841755773872137,
"skip_count": 0.0,
"step": 1876,
"text_loss": 0.37600573897361755
@@ -17839,13 +17839,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009552707686748388,
- "loss": 0.022,
+ "loss": 0.0219,
"macro_f1": 0.3272727429866791,
"num_tokens": 3029950.0,
"repeat_count": 0.0,
- "routers_loss": 0.05197767913341522,
+ "routers_loss": 0.05165952071547508,
"skip_count": 1.0,
"step": 1878,
"text_loss": 0.33717799186706543
@@ -17858,13 +17858,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009551427218780467,
- "loss": 0.0224,
+ "loss": 0.0219,
"macro_f1": 0.6666666865348816,
"num_tokens": 3033649.0,
"repeat_count": 0.0,
- "routers_loss": 0.017570581287145615,
+ "routers_loss": 0.020680008456110954,
"skip_count": 2.0,
"step": 1880,
"text_loss": 0.5011783838272095
@@ -17877,13 +17877,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.173828125,
+ "grad_norm": 0.15625,
"learning_rate": 0.0009550145006697048,
- "loss": 0.0225,
+ "loss": 0.0217,
"macro_f1": 0.32098764181137085,
"num_tokens": 3036847.0,
"repeat_count": 0.0,
- "routers_loss": 0.07106777280569077,
+ "routers_loss": 0.07626450061798096,
"skip_count": 2.0,
"step": 1882,
"text_loss": 0.3066408336162567
@@ -17896,13 +17896,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0009548861050989482,
- "loss": 0.0139,
+ "loss": 0.0136,
"macro_f1": 1.0,
"num_tokens": 3040353.0,
"repeat_count": 1.0,
- "routers_loss": 0.009862381964921951,
+ "routers_loss": 0.010884666815400124,
"skip_count": 1.0,
"step": 1884,
"text_loss": 0.49779415130615234
@@ -17915,13 +17915,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009547575352149778,
- "loss": 0.0209,
+ "loss": 0.0213,
"macro_f1": 0.6666666865348816,
"num_tokens": 3043504.0,
"repeat_count": 0.0,
- "routers_loss": 0.006928981747478247,
+ "routers_loss": 0.006704333238303661,
"skip_count": 2.0,
"step": 1886,
"text_loss": 0.12284614145755768
@@ -17934,13 +17934,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09423828125,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0009546287910670621,
"loss": 0.0211,
"macro_f1": 0.5427350401878357,
"num_tokens": 3046422.0,
"repeat_count": 1.0,
- "routers_loss": 0.04788029566407204,
+ "routers_loss": 0.04799000173807144,
"skip_count": 2.0,
"step": 1888,
"text_loss": 0.1824081838130951
@@ -17953,13 +17953,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009544998727045361,
- "loss": 0.0299,
+ "loss": 0.0306,
"macro_f1": 0.3333333432674408,
"num_tokens": 3049819.0,
"repeat_count": 0.0,
- "routers_loss": 0.008282946422696114,
+ "routers_loss": 0.008139612153172493,
"skip_count": 0.0,
"step": 1890,
"text_loss": 0.18929053843021393
@@ -17972,32 +17972,32 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.09375,
"learning_rate": 0.0009543707801768015,
- "loss": 0.0181,
+ "loss": 0.0175,
"macro_f1": 0.5934640765190125,
"num_tokens": 3052766.0,
"repeat_count": 0.0,
- "routers_loss": 0.03251546248793602,
+ "routers_loss": 0.02966771461069584,
"skip_count": 3.0,
"step": 1892,
"text_loss": 0.247748002409935
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 24.0,
+ "acc_skip": 0.5,
+ "avg_layers": 25.0,
"epoch": 8.892280598767243,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.06640625,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009542415135333267,
- "loss": 0.0195,
- "macro_f1": 0.542222261428833,
+ "loss": 0.0193,
+ "macro_f1": 0.44705885648727417,
"num_tokens": 3056427.0,
"repeat_count": 0.0,
- "routers_loss": 0.03368280455470085,
+ "routers_loss": 0.03637036308646202,
"skip_count": 2.0,
"step": 1894,
"text_loss": 0.2583999037742615
@@ -18010,13 +18010,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.0009541120728236472,
- "loss": 0.0133,
+ "loss": 0.0136,
"macro_f1": 0.3333333432674408,
"num_tokens": 3059497.0,
"repeat_count": 0.0,
- "routers_loss": 0.0069940583780407906,
+ "routers_loss": 0.007026574574410915,
"skip_count": 0.0,
"step": 1896,
"text_loss": 0.5222375988960266
@@ -18029,13 +18029,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0810546875,
+ "grad_norm": 0.076171875,
"learning_rate": 0.0009539824580973646,
- "loss": 0.0221,
+ "loss": 0.0219,
"macro_f1": 0.3333333432674408,
"num_tokens": 3062187.0,
"repeat_count": 0.0,
- "routers_loss": 0.004268508404493332,
+ "routers_loss": 0.003449335927143693,
"skip_count": 0.0,
"step": 1898,
"text_loss": 0.5736427307128906
@@ -18048,13 +18048,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0009538526694041477,
- "loss": 0.0159,
+ "loss": 0.0163,
"macro_f1": 0.3333333432674408,
"num_tokens": 3066100.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032616283278912306,
+ "routers_loss": 0.0035463871899992228,
"skip_count": 0.0,
"step": 1900,
"text_loss": 0.5471583604812622
@@ -18067,13 +18067,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.08056640625,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009537227067937318,
- "loss": 0.023,
+ "loss": 0.0233,
"macro_f1": 1.0,
"num_tokens": 3068737.0,
"repeat_count": 3.0,
- "routers_loss": 0.005389219615608454,
+ "routers_loss": 0.00597514258697629,
"skip_count": 3.0,
"step": 1902,
"text_loss": 0.36644190549850464
@@ -18086,13 +18086,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.166015625,
"learning_rate": 0.0009535925703159186,
- "loss": 0.0311,
+ "loss": 0.0301,
"macro_f1": 0.32098764181137085,
"num_tokens": 3071686.0,
"repeat_count": 0.0,
- "routers_loss": 0.024814991280436516,
+ "routers_loss": 0.025420479476451874,
"skip_count": 2.0,
"step": 1904,
"text_loss": 0.535789966583252
@@ -18107,11 +18107,11 @@
"f1_skip": 0.0,
"grad_norm": 0.07568359375,
"learning_rate": 0.0009534622600205769,
- "loss": 0.0151,
+ "loss": 0.0145,
"macro_f1": 0.3333333432674408,
"num_tokens": 3074954.0,
"repeat_count": 0.0,
- "routers_loss": 0.013415839523077011,
+ "routers_loss": 0.014377486892044544,
"skip_count": 0.0,
"step": 1906,
"text_loss": 0.19009549915790558
@@ -18124,13 +18124,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009533317759576416,
- "loss": 0.019,
+ "loss": 0.0197,
"macro_f1": 0.3333333432674408,
"num_tokens": 3077540.0,
"repeat_count": 0.0,
- "routers_loss": 0.005814475007355213,
+ "routers_loss": 0.004848944488912821,
"skip_count": 0.0,
"step": 1908,
"text_loss": 0.5022001266479492
@@ -18143,13 +18143,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009532011181771148,
- "loss": 0.0218,
+ "loss": 0.0217,
"macro_f1": 0.6666666865348816,
"num_tokens": 3080445.0,
"repeat_count": 0.0,
- "routers_loss": 0.007621586322784424,
+ "routers_loss": 0.009480170905590057,
"skip_count": 2.0,
"step": 1910,
"text_loss": 0.35135936737060547
@@ -18162,13 +18162,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.10400390625,
"learning_rate": 0.0009530702867290644,
- "loss": 0.0178,
+ "loss": 0.0185,
"macro_f1": 0.3333333432674408,
"num_tokens": 3083657.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020917020738124847,
+ "routers_loss": 0.0019353039097040892,
"skip_count": 0.0,
"step": 1912,
"text_loss": 0.5123994946479797
@@ -18181,13 +18181,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009529392816636256,
- "loss": 0.025,
+ "loss": 0.0249,
"macro_f1": 0.3333333432674408,
"num_tokens": 3086837.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010824954370036721,
+ "routers_loss": 0.0010921972570940852,
"skip_count": 0.0,
"step": 1914,
"text_loss": 0.44477662444114685
@@ -18200,13 +18200,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.19140625,
"learning_rate": 0.0009528081030309995,
- "loss": 0.0353,
+ "loss": 0.0351,
"macro_f1": 0.3333333432674408,
"num_tokens": 3089892.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018075350672006607,
+ "routers_loss": 0.0018027103506028652,
"skip_count": 0.0,
"step": 1916,
"text_loss": 0.7356183528900146
@@ -18219,13 +18219,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07958984375,
+ "grad_norm": 0.07568359375,
"learning_rate": 0.0009526767508814542,
- "loss": 0.0235,
+ "loss": 0.0236,
"macro_f1": 0.3333333432674408,
"num_tokens": 3093058.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032930250745266676,
+ "routers_loss": 0.003243023296818137,
"skip_count": 0.0,
"step": 1918,
"text_loss": 0.48823556303977966
@@ -18238,13 +18238,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009525452252653239,
- "loss": 0.0184,
+ "loss": 0.0175,
"macro_f1": 0.3333333432674408,
"num_tokens": 3096404.0,
"repeat_count": 0.0,
- "routers_loss": 0.009042349644005299,
+ "routers_loss": 0.009360014460980892,
"skip_count": 0.0,
"step": 1920,
"text_loss": 0.21498437225818634
@@ -18257,13 +18257,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009524135262330098,
- "loss": 0.022,
+ "loss": 0.0224,
"macro_f1": 0.9265305995941162,
"num_tokens": 3099520.0,
"repeat_count": 1.0,
- "routers_loss": 0.016776500269770622,
+ "routers_loss": 0.017444295808672905,
"skip_count": 3.0,
"step": 1922,
"text_loss": 0.27608850598335266
@@ -18276,13 +18276,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.0009522816538349789,
- "loss": 0.016,
+ "loss": 0.0162,
"macro_f1": 0.5492662787437439,
"num_tokens": 3102956.0,
"repeat_count": 0.0,
- "routers_loss": 0.06579705327749252,
+ "routers_loss": 0.06424452364444733,
"skip_count": 2.0,
"step": 1924,
"text_loss": 0.21558666229248047
@@ -18295,13 +18295,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.058349609375,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0009521496081217651,
- "loss": 0.0113,
+ "loss": 0.0112,
"macro_f1": 0.6666666865348816,
"num_tokens": 3106565.0,
"repeat_count": 1.0,
- "routers_loss": 0.0022786022163927555,
+ "routers_loss": 0.002270506462082267,
"skip_count": 0.0,
"step": 1926,
"text_loss": 0.5641813278198242
@@ -18314,13 +18314,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.09033203125,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009520173891439684,
"loss": 0.0216,
"macro_f1": 0.6666666865348816,
"num_tokens": 3109314.0,
"repeat_count": 0.0,
- "routers_loss": 0.01074281521141529,
+ "routers_loss": 0.011512448079884052,
"skip_count": 1.0,
"step": 1928,
"text_loss": 0.6351624727249146
@@ -18333,13 +18333,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009518849969522556,
- "loss": 0.0201,
+ "loss": 0.0198,
"macro_f1": 0.3333333432674408,
"num_tokens": 3112956.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032052614260464907,
+ "routers_loss": 0.003883908037096262,
"skip_count": 0.0,
"step": 1930,
"text_loss": 0.35160085558891296
@@ -18352,32 +18352,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009517524315973595,
- "loss": 0.0186,
+ "loss": 0.019,
"macro_f1": 1.0,
"num_tokens": 3115593.0,
"repeat_count": 1.0,
- "routers_loss": 0.008593574166297913,
+ "routers_loss": 0.009479222819209099,
"skip_count": 3.0,
"step": 1932,
"text_loss": 0.2900560200214386
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 9.079835632521279,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.07373046875,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0771484375,
"learning_rate": 0.0009516196931300794,
- "loss": 0.0152,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3118516.0,
"repeat_count": 0.0,
- "routers_loss": 0.0201246440410614,
+ "routers_loss": 0.017834696918725967,
"skip_count": 2.0,
"step": 1934,
"text_loss": 0.20094378292560577
@@ -18390,13 +18390,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009514867816012809,
- "loss": 0.0199,
+ "loss": 0.02,
"macro_f1": 0.3333333432674408,
"num_tokens": 3122242.0,
"repeat_count": 0.0,
- "routers_loss": 0.001721356064081192,
+ "routers_loss": 0.0017964740982279181,
"skip_count": 0.0,
"step": 1936,
"text_loss": 0.6498590707778931
@@ -18409,13 +18409,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.049072265625,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0009513536970618961,
- "loss": 0.0135,
+ "loss": 0.013,
"macro_f1": 0.6666666865348816,
"num_tokens": 3125645.0,
"repeat_count": 0.0,
- "routers_loss": 0.010442634113132954,
+ "routers_loss": 0.007437168620526791,
"skip_count": 2.0,
"step": 1938,
"text_loss": 0.25863033533096313
@@ -18428,13 +18428,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.058349609375,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009512204395629232,
- "loss": 0.019,
+ "loss": 0.0184,
"macro_f1": 0.6666666865348816,
"num_tokens": 3128740.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009493798715993762,
+ "routers_loss": 0.0008759932243265212,
"skip_count": 1.0,
"step": 1940,
"text_loss": 0.5638351440429688
@@ -18447,13 +18447,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05517578125,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009510870091554264,
- "loss": 0.0149,
+ "loss": 0.0153,
"macro_f1": 0.3272727429866791,
"num_tokens": 3131742.0,
"repeat_count": 1.0,
- "routers_loss": 0.022104881703853607,
+ "routers_loss": 0.019906625151634216,
"skip_count": 0.0,
"step": 1942,
"text_loss": 0.8410717844963074
@@ -18466,13 +18466,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009509534058905369,
- "loss": 0.0164,
+ "loss": 0.016,
"macro_f1": 0.3333333432674408,
"num_tokens": 3134407.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009013625676743686,
+ "routers_loss": 0.0009229081333614886,
"skip_count": 0.0,
"step": 1944,
"text_loss": 0.47506049275398254
@@ -18485,13 +18485,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06103515625,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0009508196298194517,
- "loss": 0.0121,
+ "loss": 0.0123,
"macro_f1": 0.3333333432674408,
"num_tokens": 3137053.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028069843538105488,
+ "routers_loss": 0.003630586201325059,
"skip_count": 0.0,
"step": 1946,
"text_loss": 0.32225799560546875
@@ -18504,13 +18504,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009506856809934338,
- "loss": 0.0116,
+ "loss": 0.0119,
"macro_f1": 0.3333333432674408,
"num_tokens": 3140943.0,
"repeat_count": 0.0,
- "routers_loss": 0.006877045147120953,
+ "routers_loss": 0.007580445148050785,
"skip_count": 0.0,
"step": 1948,
"text_loss": 0.3120577931404114
@@ -18523,13 +18523,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0009505515594638127,
- "loss": 0.0127,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 3144298.0,
"repeat_count": 0.0,
- "routers_loss": 0.004543667659163475,
+ "routers_loss": 0.004471861757338047,
"skip_count": 0.0,
"step": 1950,
"text_loss": 0.22052447497844696
@@ -18542,13 +18542,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.09130859375,
"learning_rate": 0.0009504172652819843,
- "loss": 0.0232,
+ "loss": 0.023,
"macro_f1": 1.0,
"num_tokens": 3147069.0,
"repeat_count": 1.0,
- "routers_loss": 0.007053609937429428,
+ "routers_loss": 0.009606664068996906,
"skip_count": 1.0,
"step": 1952,
"text_loss": 0.34773921966552734
@@ -18561,13 +18561,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0537109375,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009502827984994099,
- "loss": 0.0146,
+ "loss": 0.0148,
"macro_f1": 0.6666666865348816,
"num_tokens": 3149992.0,
"repeat_count": 0.0,
- "routers_loss": 0.006783280987292528,
+ "routers_loss": 0.006443799939006567,
"skip_count": 1.0,
"step": 1954,
"text_loss": 0.6442171335220337
@@ -18580,13 +18580,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.0009501481591676177,
- "loss": 0.0181,
+ "loss": 0.0188,
"macro_f1": 0.3333333432674408,
"num_tokens": 3153167.0,
"repeat_count": 0.0,
- "routers_loss": 0.002531677018851042,
+ "routers_loss": 0.003219039412215352,
"skip_count": 0.0,
"step": 1956,
"text_loss": 0.43369221687316895
@@ -18599,32 +18599,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.000950013347338202,
- "loss": 0.0154,
+ "loss": 0.0152,
"macro_f1": 0.3272727429866791,
"num_tokens": 3156590.0,
"repeat_count": 0.0,
- "routers_loss": 0.027040868997573853,
+ "routers_loss": 0.025551019236445427,
"skip_count": 1.0,
"step": 1958,
"text_loss": 0.294479101896286
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 1.0,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 9.201937188142061,
- "f1_execute": 0.9803921580314636,
- "f1_repeat": 0.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009498783630628225,
- "loss": 0.0154,
- "macro_f1": 0.6601307392120361,
+ "loss": 0.0158,
+ "macro_f1": 1.0,
"num_tokens": 3159451.0,
"repeat_count": 1.0,
- "routers_loss": 0.01573321223258972,
+ "routers_loss": 0.013802438974380493,
"skip_count": 2.0,
"step": 1960,
"text_loss": 0.20888492465019226
@@ -18637,13 +18637,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06689453125,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009497432063932057,
- "loss": 0.0135,
+ "loss": 0.0137,
"macro_f1": 0.6601307392120361,
"num_tokens": 3162889.0,
"repeat_count": 1.0,
- "routers_loss": 0.02442278526723385,
+ "routers_loss": 0.02852988988161087,
"skip_count": 2.0,
"step": 1962,
"text_loss": 0.5027125477790833
@@ -18656,13 +18656,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.046630859375,
+ "grad_norm": 0.045166015625,
"learning_rate": 0.0009496078773811437,
- "loss": 0.0142,
+ "loss": 0.0136,
"macro_f1": 0.6666666865348816,
"num_tokens": 3165979.0,
"repeat_count": 0.0,
- "routers_loss": 0.018267054110765457,
+ "routers_loss": 0.01784522272646427,
"skip_count": 2.0,
"step": 1964,
"text_loss": 0.1696339100599289
@@ -18675,13 +18675,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.060302734375,
"learning_rate": 0.000949472376078495,
- "loss": 0.0162,
+ "loss": 0.016,
"macro_f1": 0.3333333432674408,
"num_tokens": 3168683.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016024474753066897,
+ "routers_loss": 0.0017019887454807758,
"skip_count": 0.0,
"step": 1966,
"text_loss": 0.48905447125434875
@@ -18694,13 +18694,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.052978515625,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.000949336702537184,
- "loss": 0.011,
+ "loss": 0.0108,
"macro_f1": 0.6666666865348816,
"num_tokens": 3171968.0,
"repeat_count": 0.0,
- "routers_loss": 0.004668849054723978,
+ "routers_loss": 0.004817947279661894,
"skip_count": 2.0,
"step": 1968,
"text_loss": 0.20984773337841034
@@ -18713,13 +18713,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0009492008568092007,
- "loss": 0.0098,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 3175947.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011657609138637781,
+ "routers_loss": 0.0012963006738573313,
"skip_count": 0.0,
"step": 1970,
"text_loss": 0.5215106010437012
@@ -18732,13 +18732,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.04248046875,
+ "grad_norm": 0.044921875,
"learning_rate": 0.0009490648389466019,
- "loss": 0.0133,
+ "loss": 0.0135,
"macro_f1": 0.4871794879436493,
"num_tokens": 3179348.0,
"repeat_count": 0.0,
- "routers_loss": 0.03806794434785843,
+ "routers_loss": 0.03950481489300728,
"skip_count": 2.0,
"step": 1972,
"text_loss": 0.24640929698944092
@@ -18751,13 +18751,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.09326171875,
"learning_rate": 0.0009489286490015097,
- "loss": 0.0189,
+ "loss": 0.0183,
"macro_f1": 0.6666666865348816,
"num_tokens": 3182640.0,
"repeat_count": 0.0,
- "routers_loss": 0.005107097327709198,
+ "routers_loss": 0.0043345349840819836,
"skip_count": 2.0,
"step": 1974,
"text_loss": 0.6362852454185486
@@ -18770,13 +18770,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0009487922870261122,
- "loss": 0.0156,
+ "loss": 0.0155,
"macro_f1": 0.3333333432674408,
"num_tokens": 3185657.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013696947135031223,
+ "routers_loss": 0.0015687479171901941,
"skip_count": 0.0,
"step": 1976,
"text_loss": 0.8977144360542297
@@ -18789,13 +18789,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0009486557530726638,
- "loss": 0.0136,
+ "loss": 0.0139,
"macro_f1": 0.3333333432674408,
"num_tokens": 3188772.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012224154779687524,
+ "routers_loss": 0.0010977238416671753,
"skip_count": 0.0,
"step": 1978,
"text_loss": 0.38512736558914185
@@ -18808,13 +18808,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09423828125,
+ "grad_norm": 0.11279296875,
"learning_rate": 0.0009485190471934844,
"loss": 0.0196,
"macro_f1": 0.6666666865348816,
"num_tokens": 3193131.0,
"repeat_count": 2.0,
- "routers_loss": 0.0030119111761450768,
+ "routers_loss": 0.002264744369313121,
"skip_count": 0.0,
"step": 1980,
"text_loss": 0.4171289801597595
@@ -18827,13 +18827,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.00094838216944096,
- "loss": 0.0222,
+ "loss": 0.0219,
"macro_f1": 0.3272727429866791,
"num_tokens": 3196668.0,
"repeat_count": 0.0,
- "routers_loss": 0.04286033287644386,
+ "routers_loss": 0.042320676147937775,
"skip_count": 1.0,
"step": 1982,
"text_loss": 0.19008000195026398
@@ -18846,32 +18846,32 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.053466796875,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0009482451198675424,
- "loss": 0.0158,
+ "loss": 0.0151,
"macro_f1": 0.32098767161369324,
"num_tokens": 3200282.0,
"repeat_count": 0.0,
- "routers_loss": 0.019988590851426125,
+ "routers_loss": 0.01796630397439003,
"skip_count": 1.0,
"step": 1984,
"text_loss": 0.5009249448776245
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 9.324038743762841,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.0634765625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061767578125,
"learning_rate": 0.0009481078985257494,
- "loss": 0.0154,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0147,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3204439.0,
"repeat_count": 0.0,
- "routers_loss": 0.012215938419103622,
+ "routers_loss": 0.01052347756922245,
"skip_count": 1.0,
"step": 1986,
"text_loss": 0.15319275856018066
@@ -18884,13 +18884,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009479705054681644,
- "loss": 0.0149,
+ "loss": 0.015,
"macro_f1": 0.3076923191547394,
"num_tokens": 3207590.0,
"repeat_count": 1.0,
- "routers_loss": 0.10747655481100082,
+ "routers_loss": 0.09640293568372726,
"skip_count": 3.0,
"step": 1988,
"text_loss": 0.3654652535915375
@@ -18903,13 +18903,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009478329407474366,
- "loss": 0.0186,
+ "loss": 0.0183,
"macro_f1": 0.5492662787437439,
"num_tokens": 3211172.0,
"repeat_count": 0.0,
- "routers_loss": 0.016109853982925415,
+ "routers_loss": 0.012670112773776054,
"skip_count": 1.0,
"step": 1990,
"text_loss": 0.5817596316337585
@@ -18922,13 +18922,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.05859375,
"learning_rate": 0.000947695204416281,
- "loss": 0.0116,
+ "loss": 0.0121,
"macro_f1": 0.6666666865348816,
"num_tokens": 3214050.0,
"repeat_count": 1.0,
- "routers_loss": 0.006929324474185705,
+ "routers_loss": 0.005263707600533962,
"skip_count": 0.0,
"step": 1992,
"text_loss": 0.5985888242721558
@@ -18941,13 +18941,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009475572965274787,
- "loss": 0.0147,
+ "loss": 0.0144,
"macro_f1": 0.3272727429866791,
"num_tokens": 3217318.0,
"repeat_count": 1.0,
- "routers_loss": 0.0715102106332779,
+ "routers_loss": 0.0682850033044815,
"skip_count": 0.0,
"step": 1994,
"text_loss": 0.316506564617157
@@ -18960,13 +18960,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.052490234375,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.000947419217133876,
- "loss": 0.0187,
+ "loss": 0.019,
"macro_f1": 0.6666666865348816,
"num_tokens": 3220012.0,
"repeat_count": 0.0,
- "routers_loss": 0.008499355986714363,
+ "routers_loss": 0.008508823812007904,
"skip_count": 2.0,
"step": 1996,
"text_loss": 0.09665893763303757
@@ -18979,13 +18979,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.048583984375,
+ "grad_norm": 0.053466796875,
"learning_rate": 0.0009472809662883852,
- "loss": 0.0162,
+ "loss": 0.0155,
"macro_f1": 1.0,
"num_tokens": 3223019.0,
"repeat_count": 1.0,
- "routers_loss": 0.012003371492028236,
+ "routers_loss": 0.01100847590714693,
"skip_count": 2.0,
"step": 1998,
"text_loss": 0.4938808083534241
@@ -18998,13 +18998,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0009471425440439844,
- "loss": 0.0137,
+ "loss": 0.0135,
"macro_f1": 0.8817967176437378,
"num_tokens": 3226013.0,
"repeat_count": 2.0,
- "routers_loss": 0.0529167577624321,
+ "routers_loss": 0.04953207075595856,
"skip_count": 3.0,
"step": 2000,
"text_loss": 0.22258254885673523
diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin
index deeea733277b4031781a5b299881dd8e675e7606..a3d3ae372faf14539639f54454aa52b6ee730c4a 100644
--- a/checkpoint-2000/training_args.bin
+++ b/checkpoint-2000/training_args.bin
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:0b3f5975f57762b552c7ee29776bf32a4dbb125781a0658488d3884fb25c5296
+oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8
size 5880
diff --git a/checkpoint-3000/model-00002-of-00002.safetensors b/checkpoint-3000/model-00002-of-00002.safetensors
index 90e60903b10ee645ae44e95a07ca692b662c0b11..f35eb1877a2531abd7604388b55f0e2f227e0139 100644
--- a/checkpoint-3000/model-00002-of-00002.safetensors
+++ b/checkpoint-3000/model-00002-of-00002.safetensors
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:556a7e0a1afc9189ba05912546bcbd5642f962428969c3a6175460e4f7ed088d
+oid sha256:74237309fd851d3e6a87c2ecae9fdf046cda24a2b071142d227d3596658c57de
size 1481790520
diff --git a/checkpoint-3000/optimizer.pt b/checkpoint-3000/optimizer.pt
index 7cdbe7097aa1559dbc1d224433bc639415e56007..de25de043e1925d01a3a27e8c32e731639eb50cf 100644
--- a/checkpoint-3000/optimizer.pt
+++ b/checkpoint-3000/optimizer.pt
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:b5fc4ae4a4dcddd8241f1b24d63a0e756f40bb65d4eea6c288b5406b68fe3ad1
+oid sha256:0c95beb972e19eb9beaf599780a940fbae8dc2eb2b781515cf6fba5f661673d4
size 44191162
diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json
index 71e24cdcfd8eb68dc8d169c82346790853fec879..5b23440931215970ba54e98fb0e391e46eef8b91 100644
--- a/checkpoint-3000/trainer_state.json
+++ b/checkpoint-3000/trainer_state.json
@@ -12,18 +12,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 31.0,
+ "avg_layers": 25.0,
"epoch": 0.009392427355444672,
- "f1_execute": 0.4864864945411682,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.40625,
+ "grad_norm": 2.25,
"learning_rate": 2e-06,
- "loss": 0.5484,
- "macro_f1": 0.1621621698141098,
+ "loss": 0.4974,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 3175.0,
"repeat_count": 0.0,
- "routers_loss": 0.503563642501831,
+ "routers_loss": 0.4339469373226166,
"skip_count": 0.0,
"step": 2,
"text_loss": 0.3330848515033722
@@ -31,18 +31,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 23.0,
"epoch": 0.018784854710889344,
- "f1_execute": 0.4864864945411682,
+ "f1_execute": 0.7272726893424988,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.9140625,
+ "grad_norm": 1.8359375,
"learning_rate": 6e-06,
- "loss": 0.536,
- "macro_f1": 0.1621621698141098,
+ "loss": 0.4988,
+ "macro_f1": 0.24242423474788666,
"num_tokens": 5816.0,
"repeat_count": 0.0,
- "routers_loss": 0.4589468538761139,
+ "routers_loss": 0.4511934816837311,
"skip_count": 1.0,
"step": 4,
"text_loss": 0.4571273922920227
@@ -50,37 +50,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 32.0,
+ "avg_layers": 28.0,
"epoch": 0.02817728206633402,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.6666666865348816,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.375,
+ "grad_norm": 2.234375,
"learning_rate": 1e-05,
- "loss": 0.5469,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.5113,
+ "macro_f1": 0.222222238779068,
"num_tokens": 9739.0,
"repeat_count": 0.0,
- "routers_loss": 0.5736724138259888,
+ "routers_loss": 0.49306994676589966,
"skip_count": 0.0,
"step": 6,
"text_loss": 0.41060560941696167
},
{
- "acc_repeat": 1.0,
- "acc_skip": 0.5,
- "avg_layers": 33.0,
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 0.03756970942177869,
- "f1_execute": 0.47058823704719543,
- "f1_repeat": 0.1538461595773697,
- "f1_skip": 0.222222238779068,
- "grad_norm": 1.8515625,
+ "f1_execute": 0.5641025900840759,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.7265625,
"learning_rate": 1.4e-05,
- "loss": 0.5291,
- "macro_f1": 0.28221890330314636,
+ "loss": 0.4766,
+ "macro_f1": 0.18803420662879944,
"num_tokens": 12869.0,
"repeat_count": 1.0,
- "routers_loss": 0.49970296025276184,
+ "routers_loss": 0.48872503638267517,
"skip_count": 2.0,
"step": 8,
"text_loss": 0.36678561568260193
@@ -88,37 +88,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 27.0,
"epoch": 0.046962136777223364,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.953125,
+ "grad_norm": 1.78125,
"learning_rate": 1.8e-05,
- "loss": 0.5316,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4806,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 15845.0,
"repeat_count": 0.0,
- "routers_loss": 0.5153562426567078,
+ "routers_loss": 0.45077216625213623,
"skip_count": 0.0,
"step": 10,
"text_loss": 0.5597779154777527
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 0.5,
"acc_skip": 0.3333333432674408,
- "avg_layers": 34.0,
+ "avg_layers": 26.0,
"epoch": 0.05635456413266804,
- "f1_execute": 0.5714285373687744,
- "f1_repeat": 0.0,
- "f1_skip": 0.25,
- "grad_norm": 1.6328125,
+ "f1_execute": 0.7179487347602844,
+ "f1_repeat": 0.2857142984867096,
+ "f1_skip": 0.20000000298023224,
+ "grad_norm": 1.5390625,
"learning_rate": 2.2e-05,
- "loss": 0.5051,
- "macro_f1": 0.2738095223903656,
+ "loss": 0.4557,
+ "macro_f1": 0.40122103691101074,
"num_tokens": 19353.0,
"repeat_count": 2.0,
- "routers_loss": 0.46214747428894043,
+ "routers_loss": 0.4130440056324005,
"skip_count": 3.0,
"step": 12,
"text_loss": 0.2056603729724884
@@ -126,37 +126,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 27.0,
"epoch": 0.06574699148811271,
- "f1_execute": 0.5263157486915588,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.671875,
+ "grad_norm": 2.4375,
"learning_rate": 2.6e-05,
- "loss": 0.5653,
- "macro_f1": 0.17543858289718628,
+ "loss": 0.5129,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 22675.0,
"repeat_count": 0.0,
- "routers_loss": 0.5300976634025574,
+ "routers_loss": 0.4582902193069458,
"skip_count": 0.0,
"step": 14,
"text_loss": 0.32989829778671265
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 34.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 0.07513941884355738,
- "f1_execute": 0.6153846383094788,
+ "f1_execute": 0.6829268336296082,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 1.8828125,
+ "f1_skip": 0.2222222238779068,
+ "grad_norm": 1.7421875,
"learning_rate": 3e-05,
- "loss": 0.5225,
- "macro_f1": 0.20512822270393372,
+ "loss": 0.4729,
+ "macro_f1": 0.3017163574695587,
"num_tokens": 26022.0,
"repeat_count": 0.0,
- "routers_loss": 0.473240464925766,
+ "routers_loss": 0.42910993099212646,
"skip_count": 1.0,
"step": 16,
"text_loss": 0.1353905349969864
@@ -164,18 +164,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 38.0,
+ "avg_layers": 27.0,
"epoch": 0.08453184619900206,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.7555555105209351,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.6015625,
+ "grad_norm": 1.4765625,
"learning_rate": 3.4000000000000007e-05,
- "loss": 0.4867,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4274,
+ "macro_f1": 0.2518518567085266,
"num_tokens": 29251.0,
"repeat_count": 0.0,
- "routers_loss": 0.4795944094657898,
+ "routers_loss": 0.3990713059902191,
"skip_count": 0.0,
"step": 18,
"text_loss": 0.3806765377521515
@@ -183,18 +183,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 36.0,
+ "avg_layers": 26.0,
"epoch": 0.09392427355444673,
- "f1_execute": 0.6153846383094788,
- "f1_repeat": 0.1538461595773697,
+ "f1_execute": 0.6829268336296082,
+ "f1_repeat": 0.2857142984867096,
"f1_skip": 0.0,
- "grad_norm": 1.3984375,
+ "grad_norm": 1.3125,
"learning_rate": 3.8e-05,
- "loss": 0.4718,
- "macro_f1": 0.25641027092933655,
+ "loss": 0.4261,
+ "macro_f1": 0.3228803873062134,
"num_tokens": 32545.0,
"repeat_count": 1.0,
- "routers_loss": 0.41872408986091614,
+ "routers_loss": 0.40146592259407043,
"skip_count": 0.0,
"step": 20,
"text_loss": 0.25648367404937744
@@ -202,18 +202,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 26.0,
"epoch": 0.1033167009098914,
- "f1_execute": 0.6341463327407837,
+ "f1_execute": 0.7272727489471436,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.7734375,
+ "grad_norm": 1.625,
"learning_rate": 4.2000000000000004e-05,
- "loss": 0.4472,
- "macro_f1": 0.21138212084770203,
+ "loss": 0.404,
+ "macro_f1": 0.24242424964904785,
"num_tokens": 36560.0,
"repeat_count": 0.0,
- "routers_loss": 0.4152105450630188,
+ "routers_loss": 0.372715026140213,
"skip_count": 0.0,
"step": 22,
"text_loss": 0.2799522578716278
@@ -221,18 +221,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 32.0,
+ "avg_layers": 27.0,
"epoch": 0.11270912826533608,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.7555555105209351,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.8046875,
+ "grad_norm": 1.6328125,
"learning_rate": 4.6e-05,
- "loss": 0.4554,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4218,
+ "macro_f1": 0.2518518567085266,
"num_tokens": 39597.0,
"repeat_count": 0.0,
- "routers_loss": 0.47541096806526184,
+ "routers_loss": 0.4504941403865814,
"skip_count": 0.0,
"step": 24,
"text_loss": 0.6635695695877075
@@ -240,18 +240,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 34.0,
+ "avg_layers": 27.0,
"epoch": 0.12210155562078075,
- "f1_execute": 0.7826087474822998,
+ "f1_execute": 0.8085106015205383,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.875,
+ "grad_norm": 1.7109375,
"learning_rate": 5e-05,
- "loss": 0.4182,
- "macro_f1": 0.2608695924282074,
+ "loss": 0.3886,
+ "macro_f1": 0.26950353384017944,
"num_tokens": 43080.0,
"repeat_count": 0.0,
- "routers_loss": 0.37319275736808777,
+ "routers_loss": 0.3498791456222534,
"skip_count": 0.0,
"step": 26,
"text_loss": 0.7035041451454163
@@ -259,18 +259,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 0.13149398297622542,
- "f1_execute": 0.7826087474822998,
+ "f1_execute": 0.8085106015205383,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.4375,
+ "grad_norm": 1.34375,
"learning_rate": 5.4e-05,
- "loss": 0.3991,
- "macro_f1": 0.2608695924282074,
+ "loss": 0.3724,
+ "macro_f1": 0.26950353384017944,
"num_tokens": 46406.0,
"repeat_count": 0.0,
- "routers_loss": 0.3604123294353485,
+ "routers_loss": 0.31265875697135925,
"skip_count": 0.0,
"step": 28,
"text_loss": 0.6388277411460876
@@ -280,16 +280,16 @@
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.1408864103316701,
- "f1_execute": 0.8979591727256775,
+ "f1_execute": 0.8571428060531616,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.421875,
+ "grad_norm": 1.2578125,
"learning_rate": 5.800000000000001e-05,
- "loss": 0.3827,
- "macro_f1": 0.2993197441101074,
+ "loss": 0.341,
+ "macro_f1": 0.2857142686843872,
"num_tokens": 49966.0,
"repeat_count": 0.0,
- "routers_loss": 0.35880225896835327,
+ "routers_loss": 0.3200918138027191,
"skip_count": 2.0,
"step": 30,
"text_loss": 0.17372547090053558
@@ -297,18 +297,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 24.0,
+ "avg_layers": 25.0,
"epoch": 0.15027883768711475,
- "f1_execute": 0.9200000166893005,
+ "f1_execute": 0.8571428060531616,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.4609375,
+ "grad_norm": 1.4140625,
"learning_rate": 6.2e-05,
- "loss": 0.3452,
- "macro_f1": 0.30666667222976685,
+ "loss": 0.3207,
+ "macro_f1": 0.2857142686843872,
"num_tokens": 53378.0,
"repeat_count": 1.0,
- "routers_loss": 0.31086465716362,
+ "routers_loss": 0.32304447889328003,
"skip_count": 1.0,
"step": 32,
"text_loss": 0.18196581304073334
@@ -316,18 +316,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 25.0,
"epoch": 0.15967126504255943,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.3671875,
+ "grad_norm": 1.46875,
"learning_rate": 6.6e-05,
- "loss": 0.3283,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.3304,
+ "macro_f1": 0.3006536364555359,
"num_tokens": 56933.0,
"repeat_count": 0.0,
- "routers_loss": 0.2674171030521393,
+ "routers_loss": 0.24814388155937195,
"skip_count": 0.0,
"step": 34,
"text_loss": 0.28823015093803406
@@ -335,18 +335,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.16906369239800412,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.1015625,
+ "grad_norm": 1.1171875,
"learning_rate": 7.000000000000001e-05,
- "loss": 0.2849,
- "macro_f1": 0.3205128312110901,
+ "loss": 0.2778,
+ "macro_f1": 0.3006536066532135,
"num_tokens": 60744.0,
"repeat_count": 1.0,
- "routers_loss": 0.24587315320968628,
+ "routers_loss": 0.22411039471626282,
"skip_count": 0.0,
"step": 36,
"text_loss": 0.5260357856750488
@@ -354,18 +354,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 31.0,
+ "avg_layers": 27.0,
"epoch": 0.17845611975344877,
- "f1_execute": 0.8085106015205383,
+ "f1_execute": 0.8571428656578064,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.3046875,
+ "grad_norm": 1.484375,
"learning_rate": 7.4e-05,
- "loss": 0.2616,
- "macro_f1": 0.26950353384017944,
+ "loss": 0.2738,
+ "macro_f1": 0.2857142984867096,
"num_tokens": 64900.0,
"repeat_count": 0.0,
- "routers_loss": 0.32050269842147827,
+ "routers_loss": 0.44355395436286926,
"skip_count": 0.0,
"step": 38,
"text_loss": 0.5382097363471985
@@ -373,18 +373,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.18784854710889345,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.1796875,
+ "grad_norm": 1.3828125,
"learning_rate": 7.8e-05,
- "loss": 0.2084,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.2137,
+ "macro_f1": 0.3076923191547394,
"num_tokens": 68000.0,
"repeat_count": 0.0,
- "routers_loss": 0.15196125209331512,
+ "routers_loss": 0.202330082654953,
"skip_count": 0.0,
"step": 40,
"text_loss": 0.5946118831634521
@@ -392,18 +392,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 25.0,
"epoch": 0.19724097446433814,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.61328125,
+ "grad_norm": 0.78125,
"learning_rate": 8.2e-05,
- "loss": 0.1947,
+ "loss": 0.21,
"macro_f1": 0.3144654333591461,
"num_tokens": 70529.0,
"repeat_count": 0.0,
- "routers_loss": 0.14121046662330627,
+ "routers_loss": 0.18023855984210968,
"skip_count": 0.0,
"step": 42,
"text_loss": 0.5550904273986816
@@ -416,13 +416,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.50390625,
+ "grad_norm": 0.609375,
"learning_rate": 8.599999999999999e-05,
- "loss": 0.1884,
+ "loss": 0.1918,
"macro_f1": 0.32098764181137085,
"num_tokens": 73427.0,
"repeat_count": 2.0,
- "routers_loss": 0.21312278509140015,
+ "routers_loss": 0.2101590931415558,
"skip_count": 0.0,
"step": 44,
"text_loss": 0.4636923372745514
@@ -435,13 +435,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.45703125,
+ "grad_norm": 0.53125,
"learning_rate": 8.999999999999999e-05,
- "loss": 0.166,
+ "loss": 0.1881,
"macro_f1": 0.3333333432674408,
"num_tokens": 76472.0,
"repeat_count": 0.0,
- "routers_loss": 0.1184137836098671,
+ "routers_loss": 0.11800424009561539,
"skip_count": 0.0,
"step": 46,
"text_loss": 0.4187001883983612
@@ -454,13 +454,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.62890625,
+ "grad_norm": 0.953125,
"learning_rate": 9.400000000000001e-05,
- "loss": 0.1313,
+ "loss": 0.1446,
"macro_f1": 0.3272727429866791,
"num_tokens": 79124.0,
"repeat_count": 1.0,
- "routers_loss": 0.10897563397884369,
+ "routers_loss": 0.11632519960403442,
"skip_count": 0.0,
"step": 48,
"text_loss": 0.2253919243812561
@@ -468,18 +468,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 0.2348106838861168,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.4375,
+ "grad_norm": 0.58984375,
"learning_rate": 9.800000000000001e-05,
- "loss": 0.1531,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.1543,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 81980.0,
"repeat_count": 1.0,
- "routers_loss": 0.09979952871799469,
+ "routers_loss": 0.09669367223978043,
"skip_count": 0.0,
"step": 50,
"text_loss": 0.6053179502487183
@@ -487,18 +487,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.2442031112415615,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.515625,
+ "grad_norm": 0.8515625,
"learning_rate": 0.000102,
- "loss": 0.1265,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.1393,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 85236.0,
"repeat_count": 0.0,
- "routers_loss": 0.05543195456266403,
+ "routers_loss": 0.12471720576286316,
"skip_count": 0.0,
"step": 52,
"text_loss": 0.6027331948280334
@@ -511,13 +511,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.328125,
+ "grad_norm": 0.421875,
"learning_rate": 0.000106,
- "loss": 0.1436,
+ "loss": 0.1473,
"macro_f1": 0.32098764181137085,
"num_tokens": 88238.0,
"repeat_count": 0.0,
- "routers_loss": 0.15049344301223755,
+ "routers_loss": 0.1376056969165802,
"skip_count": 2.0,
"step": 54,
"text_loss": 0.2861751616001129
@@ -530,13 +530,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.263671875,
+ "grad_norm": 0.35546875,
"learning_rate": 0.00011,
- "loss": 0.1021,
+ "loss": 0.1082,
"macro_f1": 0.3333333432674408,
"num_tokens": 91056.0,
"repeat_count": 0.0,
- "routers_loss": 0.07367338240146637,
+ "routers_loss": 0.07449393719434738,
"skip_count": 0.0,
"step": 56,
"text_loss": 0.48106974363327026
@@ -544,18 +544,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 26.0,
"epoch": 0.2723803933078955,
- "f1_execute": 1.0,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000114,
- "loss": 0.114,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.1123,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 94987.0,
"repeat_count": 0.0,
- "routers_loss": 0.03782692551612854,
+ "routers_loss": 0.07064720243215561,
"skip_count": 0.0,
"step": 58,
"text_loss": 0.3554874658584595
@@ -568,13 +568,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.333984375,
+ "grad_norm": 0.5390625,
"learning_rate": 0.000118,
- "loss": 0.1197,
+ "loss": 0.1234,
"macro_f1": 0.32098764181137085,
"num_tokens": 97909.0,
"repeat_count": 0.0,
- "routers_loss": 0.14074955880641937,
+ "routers_loss": 0.16835889220237732,
"skip_count": 2.0,
"step": 60,
"text_loss": 0.5475804805755615
@@ -587,13 +587,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000122,
- "loss": 0.1174,
+ "loss": 0.1224,
"macro_f1": 0.3333333432674408,
"num_tokens": 101043.0,
"repeat_count": 0.0,
- "routers_loss": 0.058013737201690674,
+ "routers_loss": 0.06127442046999931,
"skip_count": 0.0,
"step": 62,
"text_loss": 0.5966938734054565
@@ -606,13 +606,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000126,
- "loss": 0.0911,
+ "loss": 0.0931,
"macro_f1": 0.3333333432674408,
"num_tokens": 104103.0,
"repeat_count": 0.0,
- "routers_loss": 0.04936821386218071,
+ "routers_loss": 0.047825805842876434,
"skip_count": 0.0,
"step": 64,
"text_loss": 0.5480486750602722
@@ -625,13 +625,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.220703125,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.00013000000000000002,
- "loss": 0.1107,
+ "loss": 0.1088,
"macro_f1": 0.3006536364555359,
"num_tokens": 107009.0,
"repeat_count": 1.0,
- "routers_loss": 0.2628525495529175,
+ "routers_loss": 0.275174081325531,
"skip_count": 4.0,
"step": 66,
"text_loss": 0.41714492440223694
@@ -644,13 +644,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.000134,
- "loss": 0.1109,
+ "loss": 0.1123,
"macro_f1": 0.3333333432674408,
"num_tokens": 110486.0,
"repeat_count": 0.0,
- "routers_loss": 0.02859785594046116,
+ "routers_loss": 0.029025178402662277,
"skip_count": 0.0,
"step": 68,
"text_loss": 0.6775627732276917
@@ -663,13 +663,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.298828125,
+ "grad_norm": 0.314453125,
"learning_rate": 0.00013800000000000002,
- "loss": 0.1067,
+ "loss": 0.1049,
"macro_f1": 0.3272727429866791,
"num_tokens": 113878.0,
"repeat_count": 0.0,
- "routers_loss": 0.10459086298942566,
+ "routers_loss": 0.10141710191965103,
"skip_count": 1.0,
"step": 70,
"text_loss": 0.6678873896598816
@@ -682,13 +682,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2109375,
+ "grad_norm": 0.248046875,
"learning_rate": 0.00014199999999999998,
- "loss": 0.1166,
+ "loss": 0.1119,
"macro_f1": 0.3272727429866791,
"num_tokens": 116989.0,
"repeat_count": 0.0,
- "routers_loss": 0.0718551054596901,
+ "routers_loss": 0.08002066612243652,
"skip_count": 1.0,
"step": 72,
"text_loss": 0.405692994594574
@@ -701,13 +701,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1787109375,
"learning_rate": 0.000146,
- "loss": 0.1007,
+ "loss": 0.0944,
"macro_f1": 0.3144654333591461,
"num_tokens": 119883.0,
"repeat_count": 0.0,
- "routers_loss": 0.1850946843624115,
+ "routers_loss": 0.1867009848356247,
"skip_count": 3.0,
"step": 74,
"text_loss": 0.44616150856018066
@@ -720,13 +720,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.34375,
+ "grad_norm": 0.333984375,
"learning_rate": 0.00015,
- "loss": 0.1019,
+ "loss": 0.1003,
"macro_f1": 0.32098764181137085,
"num_tokens": 123325.0,
"repeat_count": 0.0,
- "routers_loss": 0.09809529036283493,
+ "routers_loss": 0.07042168825864792,
"skip_count": 2.0,
"step": 76,
"text_loss": 0.11340200901031494
@@ -739,13 +739,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.259765625,
+ "grad_norm": 0.26171875,
"learning_rate": 0.000154,
- "loss": 0.1088,
+ "loss": 0.1066,
"macro_f1": 0.32098764181137085,
"num_tokens": 126131.0,
"repeat_count": 0.0,
- "routers_loss": 0.11277207732200623,
+ "routers_loss": 0.11535373330116272,
"skip_count": 2.0,
"step": 78,
"text_loss": 0.3269135355949402
@@ -758,13 +758,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "grad_norm": 0.255859375,
"learning_rate": 0.000158,
- "loss": 0.0866,
+ "loss": 0.0891,
"macro_f1": 0.3272727429866791,
"num_tokens": 130349.0,
"repeat_count": 0.0,
- "routers_loss": 0.09079254418611526,
+ "routers_loss": 0.09497501701116562,
"skip_count": 1.0,
"step": 80,
"text_loss": 0.15273472666740417
@@ -777,13 +777,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000162,
- "loss": 0.0928,
+ "loss": 0.0929,
"macro_f1": 0.3333333432674408,
"num_tokens": 133607.0,
"repeat_count": 0.0,
- "routers_loss": 0.02900076098740101,
+ "routers_loss": 0.030639523640275,
"skip_count": 0.0,
"step": 82,
"text_loss": 0.282884806394577
@@ -796,13 +796,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.00016600000000000002,
- "loss": 0.1251,
+ "loss": 0.1254,
"macro_f1": 0.3272727429866791,
"num_tokens": 136694.0,
"repeat_count": 0.0,
- "routers_loss": 0.0763339251279831,
+ "routers_loss": 0.07906441390514374,
"skip_count": 1.0,
"step": 84,
"text_loss": 0.459094375371933
@@ -817,11 +817,11 @@
"f1_skip": 0.0,
"grad_norm": 0.212890625,
"learning_rate": 0.00017,
- "loss": 0.1064,
+ "loss": 0.1071,
"macro_f1": 0.3144654333591461,
"num_tokens": 139966.0,
"repeat_count": 1.0,
- "routers_loss": 0.13191410899162292,
+ "routers_loss": 0.1124570444226265,
"skip_count": 2.0,
"step": 86,
"text_loss": 0.29985448718070984
@@ -834,13 +834,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.25390625,
"learning_rate": 0.000174,
- "loss": 0.1055,
+ "loss": 0.1031,
"macro_f1": 0.32098764181137085,
"num_tokens": 142788.0,
"repeat_count": 2.0,
- "routers_loss": 0.21200031042099,
+ "routers_loss": 0.1966402679681778,
"skip_count": 0.0,
"step": 88,
"text_loss": 0.6435291767120361
@@ -853,13 +853,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.318359375,
+ "grad_norm": 0.349609375,
"learning_rate": 0.000178,
- "loss": 0.0971,
+ "loss": 0.0963,
"macro_f1": 0.3333333432674408,
"num_tokens": 146192.0,
"repeat_count": 0.0,
- "routers_loss": 0.031911369413137436,
+ "routers_loss": 0.0325632207095623,
"skip_count": 0.0,
"step": 90,
"text_loss": 0.35170626640319824
@@ -872,13 +872,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.2265625,
"learning_rate": 0.000182,
- "loss": 0.1056,
+ "loss": 0.1073,
"macro_f1": 0.32098764181137085,
"num_tokens": 149792.0,
"repeat_count": 1.0,
- "routers_loss": 0.14131835103034973,
+ "routers_loss": 0.15115146338939667,
"skip_count": 1.0,
"step": 92,
"text_loss": 0.83159339427948
@@ -891,13 +891,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.205078125,
"learning_rate": 0.000186,
- "loss": 0.1059,
+ "loss": 0.1073,
"macro_f1": 0.3333333432674408,
"num_tokens": 152766.0,
"repeat_count": 0.0,
- "routers_loss": 0.04137955233454704,
+ "routers_loss": 0.043313540518283844,
"skip_count": 0.0,
"step": 94,
"text_loss": 0.49707934260368347
@@ -910,13 +910,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.00019,
- "loss": 0.0934,
+ "loss": 0.0947,
"macro_f1": 0.3333333432674408,
"num_tokens": 156112.0,
"repeat_count": 0.0,
- "routers_loss": 0.03163003921508789,
+ "routers_loss": 0.032021280378103256,
"skip_count": 0.0,
"step": 96,
"text_loss": 0.27608928084373474
@@ -929,13 +929,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.2099609375,
"learning_rate": 0.000194,
- "loss": 0.0847,
+ "loss": 0.0846,
"macro_f1": 0.3076923191547394,
"num_tokens": 159454.0,
"repeat_count": 2.0,
- "routers_loss": 0.2567490339279175,
+ "routers_loss": 0.24473154544830322,
"skip_count": 2.0,
"step": 98,
"text_loss": 0.6026689410209656
@@ -948,13 +948,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.30859375,
+ "grad_norm": 0.271484375,
"learning_rate": 0.00019800000000000002,
- "loss": 0.1077,
+ "loss": 0.1028,
"macro_f1": 0.32098764181137085,
"num_tokens": 163661.0,
"repeat_count": 0.0,
- "routers_loss": 0.11468870937824249,
+ "routers_loss": 0.11468276381492615,
"skip_count": 2.0,
"step": 100,
"text_loss": 0.46733155846595764
@@ -967,13 +967,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.17578125,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.000202,
- "loss": 0.1131,
+ "loss": 0.1089,
"macro_f1": 0.3333333432674408,
"num_tokens": 167134.0,
"repeat_count": 0.0,
- "routers_loss": 0.02124219387769699,
+ "routers_loss": 0.021144939586520195,
"skip_count": 0.0,
"step": 102,
"text_loss": 0.6362994909286499
@@ -986,13 +986,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000206,
- "loss": 0.0624,
+ "loss": 0.0621,
"macro_f1": 0.3272727429866791,
"num_tokens": 170433.0,
"repeat_count": 0.0,
- "routers_loss": 0.06983796507120132,
+ "routers_loss": 0.06594710797071457,
"skip_count": 1.0,
"step": 104,
"text_loss": 0.4515477120876312
@@ -1005,13 +1005,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.00021,
- "loss": 0.0951,
+ "loss": 0.0929,
"macro_f1": 0.3333333432674408,
"num_tokens": 173387.0,
"repeat_count": 0.0,
- "routers_loss": 0.03467355668544769,
+ "routers_loss": 0.032923027873039246,
"skip_count": 0.0,
"step": 106,
"text_loss": 0.6638453006744385
@@ -1024,13 +1024,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.240234375,
"learning_rate": 0.000214,
- "loss": 0.0881,
+ "loss": 0.0883,
"macro_f1": 0.3272727429866791,
"num_tokens": 176170.0,
"repeat_count": 1.0,
- "routers_loss": 0.08142061531543732,
+ "routers_loss": 0.08034781366586685,
"skip_count": 0.0,
"step": 108,
"text_loss": 1.186936855316162
@@ -1043,13 +1043,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.267578125,
"learning_rate": 0.000218,
- "loss": 0.0795,
+ "loss": 0.0794,
"macro_f1": 0.3272727429866791,
"num_tokens": 179877.0,
"repeat_count": 0.0,
- "routers_loss": 0.08327355235815048,
+ "routers_loss": 0.07814185321331024,
"skip_count": 1.0,
"step": 110,
"text_loss": 0.5488709211349487
@@ -1062,13 +1062,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000222,
- "loss": 0.0943,
+ "loss": 0.0946,
"macro_f1": 0.3333333432674408,
"num_tokens": 182726.0,
"repeat_count": 0.0,
- "routers_loss": 0.019890006631612778,
+ "routers_loss": 0.01884695515036583,
"skip_count": 0.0,
"step": 112,
"text_loss": 0.5195863842964172
@@ -1081,13 +1081,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2001953125,
+ "grad_norm": 0.19921875,
"learning_rate": 0.00022600000000000002,
- "loss": 0.0933,
+ "loss": 0.0974,
"macro_f1": 0.32098764181137085,
"num_tokens": 185624.0,
"repeat_count": 0.0,
- "routers_loss": 0.09992363303899765,
+ "routers_loss": 0.09657823294401169,
"skip_count": 2.0,
"step": 114,
"text_loss": 0.43858134746551514
@@ -1100,13 +1100,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.3046875,
"learning_rate": 0.00023,
- "loss": 0.0762,
+ "loss": 0.0753,
"macro_f1": 0.3333333432674408,
"num_tokens": 188155.0,
"repeat_count": 0.0,
- "routers_loss": 0.014119029976427555,
+ "routers_loss": 0.01463601179420948,
"skip_count": 0.0,
"step": 116,
"text_loss": 0.392981618642807
@@ -1119,13 +1119,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.423828125,
+ "grad_norm": 0.439453125,
"learning_rate": 0.00023400000000000002,
- "loss": 0.0842,
+ "loss": 0.0843,
"macro_f1": 0.3333333432674408,
"num_tokens": 190970.0,
"repeat_count": 0.0,
- "routers_loss": 0.03976766765117645,
+ "routers_loss": 0.03859659656882286,
"skip_count": 0.0,
"step": 118,
"text_loss": 0.309179425239563
@@ -1138,13 +1138,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.2255859375,
"learning_rate": 0.00023799999999999998,
- "loss": 0.0517,
+ "loss": 0.053,
"macro_f1": 0.3333333432674408,
"num_tokens": 193988.0,
"repeat_count": 0.0,
- "routers_loss": 0.017428619787096977,
+ "routers_loss": 0.019092386588454247,
"skip_count": 0.0,
"step": 120,
"text_loss": 0.48543134331703186
@@ -1157,13 +1157,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.296875,
+ "grad_norm": 0.35546875,
"learning_rate": 0.000242,
- "loss": 0.1134,
+ "loss": 0.1203,
"macro_f1": 0.3272727429866791,
"num_tokens": 196475.0,
"repeat_count": 0.0,
- "routers_loss": 0.06965513527393341,
+ "routers_loss": 0.0619138665497303,
"skip_count": 1.0,
"step": 122,
"text_loss": 0.4615364074707031
@@ -1176,13 +1176,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1875,
"learning_rate": 0.000246,
- "loss": 0.0984,
+ "loss": 0.1002,
"macro_f1": 0.3272727429866791,
"num_tokens": 200045.0,
"repeat_count": 1.0,
- "routers_loss": 0.10476501286029816,
+ "routers_loss": 0.09752107411623001,
"skip_count": 0.0,
"step": 124,
"text_loss": 0.15802054107189178
@@ -1195,13 +1195,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.00025,
- "loss": 0.0771,
+ "loss": 0.0773,
"macro_f1": 0.3333333432674408,
"num_tokens": 203214.0,
"repeat_count": 0.0,
- "routers_loss": 0.028317544609308243,
+ "routers_loss": 0.02896115928888321,
"skip_count": 0.0,
"step": 126,
"text_loss": 0.4543360471725464
@@ -1214,13 +1214,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.390625,
+ "grad_norm": 0.4296875,
"learning_rate": 0.000254,
- "loss": 0.0933,
+ "loss": 0.0973,
"macro_f1": 0.3333333432674408,
"num_tokens": 206168.0,
"repeat_count": 0.0,
- "routers_loss": 0.012766432017087936,
+ "routers_loss": 0.011423567309975624,
"skip_count": 0.0,
"step": 128,
"text_loss": 0.4730179011821747
@@ -1233,13 +1233,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.365234375,
"learning_rate": 0.00025800000000000004,
- "loss": 0.0989,
+ "loss": 0.099,
"macro_f1": 0.3333333432674408,
"num_tokens": 209907.0,
"repeat_count": 0.0,
- "routers_loss": 0.021400077268481255,
+ "routers_loss": 0.01957600563764572,
"skip_count": 0.0,
"step": 130,
"text_loss": 0.45122358202934265
@@ -1252,13 +1252,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.2060546875,
"learning_rate": 0.000262,
- "loss": 0.0873,
+ "loss": 0.0868,
"macro_f1": 0.3272727429866791,
"num_tokens": 213521.0,
"repeat_count": 0.0,
- "routers_loss": 0.05025051161646843,
+ "routers_loss": 0.04882373288273811,
"skip_count": 1.0,
"step": 132,
"text_loss": 0.4341491758823395
@@ -1271,13 +1271,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.000266,
- "loss": 0.085,
+ "loss": 0.0834,
"macro_f1": 0.3333333432674408,
"num_tokens": 216484.0,
"repeat_count": 0.0,
- "routers_loss": 0.017420046031475067,
+ "routers_loss": 0.016083380207419395,
"skip_count": 0.0,
"step": 134,
"text_loss": 0.46990111470222473
@@ -1290,13 +1290,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2041015625,
+ "grad_norm": 0.220703125,
"learning_rate": 0.00027,
- "loss": 0.086,
+ "loss": 0.0863,
"macro_f1": 0.3333333432674408,
"num_tokens": 219398.0,
"repeat_count": 0.0,
- "routers_loss": 0.018217921257019043,
+ "routers_loss": 0.01733536459505558,
"skip_count": 0.0,
"step": 136,
"text_loss": 0.4455361068248749
@@ -1309,13 +1309,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.00027400000000000005,
- "loss": 0.0985,
+ "loss": 0.0997,
"macro_f1": 0.3333333432674408,
"num_tokens": 222430.0,
"repeat_count": 0.0,
- "routers_loss": 0.012350660748779774,
+ "routers_loss": 0.01332803163677454,
"skip_count": 0.0,
"step": 138,
"text_loss": 0.47699397802352905
@@ -1328,13 +1328,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.302734375,
+ "grad_norm": 0.333984375,
"learning_rate": 0.00027800000000000004,
"loss": 0.0922,
"macro_f1": 0.3144654333591461,
"num_tokens": 225458.0,
"repeat_count": 1.0,
- "routers_loss": 0.14993029832839966,
+ "routers_loss": 0.14924728870391846,
"skip_count": 2.0,
"step": 140,
"text_loss": 0.5858222842216492
@@ -1347,13 +1347,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.251953125,
+ "grad_norm": 0.25,
"learning_rate": 0.00028199999999999997,
- "loss": 0.0791,
+ "loss": 0.0798,
"macro_f1": 0.3144654333591461,
"num_tokens": 229365.0,
"repeat_count": 1.0,
- "routers_loss": 0.17921413481235504,
+ "routers_loss": 0.1860177218914032,
"skip_count": 2.0,
"step": 142,
"text_loss": 0.5003137588500977
@@ -1366,13 +1366,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.00028599999999999996,
- "loss": 0.0535,
+ "loss": 0.054,
"macro_f1": 0.32098764181137085,
"num_tokens": 231787.0,
"repeat_count": 1.0,
- "routers_loss": 0.1420905590057373,
+ "routers_loss": 0.16498211026191711,
"skip_count": 1.0,
"step": 144,
"text_loss": 0.5026470422744751
@@ -1385,13 +1385,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.29296875,
+ "grad_norm": 0.306640625,
"learning_rate": 0.00029,
- "loss": 0.0956,
+ "loss": 0.0936,
"macro_f1": 0.32098764181137085,
"num_tokens": 235014.0,
"repeat_count": 1.0,
- "routers_loss": 0.12468750029802322,
+ "routers_loss": 0.11801310628652573,
"skip_count": 1.0,
"step": 146,
"text_loss": 0.611888587474823
@@ -1404,13 +1404,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.000294,
- "loss": 0.0879,
+ "loss": 0.0878,
"macro_f1": 0.3333333432674408,
"num_tokens": 238210.0,
"repeat_count": 0.0,
- "routers_loss": 0.024295611307024956,
+ "routers_loss": 0.02422776259481907,
"skip_count": 0.0,
"step": 148,
"text_loss": 0.2876914143562317
@@ -1423,13 +1423,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.000298,
- "loss": 0.087,
+ "loss": 0.0858,
"macro_f1": 0.32098764181137085,
"num_tokens": 241582.0,
"repeat_count": 0.0,
- "routers_loss": 0.07016433775424957,
+ "routers_loss": 0.07282499223947525,
"skip_count": 2.0,
"step": 150,
"text_loss": 0.3919292390346527
@@ -1442,13 +1442,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3828125,
+ "grad_norm": 0.37890625,
"learning_rate": 0.000302,
- "loss": 0.0782,
+ "loss": 0.0797,
"macro_f1": 0.32098764181137085,
"num_tokens": 244621.0,
"repeat_count": 1.0,
- "routers_loss": 0.18942493200302124,
+ "routers_loss": 0.20659038424491882,
"skip_count": 1.0,
"step": 152,
"text_loss": 0.4294498860836029
@@ -1461,13 +1461,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1787109375,
"learning_rate": 0.000306,
- "loss": 0.0713,
+ "loss": 0.072,
"macro_f1": 0.3333333432674408,
"num_tokens": 247833.0,
"repeat_count": 0.0,
- "routers_loss": 0.02319060079753399,
+ "routers_loss": 0.02428400330245495,
"skip_count": 0.0,
"step": 154,
"text_loss": 0.5930765867233276
@@ -1480,13 +1480,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15234375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.00031,
- "loss": 0.0778,
+ "loss": 0.0772,
"macro_f1": 0.3333333432674408,
"num_tokens": 251349.0,
"repeat_count": 0.0,
- "routers_loss": 0.01764747127890587,
+ "routers_loss": 0.0167869683355093,
"skip_count": 0.0,
"step": 156,
"text_loss": 0.41063904762268066
@@ -1499,13 +1499,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.000314,
- "loss": 0.0829,
+ "loss": 0.0821,
"macro_f1": 0.3333333432674408,
"num_tokens": 254886.0,
"repeat_count": 0.0,
- "routers_loss": 0.02268100716173649,
+ "routers_loss": 0.02531604655086994,
"skip_count": 0.0,
"step": 158,
"text_loss": 0.6739020347595215
@@ -1518,13 +1518,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.201171875,
"learning_rate": 0.00031800000000000003,
- "loss": 0.0889,
+ "loss": 0.09,
"macro_f1": 0.3333333432674408,
"num_tokens": 258260.0,
"repeat_count": 0.0,
- "routers_loss": 0.016952091827988625,
+ "routers_loss": 0.017772775143384933,
"skip_count": 0.0,
"step": 160,
"text_loss": 0.46873849630355835
@@ -1537,13 +1537,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2216796875,
+ "grad_norm": 0.224609375,
"learning_rate": 0.000322,
- "loss": 0.0923,
+ "loss": 0.0893,
"macro_f1": 0.3272727429866791,
"num_tokens": 261846.0,
"repeat_count": 0.0,
- "routers_loss": 0.03669808804988861,
+ "routers_loss": 0.034902360290288925,
"skip_count": 1.0,
"step": 162,
"text_loss": 0.3727971017360687
@@ -1556,13 +1556,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000326,
- "loss": 0.0769,
+ "loss": 0.076,
"macro_f1": 0.3333333432674408,
"num_tokens": 264348.0,
"repeat_count": 0.0,
- "routers_loss": 0.012101447209715843,
+ "routers_loss": 0.013553355820477009,
"skip_count": 0.0,
"step": 164,
"text_loss": 0.5798237323760986
@@ -1575,13 +1575,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.37109375,
+ "grad_norm": 0.408203125,
"learning_rate": 0.00033,
- "loss": 0.0897,
+ "loss": 0.0926,
"macro_f1": 0.32098764181137085,
"num_tokens": 267479.0,
"repeat_count": 1.0,
- "routers_loss": 0.1562056541442871,
+ "routers_loss": 0.13571743667125702,
"skip_count": 1.0,
"step": 166,
"text_loss": 0.8084776997566223
@@ -1594,13 +1594,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2431640625,
"learning_rate": 0.00033400000000000004,
- "loss": 0.0829,
+ "loss": 0.0817,
"macro_f1": 0.32098764181137085,
"num_tokens": 270268.0,
"repeat_count": 2.0,
- "routers_loss": 0.20807914435863495,
+ "routers_loss": 0.19884146749973297,
"skip_count": 0.0,
"step": 168,
"text_loss": 0.7366134524345398
@@ -1613,13 +1613,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.267578125,
"learning_rate": 0.00033800000000000003,
- "loss": 0.0987,
+ "loss": 0.1022,
"macro_f1": 0.32098764181137085,
"num_tokens": 273518.0,
"repeat_count": 1.0,
- "routers_loss": 0.1530539095401764,
+ "routers_loss": 0.15469175577163696,
"skip_count": 1.0,
"step": 170,
"text_loss": 0.27204006910324097
@@ -1632,13 +1632,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000342,
- "loss": 0.087,
+ "loss": 0.0865,
"macro_f1": 0.32098764181137085,
"num_tokens": 277210.0,
"repeat_count": 0.0,
- "routers_loss": 0.08004544675350189,
+ "routers_loss": 0.08603330701589584,
"skip_count": 2.0,
"step": 172,
"text_loss": 0.7137667536735535
@@ -1651,13 +1651,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1767578125,
+ "grad_norm": 0.189453125,
"learning_rate": 0.000346,
- "loss": 0.0916,
+ "loss": 0.0902,
"macro_f1": 0.3076923191547394,
"num_tokens": 280389.0,
"repeat_count": 0.0,
- "routers_loss": 0.19228078424930573,
+ "routers_loss": 0.17851492762565613,
"skip_count": 4.0,
"step": 174,
"text_loss": 0.5148105621337891
@@ -1670,13 +1670,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.00035,
- "loss": 0.0863,
+ "loss": 0.0853,
"macro_f1": 0.3333333432674408,
"num_tokens": 283501.0,
"repeat_count": 0.0,
- "routers_loss": 0.024507170543074608,
+ "routers_loss": 0.021331604570150375,
"skip_count": 0.0,
"step": 176,
"text_loss": 0.301013320684433
@@ -1689,13 +1689,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000354,
- "loss": 0.0898,
+ "loss": 0.0911,
"macro_f1": 0.32098764181137085,
"num_tokens": 287154.0,
"repeat_count": 0.0,
- "routers_loss": 0.05055495724081993,
+ "routers_loss": 0.057273946702480316,
"skip_count": 2.0,
"step": 178,
"text_loss": 0.4740981459617615
@@ -1708,13 +1708,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.240234375,
"learning_rate": 0.000358,
- "loss": 0.0865,
+ "loss": 0.0904,
"macro_f1": 0.3272727429866791,
"num_tokens": 289929.0,
"repeat_count": 0.0,
- "routers_loss": 0.03999815881252289,
+ "routers_loss": 0.04116598889231682,
"skip_count": 1.0,
"step": 180,
"text_loss": 0.4838573932647705
@@ -1727,13 +1727,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.14453125,
"learning_rate": 0.000362,
- "loss": 0.0983,
+ "loss": 0.0991,
"macro_f1": 0.3333333432674408,
"num_tokens": 294293.0,
"repeat_count": 0.0,
- "routers_loss": 0.025158070027828217,
+ "routers_loss": 0.027111956849694252,
"skip_count": 0.0,
"step": 182,
"text_loss": 0.7495553493499756
@@ -1746,32 +1746,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.158203125,
"learning_rate": 0.000366,
- "loss": 0.1015,
+ "loss": 0.1038,
"macro_f1": 0.3333333432674408,
"num_tokens": 297730.0,
"repeat_count": 0.0,
- "routers_loss": 0.01825365424156189,
+ "routers_loss": 0.019166452810168266,
"skip_count": 0.0,
"step": 184,
"text_loss": 0.534831166267395
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 0.8734957440563546,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2236328125,
"learning_rate": 0.00037,
- "loss": 0.0736,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0784,
+ "macro_f1": 0.5427350401878357,
"num_tokens": 300593.0,
"repeat_count": 1.0,
- "routers_loss": 0.22729666531085968,
+ "routers_loss": 0.2349659502506256,
"skip_count": 2.0,
"step": 186,
"text_loss": 0.3549048602581024
@@ -1784,13 +1784,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.2041015625,
"learning_rate": 0.000374,
- "loss": 0.0838,
+ "loss": 0.0827,
"macro_f1": 0.3076923191547394,
"num_tokens": 303456.0,
"repeat_count": 2.0,
- "routers_loss": 0.24516475200653076,
+ "routers_loss": 0.22502389550209045,
"skip_count": 2.0,
"step": 188,
"text_loss": 0.8837642073631287
@@ -1803,13 +1803,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2470703125,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000378,
- "loss": 0.1056,
+ "loss": 0.1085,
"macro_f1": 0.3272727429866791,
"num_tokens": 306241.0,
"repeat_count": 1.0,
- "routers_loss": 0.1307530701160431,
+ "routers_loss": 0.12291611731052399,
"skip_count": 0.0,
"step": 190,
"text_loss": 0.73353511095047
@@ -1822,13 +1822,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.15625,
"learning_rate": 0.000382,
- "loss": 0.0961,
+ "loss": 0.0969,
"macro_f1": 0.3272727429866791,
"num_tokens": 310606.0,
"repeat_count": 0.0,
- "routers_loss": 0.06541688740253448,
+ "routers_loss": 0.055988848209381104,
"skip_count": 1.0,
"step": 192,
"text_loss": 0.6261917352676392
@@ -1841,13 +1841,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.333984375,
+ "grad_norm": 0.34375,
"learning_rate": 0.000386,
- "loss": 0.1058,
+ "loss": 0.1055,
"macro_f1": 0.3144654333591461,
"num_tokens": 313564.0,
"repeat_count": 0.0,
- "routers_loss": 0.12492545694112778,
+ "routers_loss": 0.12363404780626297,
"skip_count": 3.0,
"step": 194,
"text_loss": 0.2790874242782593
@@ -1860,13 +1860,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28515625,
+ "grad_norm": 0.27734375,
"learning_rate": 0.00039000000000000005,
- "loss": 0.0966,
+ "loss": 0.0964,
"macro_f1": 0.3076923191547394,
"num_tokens": 316958.0,
"repeat_count": 2.0,
- "routers_loss": 0.2838033139705658,
+ "routers_loss": 0.2718356251716614,
"skip_count": 2.0,
"step": 196,
"text_loss": 0.14428086578845978
@@ -1881,11 +1881,11 @@
"f1_skip": 0.0,
"grad_norm": 0.2021484375,
"learning_rate": 0.00039400000000000004,
- "loss": 0.0929,
+ "loss": 0.0917,
"macro_f1": 0.32098764181137085,
"num_tokens": 320103.0,
"repeat_count": 0.0,
- "routers_loss": 0.07692629098892212,
+ "routers_loss": 0.07188102602958679,
"skip_count": 2.0,
"step": 198,
"text_loss": 0.27155816555023193
@@ -1898,13 +1898,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.201171875,
"learning_rate": 0.000398,
"loss": 0.0809,
"macro_f1": 0.32098764181137085,
"num_tokens": 323566.0,
"repeat_count": 1.0,
- "routers_loss": 0.18504399061203003,
+ "routers_loss": 0.18038256466388702,
"skip_count": 1.0,
"step": 200,
"text_loss": 0.8453494310379028
@@ -1917,13 +1917,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.000402,
- "loss": 0.078,
+ "loss": 0.0801,
"macro_f1": 0.3333333432674408,
"num_tokens": 326385.0,
"repeat_count": 0.0,
- "routers_loss": 0.014647359028458595,
+ "routers_loss": 0.014639763161540031,
"skip_count": 0.0,
"step": 202,
"text_loss": 0.5733131766319275
@@ -1936,13 +1936,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2041015625,
+ "grad_norm": 0.21875,
"learning_rate": 0.00040600000000000006,
- "loss": 0.1028,
+ "loss": 0.104,
"macro_f1": 0.3333333432674408,
"num_tokens": 329266.0,
"repeat_count": 0.0,
- "routers_loss": 0.017848484218120575,
+ "routers_loss": 0.015269627794623375,
"skip_count": 0.0,
"step": 204,
"text_loss": 0.7355639934539795
@@ -1955,13 +1955,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.27734375,
"learning_rate": 0.00041,
- "loss": 0.0832,
+ "loss": 0.0833,
"macro_f1": 0.3333333432674408,
"num_tokens": 332984.0,
"repeat_count": 0.0,
- "routers_loss": 0.01900508813560009,
+ "routers_loss": 0.018046971410512924,
"skip_count": 0.0,
"step": 206,
"text_loss": 0.587641179561615
@@ -1974,13 +1974,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.166015625,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000414,
"loss": 0.0588,
"macro_f1": 0.3272727429866791,
"num_tokens": 335739.0,
"repeat_count": 1.0,
- "routers_loss": 0.13018715381622314,
+ "routers_loss": 0.12791286408901215,
"skip_count": 0.0,
"step": 208,
"text_loss": 0.6538406610488892
@@ -1993,13 +1993,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.24609375,
"learning_rate": 0.00041799999999999997,
- "loss": 0.0697,
+ "loss": 0.0732,
"macro_f1": 0.3272727429866791,
"num_tokens": 338966.0,
"repeat_count": 0.0,
- "routers_loss": 0.055288366973400116,
+ "routers_loss": 0.050490595400333405,
"skip_count": 1.0,
"step": 210,
"text_loss": 0.4188295602798462
@@ -2012,13 +2012,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000422,
- "loss": 0.0576,
+ "loss": 0.0588,
"macro_f1": 0.3144654333591461,
"num_tokens": 342063.0,
"repeat_count": 0.0,
- "routers_loss": 0.10952572524547577,
+ "routers_loss": 0.11652113497257233,
"skip_count": 3.0,
"step": 212,
"text_loss": 0.21822240948677063
@@ -2031,13 +2031,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.197265625,
+ "grad_norm": 0.2060546875,
"learning_rate": 0.000426,
- "loss": 0.062,
+ "loss": 0.0621,
"macro_f1": 0.3333333432674408,
"num_tokens": 344887.0,
"repeat_count": 0.0,
- "routers_loss": 0.02415696159005165,
+ "routers_loss": 0.023898238316178322,
"skip_count": 0.0,
"step": 214,
"text_loss": 0.24692800641059875
@@ -2050,13 +2050,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.3671875,
"learning_rate": 0.00043,
- "loss": 0.1011,
+ "loss": 0.1005,
"macro_f1": 0.3272727429866791,
"num_tokens": 348700.0,
"repeat_count": 1.0,
- "routers_loss": 0.06956391036510468,
+ "routers_loss": 0.06414655596017838,
"skip_count": 0.0,
"step": 216,
"text_loss": 0.4744548797607422
@@ -2069,13 +2069,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.00043400000000000003,
- "loss": 0.076,
+ "loss": 0.0753,
"macro_f1": 0.32098764181137085,
"num_tokens": 351507.0,
"repeat_count": 1.0,
- "routers_loss": 0.1140352189540863,
+ "routers_loss": 0.11702914535999298,
"skip_count": 1.0,
"step": 218,
"text_loss": 0.5614864826202393
@@ -2090,11 +2090,11 @@
"f1_skip": 0.0,
"grad_norm": 0.189453125,
"learning_rate": 0.000438,
- "loss": 0.0788,
+ "loss": 0.0792,
"macro_f1": 0.3333333432674408,
"num_tokens": 354484.0,
"repeat_count": 0.0,
- "routers_loss": 0.011621571145951748,
+ "routers_loss": 0.014991643838584423,
"skip_count": 0.0,
"step": 220,
"text_loss": 0.47209832072257996
@@ -2107,13 +2107,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.251953125,
"learning_rate": 0.000442,
"loss": 0.106,
"macro_f1": 0.3272727429866791,
"num_tokens": 357954.0,
"repeat_count": 0.0,
- "routers_loss": 0.05813701078295708,
+ "routers_loss": 0.04747112840414047,
"skip_count": 1.0,
"step": 222,
"text_loss": 0.2968728244304657
@@ -2126,13 +2126,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.357421875,
+ "grad_norm": 0.40234375,
"learning_rate": 0.000446,
- "loss": 0.0827,
+ "loss": 0.0853,
"macro_f1": 0.32098764181137085,
"num_tokens": 360547.0,
"repeat_count": 0.0,
- "routers_loss": 0.0646885335445404,
+ "routers_loss": 0.06754162162542343,
"skip_count": 2.0,
"step": 224,
"text_loss": 0.2364148646593094
@@ -2145,13 +2145,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.244140625,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.00045000000000000004,
- "loss": 0.1011,
+ "loss": 0.1016,
"macro_f1": 0.3272727429866791,
"num_tokens": 364529.0,
"repeat_count": 0.0,
- "routers_loss": 0.07224348932504654,
+ "routers_loss": 0.07830183953046799,
"skip_count": 1.0,
"step": 226,
"text_loss": 0.4787476360797882
@@ -2164,13 +2164,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.1953125,
"learning_rate": 0.00045400000000000003,
- "loss": 0.0781,
+ "loss": 0.0792,
"macro_f1": 0.3333333432674408,
"num_tokens": 367683.0,
"repeat_count": 0.0,
- "routers_loss": 0.015971746295690536,
+ "routers_loss": 0.015735948458313942,
"skip_count": 0.0,
"step": 228,
"text_loss": 0.37148505449295044
@@ -2183,13 +2183,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.25,
"learning_rate": 0.000458,
- "loss": 0.099,
+ "loss": 0.0995,
"macro_f1": 0.3333333432674408,
"num_tokens": 371402.0,
"repeat_count": 0.0,
- "routers_loss": 0.017818331718444824,
+ "routers_loss": 0.013354359194636345,
"skip_count": 0.0,
"step": 230,
"text_loss": 0.7464763522148132
@@ -2202,13 +2202,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.000462,
- "loss": 0.0757,
+ "loss": 0.0731,
"macro_f1": 0.3333333432674408,
"num_tokens": 374587.0,
"repeat_count": 0.0,
- "routers_loss": 0.01582280732691288,
+ "routers_loss": 0.013763721100986004,
"skip_count": 0.0,
"step": 232,
"text_loss": 0.8754443526268005
@@ -2221,13 +2221,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.42578125,
+ "grad_norm": 0.3984375,
"learning_rate": 0.00046600000000000005,
- "loss": 0.0876,
+ "loss": 0.0861,
"macro_f1": 0.3333333432674408,
"num_tokens": 377513.0,
"repeat_count": 0.0,
- "routers_loss": 0.011417915113270283,
+ "routers_loss": 0.010075435042381287,
"skip_count": 0.0,
"step": 234,
"text_loss": 0.31534913182258606
@@ -2240,13 +2240,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.17578125,
"learning_rate": 0.00047,
- "loss": 0.0801,
+ "loss": 0.0791,
"macro_f1": 0.3272727429866791,
"num_tokens": 380736.0,
"repeat_count": 0.0,
- "routers_loss": 0.05787832289934158,
+ "routers_loss": 0.059825167059898376,
"skip_count": 1.0,
"step": 236,
"text_loss": 0.5936337113380432
@@ -2259,13 +2259,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.236328125,
+ "grad_norm": 0.267578125,
"learning_rate": 0.000474,
- "loss": 0.0508,
+ "loss": 0.0514,
"macro_f1": 0.32098764181137085,
"num_tokens": 383236.0,
"repeat_count": 0.0,
- "routers_loss": 0.09476690739393234,
+ "routers_loss": 0.09134846180677414,
"skip_count": 2.0,
"step": 238,
"text_loss": 0.5976157784461975
@@ -2278,13 +2278,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.208984375,
"learning_rate": 0.00047799999999999996,
- "loss": 0.0833,
+ "loss": 0.0858,
"macro_f1": 0.32098764181137085,
"num_tokens": 385778.0,
"repeat_count": 1.0,
- "routers_loss": 0.1099705696105957,
+ "routers_loss": 0.11989791691303253,
"skip_count": 1.0,
"step": 240,
"text_loss": 0.3554210960865021
@@ -2297,13 +2297,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.171875,
"learning_rate": 0.000482,
- "loss": 0.0745,
+ "loss": 0.0734,
"macro_f1": 0.3333333432674408,
"num_tokens": 388777.0,
"repeat_count": 0.0,
- "routers_loss": 0.01269970741122961,
+ "routers_loss": 0.013591105118393898,
"skip_count": 0.0,
"step": 242,
"text_loss": 0.4829460382461548
@@ -2316,13 +2316,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11962890625,
+ "grad_norm": 0.12060546875,
"learning_rate": 0.000486,
- "loss": 0.061,
+ "loss": 0.0625,
"macro_f1": 0.32098764181137085,
"num_tokens": 391797.0,
"repeat_count": 0.0,
- "routers_loss": 0.08505752682685852,
+ "routers_loss": 0.0920003354549408,
"skip_count": 2.0,
"step": 244,
"text_loss": 0.3085818886756897
@@ -2335,13 +2335,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.00049,
- "loss": 0.0504,
+ "loss": 0.0501,
"macro_f1": 0.3333333432674408,
"num_tokens": 396485.0,
"repeat_count": 0.0,
- "routers_loss": 0.012750142253935337,
+ "routers_loss": 0.0129330949857831,
"skip_count": 0.0,
"step": 246,
"text_loss": 0.42803969979286194
@@ -2354,13 +2354,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.291015625,
+ "grad_norm": 0.296875,
"learning_rate": 0.000494,
- "loss": 0.0962,
+ "loss": 0.0945,
"macro_f1": 0.3144654333591461,
"num_tokens": 399923.0,
"repeat_count": 0.0,
- "routers_loss": 0.11287309974431992,
+ "routers_loss": 0.10677755624055862,
"skip_count": 3.0,
"step": 248,
"text_loss": 0.2908555567264557
@@ -2373,32 +2373,32 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.203125,
"learning_rate": 0.000498,
- "loss": 0.0821,
+ "loss": 0.0812,
"macro_f1": 0.3144654333591461,
"num_tokens": 403647.0,
"repeat_count": 0.0,
- "routers_loss": 0.1486474722623825,
+ "routers_loss": 0.1504337340593338,
"skip_count": 3.0,
"step": 250,
"text_loss": 0.333095908164978
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 1.183152333431171,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
+ "f1_skip": 0.0,
"grad_norm": 0.22265625,
"learning_rate": 0.0005020000000000001,
- "loss": 0.0832,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0828,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 409147.0,
"repeat_count": 0.0,
- "routers_loss": 0.06636594980955124,
+ "routers_loss": 0.06503184884786606,
"skip_count": 2.0,
"step": 252,
"text_loss": 0.16117942333221436
@@ -2411,13 +2411,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.267578125,
+ "grad_norm": 0.287109375,
"learning_rate": 0.000506,
- "loss": 0.1,
+ "loss": 0.0995,
"macro_f1": 0.3333333432674408,
"num_tokens": 412072.0,
"repeat_count": 0.0,
- "routers_loss": 0.015062150545418262,
+ "routers_loss": 0.016280122101306915,
"skip_count": 0.0,
"step": 254,
"text_loss": 0.4217492640018463
@@ -2430,13 +2430,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2138671875,
+ "grad_norm": 0.21484375,
"learning_rate": 0.00051,
- "loss": 0.0808,
+ "loss": 0.0803,
"macro_f1": 0.3144654333591461,
"num_tokens": 415052.0,
"repeat_count": 2.0,
- "routers_loss": 0.2051105946302414,
+ "routers_loss": 0.2117508500814438,
"skip_count": 1.0,
"step": 256,
"text_loss": 0.5795308947563171
@@ -2449,13 +2449,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "grad_norm": 0.2421875,
"learning_rate": 0.000514,
- "loss": 0.068,
+ "loss": 0.0668,
"macro_f1": 0.3272727429866791,
"num_tokens": 418099.0,
"repeat_count": 1.0,
- "routers_loss": 0.1467045396566391,
+ "routers_loss": 0.15002092719078064,
"skip_count": 0.0,
"step": 258,
"text_loss": 0.4840938448905945
@@ -2468,13 +2468,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.000518,
- "loss": 0.0543,
+ "loss": 0.0538,
"macro_f1": 0.3333333432674408,
"num_tokens": 422526.0,
"repeat_count": 0.0,
- "routers_loss": 0.013022038154304028,
+ "routers_loss": 0.012834074907004833,
"skip_count": 0.0,
"step": 260,
"text_loss": 0.36141225695610046
@@ -2487,13 +2487,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.000522,
- "loss": 0.0848,
+ "loss": 0.085,
"macro_f1": 0.3076923191547394,
"num_tokens": 425765.0,
"repeat_count": 2.0,
- "routers_loss": 0.2575930058956146,
+ "routers_loss": 0.23808011412620544,
"skip_count": 2.0,
"step": 262,
"text_loss": 0.27572691440582275
@@ -2506,13 +2506,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000526,
- "loss": 0.07,
+ "loss": 0.0708,
"macro_f1": 0.3272727429866791,
"num_tokens": 429048.0,
"repeat_count": 0.0,
- "routers_loss": 0.0558602549135685,
+ "routers_loss": 0.055687375366687775,
"skip_count": 1.0,
"step": 264,
"text_loss": 0.37020301818847656
@@ -2525,13 +2525,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.0005300000000000001,
- "loss": 0.082,
+ "loss": 0.0839,
"macro_f1": 0.3272727429866791,
"num_tokens": 431784.0,
"repeat_count": 0.0,
- "routers_loss": 0.09126655012369156,
+ "routers_loss": 0.0872957780957222,
"skip_count": 1.0,
"step": 266,
"text_loss": 0.5937283039093018
@@ -2544,13 +2544,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.263671875,
"learning_rate": 0.0005340000000000001,
- "loss": 0.0764,
+ "loss": 0.0733,
"macro_f1": 0.32098764181137085,
"num_tokens": 434297.0,
"repeat_count": 2.0,
- "routers_loss": 0.24805288016796112,
+ "routers_loss": 0.23507654666900635,
"skip_count": 0.0,
"step": 268,
"text_loss": 0.3367372453212738
@@ -2563,13 +2563,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.2431640625,
"learning_rate": 0.0005380000000000001,
- "loss": 0.0686,
+ "loss": 0.0708,
"macro_f1": 0.32098764181137085,
"num_tokens": 437586.0,
"repeat_count": 0.0,
- "routers_loss": 0.13135533034801483,
+ "routers_loss": 0.12860390543937683,
"skip_count": 2.0,
"step": 270,
"text_loss": 0.7149854302406311
@@ -2582,13 +2582,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.2451171875,
"learning_rate": 0.0005420000000000001,
- "loss": 0.1083,
+ "loss": 0.1072,
"macro_f1": 0.3272727429866791,
"num_tokens": 440649.0,
"repeat_count": 0.0,
- "routers_loss": 0.04991440102458,
+ "routers_loss": 0.044308312237262726,
"skip_count": 1.0,
"step": 272,
"text_loss": 0.26778292655944824
@@ -2601,13 +2601,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.455078125,
+ "grad_norm": 0.44921875,
"learning_rate": 0.000546,
- "loss": 0.0991,
+ "loss": 0.0938,
"macro_f1": 0.3144654333591461,
"num_tokens": 443907.0,
"repeat_count": 0.0,
- "routers_loss": 0.12236632406711578,
+ "routers_loss": 0.11514109373092651,
"skip_count": 3.0,
"step": 274,
"text_loss": 0.23578761518001556
@@ -2620,13 +2620,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.25,
+ "grad_norm": 0.2578125,
"learning_rate": 0.00055,
- "loss": 0.0936,
+ "loss": 0.0932,
"macro_f1": 0.5492662787437439,
"num_tokens": 447147.0,
"repeat_count": 0.0,
- "routers_loss": 0.053506772965192795,
+ "routers_loss": 0.055705297738313675,
"skip_count": 2.0,
"step": 276,
"text_loss": 0.2513524889945984
@@ -2639,13 +2639,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.265625,
+ "grad_norm": 0.29296875,
"learning_rate": 0.000554,
- "loss": 0.066,
+ "loss": 0.0667,
"macro_f1": 0.32098764181137085,
"num_tokens": 450032.0,
"repeat_count": 0.0,
- "routers_loss": 0.13446088135242462,
+ "routers_loss": 0.13778971135616302,
"skip_count": 2.0,
"step": 278,
"text_loss": 0.4857243597507477
@@ -2658,32 +2658,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000558,
- "loss": 0.0682,
+ "loss": 0.0672,
"macro_f1": 0.3272727429866791,
"num_tokens": 453195.0,
"repeat_count": 1.0,
- "routers_loss": 0.07270720601081848,
+ "routers_loss": 0.0700262188911438,
"skip_count": 0.0,
"step": 280,
"text_loss": 0.7589789628982544
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 1.3240387437628411,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.28125,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25,
"learning_rate": 0.0005620000000000001,
- "loss": 0.0648,
- "macro_f1": 0.5427350401878357,
+ "loss": 0.0603,
+ "macro_f1": 0.3144654333591461,
"num_tokens": 455942.0,
"repeat_count": 1.0,
- "routers_loss": 0.13866399228572845,
+ "routers_loss": 0.11706235259771347,
"skip_count": 2.0,
"step": 282,
"text_loss": 0.4783432185649872
@@ -2696,13 +2696,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.236328125,
+ "grad_norm": 0.265625,
"learning_rate": 0.000566,
- "loss": 0.0782,
+ "loss": 0.0793,
"macro_f1": 0.3272727429866791,
"num_tokens": 458932.0,
"repeat_count": 0.0,
- "routers_loss": 0.0645354762673378,
+ "routers_loss": 0.07073967158794403,
"skip_count": 1.0,
"step": 284,
"text_loss": 0.7117193937301636
@@ -2715,13 +2715,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.00057,
- "loss": 0.0892,
+ "loss": 0.0915,
"macro_f1": 0.3272727429866791,
"num_tokens": 462650.0,
"repeat_count": 0.0,
- "routers_loss": 0.05967628210783005,
+ "routers_loss": 0.05301115661859512,
"skip_count": 1.0,
"step": 286,
"text_loss": 0.4175460636615753
@@ -2734,13 +2734,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23828125,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000574,
- "loss": 0.0676,
+ "loss": 0.0675,
"macro_f1": 0.3272727429866791,
"num_tokens": 466290.0,
"repeat_count": 0.0,
- "routers_loss": 0.06438407301902771,
+ "routers_loss": 0.06356479972600937,
"skip_count": 1.0,
"step": 288,
"text_loss": 0.5832946300506592
@@ -2753,13 +2753,13 @@
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.28515625,
"learning_rate": 0.000578,
- "loss": 0.0781,
+ "loss": 0.0805,
"macro_f1": 0.3006536066532135,
"num_tokens": 469296.0,
"repeat_count": 1.0,
- "routers_loss": 0.21225209534168243,
+ "routers_loss": 0.21032999455928802,
"skip_count": 3.0,
"step": 290,
"text_loss": 0.36023473739624023
@@ -2772,13 +2772,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.244140625,
+ "grad_norm": 0.27734375,
"learning_rate": 0.0005819999999999999,
- "loss": 0.0664,
+ "loss": 0.0685,
"macro_f1": 0.32098764181137085,
"num_tokens": 472272.0,
"repeat_count": 1.0,
- "routers_loss": 0.08085516840219498,
+ "routers_loss": 0.08062280714511871,
"skip_count": 1.0,
"step": 292,
"text_loss": 0.37197956442832947
@@ -2791,13 +2791,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.28125,
"learning_rate": 0.0005859999999999999,
- "loss": 0.0874,
+ "loss": 0.0878,
"macro_f1": 0.32098764181137085,
"num_tokens": 475864.0,
"repeat_count": 0.0,
- "routers_loss": 0.05378658324480057,
+ "routers_loss": 0.05023600533604622,
"skip_count": 2.0,
"step": 294,
"text_loss": 0.4765273630619049
@@ -2810,13 +2810,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.00059,
- "loss": 0.0715,
+ "loss": 0.0728,
"macro_f1": 0.3333333432674408,
"num_tokens": 478916.0,
"repeat_count": 0.0,
- "routers_loss": 0.01145261898636818,
+ "routers_loss": 0.011689410544931889,
"skip_count": 0.0,
"step": 296,
"text_loss": 0.5878773927688599
@@ -2831,11 +2831,11 @@
"f1_skip": 0.0,
"grad_norm": 0.15625,
"learning_rate": 0.000594,
- "loss": 0.0737,
+ "loss": 0.0727,
"macro_f1": 0.3333333432674408,
"num_tokens": 482369.0,
"repeat_count": 0.0,
- "routers_loss": 0.009397956542670727,
+ "routers_loss": 0.010772093199193478,
"skip_count": 0.0,
"step": 298,
"text_loss": 0.4424116313457489
@@ -2848,13 +2848,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.181640625,
"learning_rate": 0.000598,
- "loss": 0.0802,
+ "loss": 0.0787,
"macro_f1": 0.3076923191547394,
"num_tokens": 486049.0,
"repeat_count": 2.0,
- "routers_loss": 0.2389357089996338,
+ "routers_loss": 0.23482851684093475,
"skip_count": 2.0,
"step": 300,
"text_loss": 0.21217775344848633
@@ -2862,18 +2862,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 1.417963017317288,
- "f1_execute": 0.9019607901573181,
+ "f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.000602,
- "loss": 0.0745,
- "macro_f1": 0.3006536066532135,
+ "loss": 0.073,
+ "macro_f1": 0.3076923191547394,
"num_tokens": 488683.0,
"repeat_count": 1.0,
- "routers_loss": 0.18252353370189667,
+ "routers_loss": 0.18843084573745728,
"skip_count": 3.0,
"step": 302,
"text_loss": 0.2109498232603073
@@ -2886,13 +2886,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.27734375,
+ "grad_norm": 0.279296875,
"learning_rate": 0.000606,
- "loss": 0.0935,
+ "loss": 0.0945,
"macro_f1": 0.3144654333591461,
"num_tokens": 492010.0,
"repeat_count": 0.0,
- "routers_loss": 0.18185268342494965,
+ "routers_loss": 0.17861786484718323,
"skip_count": 3.0,
"step": 304,
"text_loss": 0.8446305394172668
@@ -2905,13 +2905,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.00061,
- "loss": 0.0853,
+ "loss": 0.0827,
"macro_f1": 0.3333333432674408,
"num_tokens": 494764.0,
"repeat_count": 0.0,
- "routers_loss": 0.013210167177021503,
+ "routers_loss": 0.014124520123004913,
"skip_count": 0.0,
"step": 306,
"text_loss": 0.742735743522644
@@ -2924,13 +2924,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.26953125,
"learning_rate": 0.000614,
- "loss": 0.1089,
+ "loss": 0.1071,
"macro_f1": 0.3333333432674408,
"num_tokens": 497820.0,
"repeat_count": 0.0,
- "routers_loss": 0.016936838626861572,
+ "routers_loss": 0.017968112602829933,
"skip_count": 0.0,
"step": 308,
"text_loss": 0.28305482864379883
@@ -2943,13 +2943,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.0006180000000000001,
- "loss": 0.077,
+ "loss": 0.0775,
"macro_f1": 0.32098764181137085,
"num_tokens": 500694.0,
"repeat_count": 0.0,
- "routers_loss": 0.08630389720201492,
+ "routers_loss": 0.08593655377626419,
"skip_count": 2.0,
"step": 310,
"text_loss": 0.3496848940849304
@@ -2962,13 +2962,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.19140625,
"learning_rate": 0.000622,
- "loss": 0.0602,
+ "loss": 0.061,
"macro_f1": 0.3333333432674408,
"num_tokens": 503871.0,
"repeat_count": 0.0,
- "routers_loss": 0.013665963895618916,
+ "routers_loss": 0.016449492424726486,
"skip_count": 0.0,
"step": 312,
"text_loss": 0.6691372990608215
@@ -2981,13 +2981,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.205078125,
"learning_rate": 0.000626,
- "loss": 0.0794,
+ "loss": 0.0815,
"macro_f1": 0.3333333432674408,
"num_tokens": 506730.0,
"repeat_count": 0.0,
- "routers_loss": 0.01584783010184765,
+ "routers_loss": 0.014532964676618576,
"skip_count": 0.0,
"step": 314,
"text_loss": 0.6118118166923523
@@ -3000,13 +3000,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.224609375,
+ "grad_norm": 0.2216796875,
"learning_rate": 0.00063,
- "loss": 0.0762,
+ "loss": 0.0742,
"macro_f1": 0.3333333432674408,
"num_tokens": 510323.0,
"repeat_count": 0.0,
- "routers_loss": 0.01368923019617796,
+ "routers_loss": 0.013093139044940472,
"skip_count": 0.0,
"step": 316,
"text_loss": 0.38126271963119507
@@ -3019,13 +3019,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.388671875,
+ "grad_norm": 0.400390625,
"learning_rate": 0.000634,
- "loss": 0.0908,
+ "loss": 0.0915,
"macro_f1": 0.3333333432674408,
"num_tokens": 514075.0,
"repeat_count": 0.0,
- "routers_loss": 0.009135022759437561,
+ "routers_loss": 0.008627045899629593,
"skip_count": 0.0,
"step": 318,
"text_loss": 0.5983037948608398
@@ -3038,13 +3038,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000638,
- "loss": 0.0949,
+ "loss": 0.1008,
"macro_f1": 0.3272727429866791,
"num_tokens": 517418.0,
"repeat_count": 0.0,
- "routers_loss": 0.046641621738672256,
+ "routers_loss": 0.04561378434300423,
"skip_count": 1.0,
"step": 320,
"text_loss": 0.767257034778595
@@ -3052,18 +3052,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.5118872908717347,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23046875,
+ "grad_norm": 0.259765625,
"learning_rate": 0.000642,
- "loss": 0.0925,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0926,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 520443.0,
"repeat_count": 0.0,
- "routers_loss": 0.020637936890125275,
+ "routers_loss": 0.024372953921556473,
"skip_count": 0.0,
"step": 322,
"text_loss": 0.6572105884552002
@@ -3076,13 +3076,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26953125,
+ "grad_norm": 0.30078125,
"learning_rate": 0.000646,
"loss": 0.0822,
"macro_f1": 0.3272727429866791,
"num_tokens": 523317.0,
"repeat_count": 1.0,
- "routers_loss": 0.08289298415184021,
+ "routers_loss": 0.08099937438964844,
"skip_count": 0.0,
"step": 324,
"text_loss": 0.205499529838562
@@ -3090,18 +3090,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.530672145582624,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23828125,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.0006500000000000001,
- "loss": 0.0823,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0809,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 526355.0,
"repeat_count": 0.0,
- "routers_loss": 0.06960040330886841,
+ "routers_loss": 0.0657225176692009,
"skip_count": 1.0,
"step": 326,
"text_loss": 0.2587239742279053
@@ -3114,13 +3114,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1162109375,
+ "grad_norm": 0.111328125,
"learning_rate": 0.0006540000000000001,
- "loss": 0.0799,
+ "loss": 0.0779,
"macro_f1": 0.3333333432674408,
"num_tokens": 529689.0,
"repeat_count": 0.0,
- "routers_loss": 0.02087482251226902,
+ "routers_loss": 0.01849208027124405,
"skip_count": 0.0,
"step": 328,
"text_loss": 0.2172023057937622
@@ -3133,13 +3133,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.1845703125,
"learning_rate": 0.0006580000000000001,
- "loss": 0.0757,
+ "loss": 0.0758,
"macro_f1": 0.3333333432674408,
"num_tokens": 532603.0,
"repeat_count": 0.0,
- "routers_loss": 0.016592051833868027,
+ "routers_loss": 0.016184113919734955,
"skip_count": 0.0,
"step": 330,
"text_loss": 0.5980568528175354
@@ -3152,32 +3152,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.220703125,
"learning_rate": 0.000662,
- "loss": 0.0438,
+ "loss": 0.0439,
"macro_f1": 0.3333333432674408,
"num_tokens": 536056.0,
"repeat_count": 0.0,
- "routers_loss": 0.012950568459928036,
+ "routers_loss": 0.01303898449987173,
"skip_count": 0.0,
"step": 332,
"text_loss": 0.5421966314315796
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 1.5682418550044028,
- "f1_execute": 0.8799999952316284,
+ "f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.310546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.296875,
"learning_rate": 0.000666,
- "loss": 0.0964,
- "macro_f1": 0.29333335161209106,
+ "loss": 0.0963,
+ "macro_f1": 0.465986430644989,
"num_tokens": 539231.0,
"repeat_count": 3.0,
- "routers_loss": 0.3373340964317322,
+ "routers_loss": 0.3075675964355469,
"skip_count": 3.0,
"step": 334,
"text_loss": 0.19719554483890533
@@ -3190,13 +3190,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.171875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.00067,
"loss": 0.0706,
"macro_f1": 0.3333333432674408,
"num_tokens": 542038.0,
"repeat_count": 0.0,
- "routers_loss": 0.008110735565423965,
+ "routers_loss": 0.009116224013268948,
"skip_count": 0.0,
"step": 336,
"text_loss": 0.3407036066055298
@@ -3209,13 +3209,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.248046875,
+ "grad_norm": 0.2421875,
"learning_rate": 0.000674,
- "loss": 0.0771,
+ "loss": 0.0768,
"macro_f1": 0.3333333432674408,
"num_tokens": 545019.0,
"repeat_count": 0.0,
- "routers_loss": 0.01841609925031662,
+ "routers_loss": 0.021463042125105858,
"skip_count": 0.0,
"step": 338,
"text_loss": 0.24486012756824493
@@ -3228,13 +3228,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.0006780000000000001,
- "loss": 0.0894,
+ "loss": 0.0889,
"macro_f1": 0.3333333432674408,
"num_tokens": 548036.0,
"repeat_count": 0.0,
- "routers_loss": 0.01612614095211029,
+ "routers_loss": 0.01857556402683258,
"skip_count": 0.0,
"step": 340,
"text_loss": 0.28140124678611755
@@ -3247,13 +3247,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0006820000000000001,
- "loss": 0.0611,
+ "loss": 0.0617,
"macro_f1": 0.3006536364555359,
"num_tokens": 551419.0,
"repeat_count": 2.0,
- "routers_loss": 0.26202192902565,
+ "routers_loss": 0.27090007066726685,
"skip_count": 3.0,
"step": 342,
"text_loss": 0.20690307021141052
@@ -3266,13 +3266,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.3046875,
"learning_rate": 0.0006860000000000001,
- "loss": 0.1013,
+ "loss": 0.1047,
"macro_f1": 0.32098764181137085,
"num_tokens": 554037.0,
"repeat_count": 0.0,
- "routers_loss": 0.09235779196023941,
+ "routers_loss": 0.09231195598840714,
"skip_count": 2.0,
"step": 344,
"text_loss": 0.4479128420352936
@@ -3285,13 +3285,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.255859375,
"learning_rate": 0.00069,
- "loss": 0.0856,
+ "loss": 0.0883,
"macro_f1": 0.3333333432674408,
"num_tokens": 556672.0,
"repeat_count": 0.0,
- "routers_loss": 0.010735333897173405,
+ "routers_loss": 0.00935924518853426,
"skip_count": 0.0,
"step": 346,
"text_loss": 0.6377320289611816
@@ -3304,13 +3304,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.000694,
- "loss": 0.0778,
+ "loss": 0.0781,
"macro_f1": 0.32098764181137085,
"num_tokens": 559756.0,
"repeat_count": 0.0,
- "routers_loss": 0.14742356538772583,
+ "routers_loss": 0.17641772329807281,
"skip_count": 2.0,
"step": 348,
"text_loss": 0.6097636222839355
@@ -3323,13 +3323,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.30859375,
+ "grad_norm": 0.30078125,
"learning_rate": 0.0006979999999999999,
- "loss": 0.0614,
+ "loss": 0.0616,
"macro_f1": 0.5492662787437439,
"num_tokens": 563415.0,
"repeat_count": 0.0,
- "routers_loss": 0.06606879830360413,
+ "routers_loss": 0.06240406632423401,
"skip_count": 2.0,
"step": 350,
"text_loss": 0.5291631817817688
@@ -3342,13 +3342,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.322265625,
+ "grad_norm": 0.296875,
"learning_rate": 0.0007019999999999999,
- "loss": 0.1033,
+ "loss": 0.1026,
"macro_f1": 0.3333333432674408,
"num_tokens": 566357.0,
"repeat_count": 0.0,
- "routers_loss": 0.012873432599008083,
+ "routers_loss": 0.012269247323274612,
"skip_count": 0.0,
"step": 352,
"text_loss": 0.5170195698738098
@@ -3361,13 +3361,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0007059999999999999,
- "loss": 0.0819,
+ "loss": 0.0815,
"macro_f1": 0.32098764181137085,
"num_tokens": 569449.0,
"repeat_count": 0.0,
- "routers_loss": 0.07853665202856064,
+ "routers_loss": 0.07515309751033783,
"skip_count": 2.0,
"step": 354,
"text_loss": 0.34507250785827637
@@ -3380,13 +3380,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.251953125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.00071,
- "loss": 0.0804,
+ "loss": 0.0791,
"macro_f1": 0.3144654333591461,
"num_tokens": 572761.0,
"repeat_count": 1.0,
- "routers_loss": 0.2216549813747406,
+ "routers_loss": 0.20768006145954132,
"skip_count": 2.0,
"step": 356,
"text_loss": 0.3158532381057739
@@ -3399,13 +3399,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.185546875,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.000714,
- "loss": 0.0675,
+ "loss": 0.0682,
"macro_f1": 0.3333333432674408,
"num_tokens": 575909.0,
"repeat_count": 0.0,
- "routers_loss": 0.02423691377043724,
+ "routers_loss": 0.025329967960715294,
"skip_count": 0.0,
"step": 358,
"text_loss": 0.21455390751361847
@@ -3413,18 +3413,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.6903434106251836,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.21484375,
"learning_rate": 0.000718,
- "loss": 0.0781,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0775,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 579186.0,
"repeat_count": 1.0,
- "routers_loss": 0.07496294379234314,
+ "routers_loss": 0.07676175981760025,
"skip_count": 0.0,
"step": 360,
"text_loss": 0.61895352602005
@@ -3437,13 +3437,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2138671875,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000722,
- "loss": 0.0778,
+ "loss": 0.0781,
"macro_f1": 0.32098767161369324,
"num_tokens": 582437.0,
"repeat_count": 0.0,
- "routers_loss": 0.08181872963905334,
+ "routers_loss": 0.08070661872625351,
"skip_count": 1.0,
"step": 362,
"text_loss": 0.20557661354541779
@@ -3456,13 +3456,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2216796875,
"learning_rate": 0.000726,
- "loss": 0.1112,
+ "loss": 0.11,
"macro_f1": 0.3333333432674408,
"num_tokens": 586096.0,
"repeat_count": 0.0,
- "routers_loss": 0.016959719359874725,
+ "routers_loss": 0.015891313552856445,
"skip_count": 0.0,
"step": 364,
"text_loss": 0.597991943359375
@@ -3475,13 +3475,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.15625,
"learning_rate": 0.00073,
- "loss": 0.0577,
+ "loss": 0.0573,
"macro_f1": 0.3076923191547394,
"num_tokens": 589520.0,
"repeat_count": 1.0,
- "routers_loss": 0.13295969367027283,
+ "routers_loss": 0.12844261527061462,
"skip_count": 3.0,
"step": 366,
"text_loss": 0.2944789230823517
@@ -3494,13 +3494,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1455078125,
+ "grad_norm": 0.150390625,
"learning_rate": 0.000734,
- "loss": 0.0986,
+ "loss": 0.1005,
"macro_f1": 0.3333333432674408,
"num_tokens": 592691.0,
"repeat_count": 0.0,
- "routers_loss": 0.02476893551647663,
+ "routers_loss": 0.02382199838757515,
"skip_count": 0.0,
"step": 368,
"text_loss": 0.23989969491958618
@@ -3513,13 +3513,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1796875,
"learning_rate": 0.000738,
- "loss": 0.0682,
+ "loss": 0.0661,
"macro_f1": 0.3333333432674408,
"num_tokens": 596004.0,
"repeat_count": 0.0,
- "routers_loss": 0.019863395020365715,
+ "routers_loss": 0.018812084570527077,
"skip_count": 0.0,
"step": 370,
"text_loss": 0.22111408412456512
@@ -3532,13 +3532,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.000742,
- "loss": 0.0663,
+ "loss": 0.0666,
"macro_f1": 0.3272727429866791,
"num_tokens": 599087.0,
"repeat_count": 0.0,
- "routers_loss": 0.07230417430400848,
+ "routers_loss": 0.08290331065654755,
"skip_count": 1.0,
"step": 372,
"text_loss": 0.2567356526851654
@@ -3551,13 +3551,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.000746,
- "loss": 0.0986,
+ "loss": 0.0941,
"macro_f1": 0.32098764181137085,
"num_tokens": 602330.0,
"repeat_count": 1.0,
- "routers_loss": 0.11727793514728546,
+ "routers_loss": 0.11482042074203491,
"skip_count": 1.0,
"step": 374,
"text_loss": 0.7217292785644531
@@ -3570,13 +3570,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.224609375,
+ "grad_norm": 0.2265625,
"learning_rate": 0.00075,
- "loss": 0.0724,
+ "loss": 0.0728,
"macro_f1": 0.3272727429866791,
"num_tokens": 605503.0,
"repeat_count": 1.0,
- "routers_loss": 0.13495951890945435,
+ "routers_loss": 0.11849870532751083,
"skip_count": 0.0,
"step": 376,
"text_loss": 0.5122153759002686
@@ -3589,13 +3589,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23046875,
+ "grad_norm": 0.2333984375,
"learning_rate": 0.000754,
- "loss": 0.0823,
+ "loss": 0.0835,
"macro_f1": 0.32098767161369324,
"num_tokens": 608505.0,
"repeat_count": 0.0,
- "routers_loss": 0.07612533867359161,
+ "routers_loss": 0.07090992480516434,
"skip_count": 1.0,
"step": 378,
"text_loss": 0.2204965502023697
@@ -3608,13 +3608,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.1826171875,
"learning_rate": 0.000758,
- "loss": 0.0803,
+ "loss": 0.0794,
"macro_f1": 0.3272727429866791,
"num_tokens": 611193.0,
"repeat_count": 0.0,
- "routers_loss": 0.0484120175242424,
+ "routers_loss": 0.03812089189887047,
"skip_count": 1.0,
"step": 380,
"text_loss": 0.44909021258354187
@@ -3627,13 +3627,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.000762,
- "loss": 0.0866,
+ "loss": 0.0882,
"macro_f1": 0.3272727429866791,
"num_tokens": 614231.0,
"repeat_count": 1.0,
- "routers_loss": 0.10939671844244003,
+ "routers_loss": 0.10270529240369797,
"skip_count": 0.0,
"step": 382,
"text_loss": 0.13624964654445648
@@ -3646,13 +3646,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.326171875,
+ "grad_norm": 0.330078125,
"learning_rate": 0.0007660000000000001,
- "loss": 0.1083,
+ "loss": 0.1107,
"macro_f1": 0.32098764181137085,
"num_tokens": 617090.0,
"repeat_count": 1.0,
- "routers_loss": 0.11382336914539337,
+ "routers_loss": 0.11624004691839218,
"skip_count": 1.0,
"step": 384,
"text_loss": 0.7314052581787109
@@ -3667,11 +3667,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1396484375,
"learning_rate": 0.0007700000000000001,
- "loss": 0.0616,
+ "loss": 0.0628,
"macro_f1": 0.32098764181137085,
"num_tokens": 620596.0,
"repeat_count": 0.0,
- "routers_loss": 0.07494530081748962,
+ "routers_loss": 0.07114322483539581,
"skip_count": 2.0,
"step": 386,
"text_loss": 0.503322958946228
@@ -3684,13 +3684,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.298828125,
+ "grad_norm": 0.306640625,
"learning_rate": 0.0007740000000000001,
- "loss": 0.0816,
+ "loss": 0.0829,
"macro_f1": 0.32098764181137085,
"num_tokens": 624108.0,
"repeat_count": 0.0,
- "routers_loss": 0.05718417093157768,
+ "routers_loss": 0.06061873584985733,
"skip_count": 2.0,
"step": 388,
"text_loss": 0.11481904983520508
@@ -3703,13 +3703,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.2099609375,
"learning_rate": 0.000778,
- "loss": 0.0783,
+ "loss": 0.0791,
"macro_f1": 0.3006536364555359,
"num_tokens": 626895.0,
"repeat_count": 1.0,
- "routers_loss": 0.2848989963531494,
+ "routers_loss": 0.2921771705150604,
"skip_count": 4.0,
"step": 390,
"text_loss": 0.3069624602794647
@@ -3722,13 +3722,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.30078125,
+ "grad_norm": 0.30859375,
"learning_rate": 0.000782,
- "loss": 0.0608,
+ "loss": 0.0605,
"macro_f1": 0.3076923191547394,
"num_tokens": 630204.0,
"repeat_count": 0.0,
- "routers_loss": 0.2050076276063919,
+ "routers_loss": 0.202707901597023,
"skip_count": 4.0,
"step": 392,
"text_loss": 0.6022785305976868
@@ -3741,13 +3741,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.29296875,
"learning_rate": 0.000786,
- "loss": 0.0863,
+ "loss": 0.0877,
"macro_f1": 0.3333333432674408,
"num_tokens": 634373.0,
"repeat_count": 0.0,
- "routers_loss": 0.020946886390447617,
+ "routers_loss": 0.0221510399132967,
"skip_count": 0.0,
"step": 394,
"text_loss": 0.26787394285202026
@@ -3760,13 +3760,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.376953125,
+ "grad_norm": 0.37890625,
"learning_rate": 0.00079,
- "loss": 0.0798,
+ "loss": 0.0805,
"macro_f1": 0.32098764181137085,
"num_tokens": 637442.0,
"repeat_count": 2.0,
- "routers_loss": 0.1270289123058319,
+ "routers_loss": 0.12636390328407288,
"skip_count": 0.0,
"step": 396,
"text_loss": 0.2799781560897827
@@ -3779,13 +3779,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.0007940000000000001,
- "loss": 0.0701,
+ "loss": 0.0724,
"macro_f1": 0.32098764181137085,
"num_tokens": 641231.0,
"repeat_count": 0.0,
- "routers_loss": 0.08012636005878448,
+ "routers_loss": 0.07933453470468521,
"skip_count": 2.0,
"step": 398,
"text_loss": 0.2507784366607666
@@ -3798,13 +3798,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.0007980000000000001,
- "loss": 0.0901,
+ "loss": 0.0909,
"macro_f1": 0.3272727429866791,
"num_tokens": 644560.0,
"repeat_count": 1.0,
- "routers_loss": 0.09315784275531769,
+ "routers_loss": 0.10324911028146744,
"skip_count": 0.0,
"step": 400,
"text_loss": 0.7756280303001404
@@ -3817,13 +3817,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0008020000000000001,
- "loss": 0.078,
+ "loss": 0.0783,
"macro_f1": 0.3144654333591461,
"num_tokens": 647393.0,
"repeat_count": 1.0,
- "routers_loss": 0.18492189049720764,
+ "routers_loss": 0.18546262383460999,
"skip_count": 2.0,
"step": 402,
"text_loss": 0.5013328194618225
@@ -3836,13 +3836,13 @@
"f1_execute": 0.8571428656578064,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0008060000000000001,
- "loss": 0.0801,
+ "loss": 0.0787,
"macro_f1": 0.2857142984867096,
"num_tokens": 650355.0,
"repeat_count": 3.0,
- "routers_loss": 0.32641324400901794,
+ "routers_loss": 0.3280293643474579,
"skip_count": 4.0,
"step": 404,
"text_loss": 0.2842077314853668
@@ -3855,13 +3855,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2080078125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.0008100000000000001,
- "loss": 0.0905,
+ "loss": 0.0901,
"macro_f1": 0.3333333432674408,
"num_tokens": 654280.0,
"repeat_count": 0.0,
- "routers_loss": 0.02722037397325039,
+ "routers_loss": 0.02623247355222702,
"skip_count": 0.0,
"step": 406,
"text_loss": 0.46742817759513855
@@ -3874,13 +3874,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.216796875,
"learning_rate": 0.0008139999999999999,
- "loss": 0.0958,
+ "loss": 0.0945,
"macro_f1": 0.3333333432674408,
"num_tokens": 657568.0,
"repeat_count": 0.0,
- "routers_loss": 0.010129833593964577,
+ "routers_loss": 0.009744114242494106,
"skip_count": 0.0,
"step": 408,
"text_loss": 0.7168047428131104
@@ -3893,13 +3893,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2373046875,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.0008179999999999999,
- "loss": 0.1084,
+ "loss": 0.1065,
"macro_f1": 0.32098764181137085,
"num_tokens": 660593.0,
"repeat_count": 0.0,
- "routers_loss": 0.07298308610916138,
+ "routers_loss": 0.07591600716114044,
"skip_count": 2.0,
"step": 410,
"text_loss": 0.449823260307312
@@ -3912,13 +3912,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15625,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0008219999999999999,
- "loss": 0.0802,
+ "loss": 0.0795,
"macro_f1": 0.3333333432674408,
"num_tokens": 663916.0,
"repeat_count": 0.0,
- "routers_loss": 0.024257874116301537,
+ "routers_loss": 0.02076602540910244,
"skip_count": 0.0,
"step": 412,
"text_loss": 0.4764713943004608
@@ -3931,13 +3931,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.000826,
- "loss": 0.0842,
+ "loss": 0.0836,
"macro_f1": 0.3272727429866791,
"num_tokens": 667502.0,
"repeat_count": 0.0,
- "routers_loss": 0.048864223062992096,
+ "routers_loss": 0.049170155078172684,
"skip_count": 1.0,
"step": 414,
"text_loss": 0.30333325266838074
@@ -3950,13 +3950,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1513671875,
"learning_rate": 0.00083,
- "loss": 0.1026,
+ "loss": 0.1021,
"macro_f1": 0.3272727429866791,
"num_tokens": 670510.0,
"repeat_count": 1.0,
- "routers_loss": 0.1592330038547516,
+ "routers_loss": 0.15554003417491913,
"skip_count": 0.0,
"step": 416,
"text_loss": 0.3691870868206024
@@ -3969,13 +3969,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000834,
- "loss": 0.0963,
+ "loss": 0.1013,
"macro_f1": 0.3333333432674408,
"num_tokens": 674761.0,
"repeat_count": 0.0,
- "routers_loss": 0.02291976846754551,
+ "routers_loss": 0.024516675621271133,
"skip_count": 0.0,
"step": 418,
"text_loss": 0.32850381731987
@@ -3988,13 +3988,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.000838,
- "loss": 0.0634,
+ "loss": 0.0649,
"macro_f1": 0.3333333432674408,
"num_tokens": 678055.0,
"repeat_count": 0.0,
- "routers_loss": 0.010272650048136711,
+ "routers_loss": 0.011026890948414803,
"skip_count": 0.0,
"step": 420,
"text_loss": 0.6637290716171265
@@ -4007,13 +4007,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000842,
- "loss": 0.0786,
+ "loss": 0.0771,
"macro_f1": 0.3272727429866791,
"num_tokens": 680979.0,
"repeat_count": 0.0,
- "routers_loss": 0.0692613497376442,
+ "routers_loss": 0.07451887428760529,
"skip_count": 1.0,
"step": 422,
"text_loss": 0.27131685614585876
@@ -4026,13 +4026,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12890625,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.000846,
- "loss": 0.0706,
+ "loss": 0.0714,
"macro_f1": 0.32098764181137085,
"num_tokens": 684144.0,
"repeat_count": 1.0,
- "routers_loss": 0.12713804841041565,
+ "routers_loss": 0.11341800540685654,
"skip_count": 1.0,
"step": 424,
"text_loss": 0.652126669883728
@@ -4045,13 +4045,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.00085,
- "loss": 0.0758,
+ "loss": 0.0754,
"macro_f1": 0.3272727429866791,
"num_tokens": 687004.0,
"repeat_count": 1.0,
- "routers_loss": 0.08670130372047424,
+ "routers_loss": 0.08985847979784012,
"skip_count": 0.0,
"step": 426,
"text_loss": 0.2589428424835205
@@ -4064,13 +4064,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.23828125,
"learning_rate": 0.000854,
- "loss": 0.0857,
+ "loss": 0.0866,
"macro_f1": 0.3333333432674408,
"num_tokens": 689702.0,
"repeat_count": 0.0,
- "routers_loss": 0.01053862925618887,
+ "routers_loss": 0.011355436407029629,
"skip_count": 0.0,
"step": 428,
"text_loss": 0.8909716010093689
@@ -4083,13 +4083,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.000858,
- "loss": 0.0615,
+ "loss": 0.0623,
"macro_f1": 0.3333333432674408,
"num_tokens": 692698.0,
"repeat_count": 0.0,
- "routers_loss": 0.012946994043886662,
+ "routers_loss": 0.013788948766887188,
"skip_count": 0.0,
"step": 430,
"text_loss": 0.19141142070293427
@@ -4102,13 +4102,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.000862,
- "loss": 0.0498,
+ "loss": 0.0499,
"macro_f1": 0.32098764181137085,
"num_tokens": 696007.0,
"repeat_count": 0.0,
- "routers_loss": 0.08222822099924088,
+ "routers_loss": 0.07998392730951309,
"skip_count": 2.0,
"step": 432,
"text_loss": 0.1611809879541397
@@ -4121,13 +4121,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.173828125,
"learning_rate": 0.000866,
- "loss": 0.0532,
+ "loss": 0.0541,
"macro_f1": 0.32098764181137085,
"num_tokens": 700271.0,
"repeat_count": 0.0,
- "routers_loss": 0.07086442410945892,
+ "routers_loss": 0.06988382339477539,
"skip_count": 2.0,
"step": 434,
"text_loss": 0.37254223227500916
@@ -4140,13 +4140,13 @@
"f1_execute": 0.8333333730697632,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.00087,
- "loss": 0.0825,
+ "loss": 0.0834,
"macro_f1": 0.2777777910232544,
"num_tokens": 703519.0,
"repeat_count": 3.0,
- "routers_loss": 0.29007306694984436,
+ "routers_loss": 0.28240787982940674,
"skip_count": 5.0,
"step": 436,
"text_loss": 0.29636648297309875
@@ -4159,13 +4159,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.396484375,
+ "grad_norm": 0.423828125,
"learning_rate": 0.000874,
- "loss": 0.0658,
+ "loss": 0.0657,
"macro_f1": 0.3333333432674408,
"num_tokens": 706826.0,
"repeat_count": 0.0,
- "routers_loss": 0.014652491547167301,
+ "routers_loss": 0.013924967497587204,
"skip_count": 0.0,
"step": 438,
"text_loss": 0.20867908000946045
@@ -4178,13 +4178,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000878,
- "loss": 0.0685,
+ "loss": 0.0657,
"macro_f1": 0.3333333432674408,
"num_tokens": 710530.0,
"repeat_count": 0.0,
- "routers_loss": 0.013720969669520855,
+ "routers_loss": 0.01170142088085413,
"skip_count": 0.0,
"step": 440,
"text_loss": 0.7273373007774353
@@ -4197,13 +4197,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.173828125,
+ "grad_norm": 0.171875,
"learning_rate": 0.000882,
- "loss": 0.0771,
+ "loss": 0.076,
"macro_f1": 0.3333333432674408,
"num_tokens": 713503.0,
"repeat_count": 0.0,
- "routers_loss": 0.011687638238072395,
+ "routers_loss": 0.011930872686207294,
"skip_count": 0.0,
"step": 442,
"text_loss": 0.39314430952072144
@@ -4216,13 +4216,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0008860000000000001,
- "loss": 0.0604,
+ "loss": 0.0592,
"macro_f1": 0.3333333432674408,
"num_tokens": 716582.0,
"repeat_count": 0.0,
- "routers_loss": 0.007869532331824303,
+ "routers_loss": 0.008630385622382164,
"skip_count": 0.0,
"step": 444,
"text_loss": 0.5925271511077881
@@ -4230,18 +4230,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.0939242735544465,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.23046875,
"learning_rate": 0.0008900000000000001,
- "loss": 0.0797,
- "macro_f1": 0.3076923191547394,
+ "loss": 0.0811,
+ "macro_f1": 0.3006536066532135,
"num_tokens": 719941.0,
"repeat_count": 3.0,
- "routers_loss": 0.3034668564796448,
+ "routers_loss": 0.3015584945678711,
"skip_count": 1.0,
"step": 446,
"text_loss": 0.5059905052185059
@@ -4254,13 +4254,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2314453125,
+ "grad_norm": 0.203125,
"learning_rate": 0.000894,
- "loss": 0.0823,
+ "loss": 0.0822,
"macro_f1": 0.31446540355682373,
"num_tokens": 723113.0,
"repeat_count": 1.0,
- "routers_loss": 0.11066079139709473,
+ "routers_loss": 0.10897493362426758,
"skip_count": 1.0,
"step": 448,
"text_loss": 0.19616436958312988
@@ -4273,13 +4273,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.33984375,
"learning_rate": 0.000898,
- "loss": 0.0773,
+ "loss": 0.0782,
"macro_f1": 0.32098764181137085,
"num_tokens": 726193.0,
"repeat_count": 0.0,
- "routers_loss": 0.0755370482802391,
+ "routers_loss": 0.07236456125974655,
"skip_count": 2.0,
"step": 450,
"text_loss": 0.1773054152727127
@@ -4292,13 +4292,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.3203125,
"learning_rate": 0.000902,
- "loss": 0.0596,
+ "loss": 0.058,
"macro_f1": 0.3272727429866791,
"num_tokens": 729275.0,
"repeat_count": 1.0,
- "routers_loss": 0.08470689505338669,
+ "routers_loss": 0.08184371143579483,
"skip_count": 0.0,
"step": 452,
"text_loss": 0.4927310049533844
@@ -4311,13 +4311,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19921875,
+ "grad_norm": 0.1953125,
"learning_rate": 0.000906,
- "loss": 0.0608,
+ "loss": 0.0607,
"macro_f1": 0.3333333432674408,
"num_tokens": 731948.0,
"repeat_count": 0.0,
- "routers_loss": 0.0130238626152277,
+ "routers_loss": 0.014033539220690727,
"skip_count": 0.0,
"step": 454,
"text_loss": 0.4745742678642273
@@ -4330,13 +4330,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.00091,
- "loss": 0.0652,
+ "loss": 0.0651,
"macro_f1": 0.3333333432674408,
"num_tokens": 735351.0,
"repeat_count": 0.0,
- "routers_loss": 0.007108641788363457,
+ "routers_loss": 0.0071774693205952644,
"skip_count": 0.0,
"step": 456,
"text_loss": 0.18523462116718292
@@ -4351,11 +4351,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.400390625,
"learning_rate": 0.0009140000000000001,
- "loss": 0.0746,
+ "loss": 0.0738,
"macro_f1": 0.5492662787437439,
"num_tokens": 738587.0,
"repeat_count": 0.0,
- "routers_loss": 0.06834109872579575,
+ "routers_loss": 0.07781517505645752,
"skip_count": 2.0,
"step": 458,
"text_loss": 0.3459635376930237
@@ -4368,13 +4368,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.28125,
"learning_rate": 0.0009180000000000001,
- "loss": 0.0733,
+ "loss": 0.0723,
"macro_f1": 0.3076923191547394,
"num_tokens": 741779.0,
"repeat_count": 0.0,
- "routers_loss": 0.10230778902769089,
+ "routers_loss": 0.09529037028551102,
"skip_count": 2.0,
"step": 460,
"text_loss": 0.20197433233261108
@@ -4387,13 +4387,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.1865234375,
"learning_rate": 0.0009220000000000001,
- "loss": 0.0528,
+ "loss": 0.0519,
"macro_f1": 0.3333333432674408,
"num_tokens": 745355.0,
"repeat_count": 0.0,
- "routers_loss": 0.009987542405724525,
+ "routers_loss": 0.009765669703483582,
"skip_count": 0.0,
"step": 462,
"text_loss": 0.7031404376029968
@@ -4406,13 +4406,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.125,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009260000000000001,
- "loss": 0.0536,
+ "loss": 0.0527,
"macro_f1": 0.3272727429866791,
"num_tokens": 748628.0,
"repeat_count": 0.0,
- "routers_loss": 0.03448869287967682,
+ "routers_loss": 0.03344850242137909,
"skip_count": 1.0,
"step": 464,
"text_loss": 0.21274663507938385
@@ -4425,13 +4425,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.173828125,
"learning_rate": 0.00093,
- "loss": 0.053,
+ "loss": 0.0534,
"macro_f1": 0.3076923191547394,
"num_tokens": 751472.0,
"repeat_count": 2.0,
- "routers_loss": 0.13631699979305267,
+ "routers_loss": 0.1354292333126068,
"skip_count": 2.0,
"step": 466,
"text_loss": 0.5350717306137085
@@ -4444,13 +4444,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.142578125,
"learning_rate": 0.000934,
- "loss": 0.06,
+ "loss": 0.0598,
"macro_f1": 0.3272727429866791,
"num_tokens": 754479.0,
"repeat_count": 0.0,
- "routers_loss": 0.053951870650053024,
+ "routers_loss": 0.056420840322971344,
"skip_count": 1.0,
"step": 468,
"text_loss": 0.28153330087661743
@@ -4463,13 +4463,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.228515625,
+ "grad_norm": 0.234375,
"learning_rate": 0.0009379999999999999,
- "loss": 0.059,
+ "loss": 0.0597,
"macro_f1": 0.31446540355682373,
"num_tokens": 757872.0,
"repeat_count": 1.0,
- "routers_loss": 0.14479905366897583,
+ "routers_loss": 0.1622387170791626,
"skip_count": 1.0,
"step": 470,
"text_loss": 0.22956843674182892
@@ -4482,13 +4482,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.44140625,
+ "grad_norm": 0.5,
"learning_rate": 0.000942,
- "loss": 0.0913,
+ "loss": 0.0953,
"macro_f1": 0.32098764181137085,
"num_tokens": 760468.0,
"repeat_count": 0.0,
- "routers_loss": 0.056221429258584976,
+ "routers_loss": 0.05146972835063934,
"skip_count": 2.0,
"step": 472,
"text_loss": 0.4513966739177704
@@ -4501,13 +4501,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000946,
- "loss": 0.0591,
+ "loss": 0.0592,
"macro_f1": 0.3272727429866791,
"num_tokens": 763519.0,
"repeat_count": 1.0,
- "routers_loss": 0.09729792177677155,
+ "routers_loss": 0.09022669494152069,
"skip_count": 0.0,
"step": 474,
"text_loss": 0.25758957862854004
@@ -4520,13 +4520,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12158203125,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.00095,
- "loss": 0.0496,
+ "loss": 0.0498,
"macro_f1": 0.3272727429866791,
"num_tokens": 767391.0,
"repeat_count": 0.0,
- "routers_loss": 0.029447713866829872,
+ "routers_loss": 0.03044828027486801,
"skip_count": 1.0,
"step": 476,
"text_loss": 0.21366681158542633
@@ -4539,13 +4539,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.271484375,
+ "grad_norm": 0.291015625,
"learning_rate": 0.000954,
- "loss": 0.0801,
+ "loss": 0.0802,
"macro_f1": 0.3272727429866791,
"num_tokens": 770338.0,
"repeat_count": 0.0,
- "routers_loss": 0.09337342530488968,
+ "routers_loss": 0.10397060960531235,
"skip_count": 1.0,
"step": 478,
"text_loss": 1.0396177768707275
@@ -4560,11 +4560,11 @@
"f1_skip": 0.0,
"grad_norm": 0.267578125,
"learning_rate": 0.000958,
- "loss": 0.1102,
+ "loss": 0.1099,
"macro_f1": 0.285714328289032,
"num_tokens": 773699.0,
"repeat_count": 2.0,
- "routers_loss": 0.23193210363388062,
+ "routers_loss": 0.22604143619537354,
"skip_count": 4.0,
"step": 480,
"text_loss": 0.2570283114910126
@@ -4572,18 +4572,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.2629879659524508,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.146484375,
"learning_rate": 0.000962,
- "loss": 0.0669,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0667,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 777473.0,
"repeat_count": 0.0,
- "routers_loss": 0.046257760375738144,
+ "routers_loss": 0.048258859664201736,
"skip_count": 1.0,
"step": 482,
"text_loss": 0.2540103495121002
@@ -4596,13 +4596,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1708984375,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000966,
- "loss": 0.0552,
+ "loss": 0.0592,
"macro_f1": 0.3333333432674408,
"num_tokens": 780833.0,
"repeat_count": 0.0,
- "routers_loss": 0.01683143898844719,
+ "routers_loss": 0.023018671199679375,
"skip_count": 0.0,
"step": 484,
"text_loss": 0.38524550199508667
@@ -4615,13 +4615,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.326171875,
+ "grad_norm": 0.314453125,
"learning_rate": 0.0009699999999999999,
- "loss": 0.071,
+ "loss": 0.0709,
"macro_f1": 0.3272727429866791,
"num_tokens": 783656.0,
"repeat_count": 0.0,
- "routers_loss": 0.04129387438297272,
+ "routers_loss": 0.044845327734947205,
"skip_count": 1.0,
"step": 486,
"text_loss": 0.5859048366546631
@@ -4634,13 +4634,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000974,
- "loss": 0.0605,
+ "loss": 0.0615,
"macro_f1": 0.3333333432674408,
"num_tokens": 787173.0,
"repeat_count": 0.0,
- "routers_loss": 0.01262948103249073,
+ "routers_loss": 0.010898692533373833,
"skip_count": 0.0,
"step": 488,
"text_loss": 0.3456067442893982
@@ -4653,13 +4653,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000978,
- "loss": 0.081,
+ "loss": 0.0796,
"macro_f1": 0.32098764181137085,
"num_tokens": 790395.0,
"repeat_count": 0.0,
- "routers_loss": 0.07404553890228271,
+ "routers_loss": 0.06497956812381744,
"skip_count": 2.0,
"step": 490,
"text_loss": 0.3751123249530792
@@ -4672,13 +4672,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000982,
- "loss": 0.0751,
+ "loss": 0.0772,
"macro_f1": 0.3272727429866791,
"num_tokens": 793137.0,
"repeat_count": 0.0,
- "routers_loss": 0.06795930862426758,
+ "routers_loss": 0.07763728499412537,
"skip_count": 1.0,
"step": 492,
"text_loss": 0.43296709656715393
@@ -4691,13 +4691,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1416015625,
"learning_rate": 0.0009860000000000001,
- "loss": 0.0804,
+ "loss": 0.0819,
"macro_f1": 0.3333333432674408,
"num_tokens": 796497.0,
"repeat_count": 0.0,
- "routers_loss": 0.02233024686574936,
+ "routers_loss": 0.02127906307578087,
"skip_count": 0.0,
"step": 494,
"text_loss": 0.4841311275959015
@@ -4710,13 +4710,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.00099,
- "loss": 0.0731,
+ "loss": 0.073,
"macro_f1": 0.3272727429866791,
"num_tokens": 799361.0,
"repeat_count": 1.0,
- "routers_loss": 0.07979031652212143,
+ "routers_loss": 0.09518691152334213,
"skip_count": 0.0,
"step": 496,
"text_loss": 0.5094487071037292
@@ -4729,13 +4729,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.000994,
- "loss": 0.0795,
+ "loss": 0.0789,
"macro_f1": 0.5492662787437439,
"num_tokens": 802629.0,
"repeat_count": 0.0,
- "routers_loss": 0.045646365731954575,
+ "routers_loss": 0.0563947930932045,
"skip_count": 2.0,
"step": 498,
"text_loss": 0.42783617973327637
@@ -4748,13 +4748,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.1865234375,
"learning_rate": 0.000998,
"loss": 0.0476,
"macro_f1": 0.3272727429866791,
"num_tokens": 805881.0,
"repeat_count": 1.0,
- "routers_loss": 0.09717849642038345,
+ "routers_loss": 0.10570426285266876,
"skip_count": 0.0,
"step": 500,
"text_loss": 0.28395503759384155
@@ -4767,13 +4767,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.30078125,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009999999760498814,
- "loss": 0.0894,
+ "loss": 0.0849,
"macro_f1": 0.5492662787437439,
"num_tokens": 809283.0,
"repeat_count": 0.0,
- "routers_loss": 0.03948225453495979,
+ "routers_loss": 0.031202208250761032,
"skip_count": 2.0,
"step": 502,
"text_loss": 0.32970911264419556
@@ -4786,13 +4786,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15625,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009999997844489475,
- "loss": 0.0557,
+ "loss": 0.0574,
"macro_f1": 0.3272727429866791,
"num_tokens": 812440.0,
"repeat_count": 0.0,
- "routers_loss": 0.0742638111114502,
+ "routers_loss": 0.07647835463285446,
"skip_count": 1.0,
"step": 504,
"text_loss": 0.4901447296142578
@@ -4805,13 +4805,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.25,
"learning_rate": 0.000999999401247153,
- "loss": 0.0682,
+ "loss": 0.0668,
"macro_f1": 0.32098764181137085,
"num_tokens": 815716.0,
"repeat_count": 0.0,
- "routers_loss": 0.08293049037456512,
+ "routers_loss": 0.08515176922082901,
"skip_count": 2.0,
"step": 506,
"text_loss": 0.6157599687576294
@@ -4824,13 +4824,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.25390625,
"learning_rate": 0.0009999988264446445,
- "loss": 0.0697,
+ "loss": 0.0686,
"macro_f1": 0.3333333432674408,
"num_tokens": 819086.0,
"repeat_count": 0.0,
- "routers_loss": 0.010080376639962196,
+ "routers_loss": 0.00946938619017601,
"skip_count": 0.0,
"step": 508,
"text_loss": 0.5053519010543823
@@ -4843,13 +4843,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009999980600416424,
- "loss": 0.0611,
+ "loss": 0.0574,
"macro_f1": 0.3333333432674408,
"num_tokens": 822268.0,
"repeat_count": 0.0,
- "routers_loss": 0.009179878048598766,
+ "routers_loss": 0.01058756373822689,
"skip_count": 0.0,
"step": 510,
"text_loss": 0.5570021867752075
@@ -4862,13 +4862,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11083984375,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.000999997102038441,
- "loss": 0.0689,
+ "loss": 0.0678,
"macro_f1": 0.3333333432674408,
"num_tokens": 825728.0,
"repeat_count": 0.0,
- "routers_loss": 0.006718529388308525,
+ "routers_loss": 0.008705209009349346,
"skip_count": 0.0,
"step": 512,
"text_loss": 0.6519040465354919
@@ -4881,13 +4881,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.220703125,
"learning_rate": 0.0009999959524354064,
- "loss": 0.0826,
+ "loss": 0.083,
"macro_f1": 0.3272727429866791,
"num_tokens": 829459.0,
"repeat_count": 0.0,
- "routers_loss": 0.049344487488269806,
+ "routers_loss": 0.04024193435907364,
"skip_count": 1.0,
"step": 514,
"text_loss": 0.5290043950080872
@@ -4900,13 +4900,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.259765625,
+ "grad_norm": 0.25390625,
"learning_rate": 0.00099999461123298,
- "loss": 0.0739,
+ "loss": 0.0727,
"macro_f1": 0.3333333432674408,
"num_tokens": 832291.0,
"repeat_count": 0.0,
- "routers_loss": 0.013402626849710941,
+ "routers_loss": 0.015742862597107887,
"skip_count": 0.0,
"step": 516,
"text_loss": 0.7910057902336121
@@ -4919,13 +4919,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.000999993078431675,
- "loss": 0.0761,
+ "loss": 0.0759,
"macro_f1": 0.3076923191547394,
"num_tokens": 835399.0,
"repeat_count": 1.0,
- "routers_loss": 0.16964484751224518,
+ "routers_loss": 0.16753782331943512,
"skip_count": 3.0,
"step": 518,
"text_loss": 0.45196083188056946
@@ -4938,13 +4938,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.236328125,
"learning_rate": 0.0009999913540320792,
- "loss": 0.095,
+ "loss": 0.0968,
"macro_f1": 0.31446540355682373,
"num_tokens": 838993.0,
"repeat_count": 0.0,
- "routers_loss": 0.08609295636415482,
+ "routers_loss": 0.09357143193483353,
"skip_count": 2.0,
"step": 520,
"text_loss": 0.5499435663223267
@@ -4957,13 +4957,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.2392578125,
+ "grad_norm": 0.2451171875,
"learning_rate": 0.0009999894380348536,
- "loss": 0.0816,
+ "loss": 0.0821,
"macro_f1": 0.5492662787437439,
"num_tokens": 842652.0,
"repeat_count": 0.0,
- "routers_loss": 0.05354784056544304,
+ "routers_loss": 0.056803856045007706,
"skip_count": 2.0,
"step": 522,
"text_loss": 0.197520449757576
@@ -4976,13 +4976,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.2333984375,
"learning_rate": 0.000999987330440732,
- "loss": 0.0715,
+ "loss": 0.0725,
"macro_f1": 0.4871794879436493,
"num_tokens": 847061.0,
"repeat_count": 0.0,
- "routers_loss": 0.09146631509065628,
+ "routers_loss": 0.08962195366621017,
"skip_count": 3.0,
"step": 524,
"text_loss": 0.27509039640426636
@@ -4995,13 +4995,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.189453125,
"learning_rate": 0.000999985031250522,
- "loss": 0.0574,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 850780.0,
"repeat_count": 0.0,
- "routers_loss": 0.02344255894422531,
+ "routers_loss": 0.022930558770895004,
"skip_count": 0.0,
"step": 526,
"text_loss": 0.13291706144809723
@@ -5014,13 +5014,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.197265625,
"learning_rate": 0.0009999825404651053,
- "loss": 0.0621,
+ "loss": 0.0614,
"macro_f1": 0.3333333432674408,
"num_tokens": 853886.0,
"repeat_count": 0.0,
- "routers_loss": 0.018271517008543015,
+ "routers_loss": 0.017097990959882736,
"skip_count": 0.0,
"step": 528,
"text_loss": 0.21706295013427734
@@ -5033,13 +5033,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2060546875,
+ "grad_norm": 0.212890625,
"learning_rate": 0.0009999798580854356,
- "loss": 0.0717,
+ "loss": 0.0724,
"macro_f1": 0.3333333432674408,
"num_tokens": 857364.0,
"repeat_count": 0.0,
- "routers_loss": 0.026990914717316628,
+ "routers_loss": 0.02831801027059555,
"skip_count": 0.0,
"step": 530,
"text_loss": 0.9035662412643433
@@ -5052,13 +5052,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16015625,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.000999976984112541,
- "loss": 0.0681,
+ "loss": 0.0674,
"macro_f1": 0.3333333432674408,
"num_tokens": 860661.0,
"repeat_count": 0.0,
- "routers_loss": 0.019737249240279198,
+ "routers_loss": 0.019671892747282982,
"skip_count": 0.0,
"step": 532,
"text_loss": 0.8354863524436951
@@ -5071,13 +5071,13 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.2890625,
"learning_rate": 0.0009999739185475231,
- "loss": 0.0978,
+ "loss": 0.0963,
"macro_f1": 0.47333335876464844,
"num_tokens": 864124.0,
"repeat_count": 2.0,
- "routers_loss": 0.212640181183815,
+ "routers_loss": 0.21383361518383026,
"skip_count": 3.0,
"step": 534,
"text_loss": 0.23422949016094208
@@ -5090,13 +5090,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0009999706613915565,
- "loss": 0.0602,
+ "loss": 0.0598,
"macro_f1": 0.32098767161369324,
"num_tokens": 866976.0,
"repeat_count": 0.0,
- "routers_loss": 0.07302755117416382,
+ "routers_loss": 0.07158871740102768,
"skip_count": 1.0,
"step": 536,
"text_loss": 0.11800774186849594
@@ -5109,13 +5109,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.296875,
+ "grad_norm": 0.26953125,
"learning_rate": 0.0009999672126458894,
- "loss": 0.0825,
+ "loss": 0.0822,
"macro_f1": 0.3272727429866791,
"num_tokens": 870549.0,
"repeat_count": 0.0,
- "routers_loss": 0.08667246252298355,
+ "routers_loss": 0.08185924589633942,
"skip_count": 1.0,
"step": 538,
"text_loss": 0.19232480227947235
@@ -5128,13 +5128,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1318359375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.000999963572311843,
- "loss": 0.0597,
+ "loss": 0.0604,
"macro_f1": 0.3333333432674408,
"num_tokens": 873733.0,
"repeat_count": 0.0,
- "routers_loss": 0.015047167427837849,
+ "routers_loss": 0.01633382774889469,
"skip_count": 0.0,
"step": 540,
"text_loss": 0.3725031912326813
@@ -5147,13 +5147,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009999597403908128,
- "loss": 0.076,
+ "loss": 0.0761,
"macro_f1": 0.3272727429866791,
"num_tokens": 877099.0,
"repeat_count": 0.0,
- "routers_loss": 0.07481446117162704,
+ "routers_loss": 0.0782657191157341,
"skip_count": 1.0,
"step": 542,
"text_loss": 0.17589199542999268
@@ -5166,13 +5166,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.0009999557168842669,
- "loss": 0.0724,
+ "loss": 0.0716,
"macro_f1": 0.5492662787437439,
"num_tokens": 879883.0,
"repeat_count": 0.0,
- "routers_loss": 0.049495212733745575,
+ "routers_loss": 0.05275818333029747,
"skip_count": 2.0,
"step": 544,
"text_loss": 0.26448264718055725
@@ -5185,13 +5185,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0009999515017937468,
- "loss": 0.0718,
+ "loss": 0.071,
"macro_f1": 0.32098764181137085,
"num_tokens": 882223.0,
"repeat_count": 0.0,
- "routers_loss": 0.08043002337217331,
+ "routers_loss": 0.09335892647504807,
"skip_count": 2.0,
"step": 546,
"text_loss": 0.208544060587883
@@ -5204,13 +5204,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.34765625,
+ "grad_norm": 0.376953125,
"learning_rate": 0.0009999470951208684,
- "loss": 0.086,
+ "loss": 0.0855,
"macro_f1": 0.32098764181137085,
"num_tokens": 885241.0,
"repeat_count": 2.0,
- "routers_loss": 0.22461950778961182,
+ "routers_loss": 0.22983254492282867,
"skip_count": 0.0,
"step": 548,
"text_loss": 0.6612338423728943
@@ -5223,13 +5223,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.216796875,
"learning_rate": 0.00099994249686732,
- "loss": 0.0798,
+ "loss": 0.0786,
"macro_f1": 0.3272727429866791,
"num_tokens": 887897.0,
"repeat_count": 1.0,
- "routers_loss": 0.11754962801933289,
+ "routers_loss": 0.12858282029628754,
"skip_count": 0.0,
"step": 550,
"text_loss": 0.4673548936843872
@@ -5242,13 +5242,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0009999377070348638,
- "loss": 0.0978,
+ "loss": 0.0944,
"macro_f1": 0.3333333432674408,
"num_tokens": 891224.0,
"repeat_count": 0.0,
- "routers_loss": 0.017412789165973663,
+ "routers_loss": 0.017421770840883255,
"skip_count": 0.0,
"step": 552,
"text_loss": 0.6419258117675781
@@ -5261,13 +5261,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.15625,
"learning_rate": 0.000999932725625335,
- "loss": 0.0792,
+ "loss": 0.0791,
"macro_f1": 0.32098764181137085,
"num_tokens": 894578.0,
"repeat_count": 0.0,
- "routers_loss": 0.08969525247812271,
+ "routers_loss": 0.07890026271343231,
"skip_count": 2.0,
"step": 554,
"text_loss": 0.5970752239227295
@@ -5280,13 +5280,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "grad_norm": 0.216796875,
"learning_rate": 0.0009999275526406427,
- "loss": 0.0803,
+ "loss": 0.0796,
"macro_f1": 0.31446540355682373,
"num_tokens": 897145.0,
"repeat_count": 1.0,
- "routers_loss": 0.09876437485218048,
+ "routers_loss": 0.09836960583925247,
"skip_count": 1.0,
"step": 556,
"text_loss": 0.752425491809845
@@ -5299,13 +5299,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1875,
"learning_rate": 0.0009999221880827693,
- "loss": 0.0887,
+ "loss": 0.0882,
"macro_f1": 0.3333333432674408,
"num_tokens": 900565.0,
"repeat_count": 0.0,
- "routers_loss": 0.019108204171061516,
+ "routers_loss": 0.017694659531116486,
"skip_count": 0.0,
"step": 558,
"text_loss": 0.195619136095047
@@ -5318,32 +5318,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2021484375,
"learning_rate": 0.0009999166319537703,
- "loss": 0.0573,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 903506.0,
"repeat_count": 0.0,
- "routers_loss": 0.019048813730478287,
+ "routers_loss": 0.019375264644622803,
"skip_count": 0.0,
"step": 560,
"text_loss": 0.4603337347507477
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 2.638685060170238,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1435546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.146484375,
"learning_rate": 0.0009999108842557748,
- "loss": 0.0947,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0953,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 906380.0,
"repeat_count": 0.0,
- "routers_loss": 0.11889495700597763,
+ "routers_loss": 0.12013207376003265,
"skip_count": 3.0,
"step": 562,
"text_loss": 0.6279402375221252
@@ -5356,13 +5356,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.228515625,
+ "grad_norm": 0.255859375,
"learning_rate": 0.0009999049449909854,
- "loss": 0.0771,
+ "loss": 0.0799,
"macro_f1": 0.3272727429866791,
"num_tokens": 909116.0,
"repeat_count": 0.0,
- "routers_loss": 0.06202332302927971,
+ "routers_loss": 0.06441342830657959,
"skip_count": 1.0,
"step": 564,
"text_loss": 0.23741699755191803
@@ -5375,13 +5375,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009998988141616781,
- "loss": 0.0623,
+ "loss": 0.064,
"macro_f1": 0.32098767161369324,
"num_tokens": 912189.0,
"repeat_count": 0.0,
- "routers_loss": 0.08294244855642319,
+ "routers_loss": 0.08309414982795715,
"skip_count": 1.0,
"step": 566,
"text_loss": 0.27780941128730774
@@ -5394,13 +5394,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009998924917702023,
- "loss": 0.0885,
+ "loss": 0.0876,
"macro_f1": 0.3272727429866791,
"num_tokens": 916279.0,
"repeat_count": 1.0,
- "routers_loss": 0.07545182853937149,
+ "routers_loss": 0.07197169959545135,
"skip_count": 0.0,
"step": 568,
"text_loss": 0.6371755599975586
@@ -5413,13 +5413,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2255859375,
"learning_rate": 0.0009998859778189806,
- "loss": 0.0712,
+ "loss": 0.0706,
"macro_f1": 0.3333333432674408,
"num_tokens": 919490.0,
"repeat_count": 0.0,
- "routers_loss": 0.008711219765245914,
+ "routers_loss": 0.008022273890674114,
"skip_count": 0.0,
"step": 570,
"text_loss": 0.6028938889503479
@@ -5432,13 +5432,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.000999879272310509,
- "loss": 0.0837,
+ "loss": 0.084,
"macro_f1": 0.3333333432674408,
"num_tokens": 923694.0,
"repeat_count": 0.0,
- "routers_loss": 0.01639273390173912,
+ "routers_loss": 0.01634674146771431,
"skip_count": 0.0,
"step": 572,
"text_loss": 0.7177054286003113
@@ -5451,13 +5451,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.17578125,
"learning_rate": 0.0009998723752473574,
- "loss": 0.0707,
+ "loss": 0.0716,
"macro_f1": 0.3272727429866791,
"num_tokens": 926933.0,
"repeat_count": 0.0,
- "routers_loss": 0.04997137933969498,
+ "routers_loss": 0.060559045523405075,
"skip_count": 1.0,
"step": 574,
"text_loss": 0.5203254818916321
@@ -5470,13 +5470,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1845703125,
+ "grad_norm": 0.185546875,
"learning_rate": 0.0009998652866321687,
- "loss": 0.0799,
+ "loss": 0.0801,
"macro_f1": 0.3333333432674408,
"num_tokens": 929832.0,
"repeat_count": 0.0,
- "routers_loss": 0.011360209435224533,
+ "routers_loss": 0.011485611088573933,
"skip_count": 0.0,
"step": 576,
"text_loss": 0.6147452592849731
@@ -5489,13 +5489,13 @@
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.000999858006467659,
- "loss": 0.0658,
+ "loss": 0.0649,
"macro_f1": 0.29333335161209106,
"num_tokens": 933266.0,
"repeat_count": 2.0,
- "routers_loss": 0.31349560618400574,
+ "routers_loss": 0.2929030954837799,
"skip_count": 4.0,
"step": 578,
"text_loss": 0.1720666140317917
@@ -5508,13 +5508,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.24609375,
"learning_rate": 0.0009998505347566186,
- "loss": 0.0801,
+ "loss": 0.0782,
"macro_f1": 0.32098764181137085,
"num_tokens": 937545.0,
"repeat_count": 0.0,
- "routers_loss": 0.058660347014665604,
+ "routers_loss": 0.053780000656843185,
"skip_count": 2.0,
"step": 580,
"text_loss": 0.3258405327796936
@@ -5527,13 +5527,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1416015625,
"learning_rate": 0.00099984287150191,
- "loss": 0.0578,
+ "loss": 0.0582,
"macro_f1": 0.3333333432674408,
"num_tokens": 941001.0,
"repeat_count": 0.0,
- "routers_loss": 0.025836754590272903,
+ "routers_loss": 0.02637636847794056,
"skip_count": 0.0,
"step": 582,
"text_loss": 0.23762771487236023
@@ -5546,13 +5546,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009998350167064705,
- "loss": 0.0683,
+ "loss": 0.0672,
"macro_f1": 0.3333333432674408,
"num_tokens": 943989.0,
"repeat_count": 0.0,
- "routers_loss": 0.016504868865013123,
+ "routers_loss": 0.01637580618262291,
"skip_count": 0.0,
"step": 584,
"text_loss": 0.7460582852363586
@@ -5565,13 +5565,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1787109375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009998269703733096,
- "loss": 0.0685,
+ "loss": 0.0686,
"macro_f1": 0.3272727429866791,
"num_tokens": 947245.0,
"repeat_count": 1.0,
- "routers_loss": 0.1379794180393219,
+ "routers_loss": 0.13934117555618286,
"skip_count": 0.0,
"step": 586,
"text_loss": 0.5284690260887146
@@ -5584,13 +5584,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.13671875,
"learning_rate": 0.0009998187325055106,
- "loss": 0.0657,
+ "loss": 0.0667,
"macro_f1": 0.3333333432674408,
"num_tokens": 950116.0,
"repeat_count": 0.0,
- "routers_loss": 0.01802757754921913,
+ "routers_loss": 0.02138397842645645,
"skip_count": 0.0,
"step": 588,
"text_loss": 0.3920256197452545
@@ -5603,13 +5603,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009998103031062305,
- "loss": 0.0762,
+ "loss": 0.0778,
"macro_f1": 0.3333333432674408,
"num_tokens": 953277.0,
"repeat_count": 0.0,
- "routers_loss": 0.006902900990098715,
+ "routers_loss": 0.007098200265318155,
"skip_count": 0.0,
"step": 590,
"text_loss": 0.7472905516624451
@@ -5622,13 +5622,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.318359375,
"learning_rate": 0.0009998016821786994,
- "loss": 0.0912,
+ "loss": 0.0872,
"macro_f1": 0.32098764181137085,
"num_tokens": 958229.0,
"repeat_count": 1.0,
- "routers_loss": 0.08348741382360458,
+ "routers_loss": 0.07946522533893585,
"skip_count": 1.0,
"step": 592,
"text_loss": 0.5506448745727539
@@ -5641,13 +5641,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.000999792869726221,
- "loss": 0.0527,
+ "loss": 0.0523,
"macro_f1": 0.3272727429866791,
"num_tokens": 961016.0,
"repeat_count": 0.0,
- "routers_loss": 0.08290062099695206,
+ "routers_loss": 0.0850791186094284,
"skip_count": 1.0,
"step": 594,
"text_loss": 0.3824431002140045
@@ -5660,13 +5660,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009997838657521717,
- "loss": 0.0643,
+ "loss": 0.0632,
"macro_f1": 0.3333333432674408,
"num_tokens": 963847.0,
"repeat_count": 0.0,
- "routers_loss": 0.018620988354086876,
+ "routers_loss": 0.016370445489883423,
"skip_count": 0.0,
"step": 596,
"text_loss": 0.2139475792646408
@@ -5679,13 +5679,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009997746702600026,
- "loss": 0.073,
+ "loss": 0.0702,
"macro_f1": 0.307692289352417,
"num_tokens": 966619.0,
"repeat_count": 0.0,
- "routers_loss": 0.1211671382188797,
+ "routers_loss": 0.1310746818780899,
"skip_count": 3.0,
"step": 598,
"text_loss": 0.3651018440723419
@@ -5698,13 +5698,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.23828125,
"learning_rate": 0.0009997652832532372,
- "loss": 0.079,
+ "loss": 0.0792,
"macro_f1": 0.3272727429866791,
"num_tokens": 970418.0,
"repeat_count": 1.0,
- "routers_loss": 0.15485027432441711,
+ "routers_loss": 0.14303378760814667,
"skip_count": 0.0,
"step": 600,
"text_loss": 0.7094736099243164
@@ -5717,13 +5717,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009997557047354722,
- "loss": 0.0562,
+ "loss": 0.0531,
"macro_f1": 0.3272727429866791,
"num_tokens": 973491.0,
"repeat_count": 0.0,
- "routers_loss": 0.036684274673461914,
+ "routers_loss": 0.03334212675690651,
"skip_count": 1.0,
"step": 602,
"text_loss": 0.4812237024307251
@@ -5731,18 +5731,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.835926034634576,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.302734375,
+ "grad_norm": 0.2890625,
"learning_rate": 0.0009997459347103783,
- "loss": 0.0985,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0956,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 976672.0,
"repeat_count": 0.0,
- "routers_loss": 0.026901578530669212,
+ "routers_loss": 0.02831871062517166,
"skip_count": 0.0,
"step": 604,
"text_loss": 0.21737146377563477
@@ -5755,13 +5755,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12158203125,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009997359731816998,
- "loss": 0.0632,
+ "loss": 0.0646,
"macro_f1": 0.3333333432674408,
"num_tokens": 979898.0,
"repeat_count": 0.0,
- "routers_loss": 0.01700405217707157,
+ "routers_loss": 0.017968013882637024,
"skip_count": 0.0,
"step": 606,
"text_loss": 0.5458008050918579
@@ -5774,13 +5774,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2099609375,
+ "grad_norm": 0.224609375,
"learning_rate": 0.0009997258201532536,
- "loss": 0.0758,
+ "loss": 0.0751,
"macro_f1": 0.3333333432674408,
"num_tokens": 982811.0,
"repeat_count": 0.0,
- "routers_loss": 0.015013590455055237,
+ "routers_loss": 0.016256732866168022,
"skip_count": 0.0,
"step": 608,
"text_loss": 0.8643257021903992
@@ -5793,13 +5793,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009997154756289303,
- "loss": 0.0576,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 985245.0,
"repeat_count": 0.0,
- "routers_loss": 0.02037946693599224,
+ "routers_loss": 0.021214161068201065,
"skip_count": 0.0,
"step": 610,
"text_loss": 0.2204967886209488
@@ -5812,13 +5812,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.150390625,
"learning_rate": 0.000999704939612694,
- "loss": 0.0648,
+ "loss": 0.0636,
"macro_f1": 0.3006536364555359,
"num_tokens": 988539.0,
"repeat_count": 3.0,
- "routers_loss": 0.22834022343158722,
+ "routers_loss": 0.23249399662017822,
"skip_count": 2.0,
"step": 612,
"text_loss": 0.32489025592803955
@@ -5831,13 +5831,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009996942121085824,
- "loss": 0.0449,
+ "loss": 0.0445,
"macro_f1": 0.3333333432674408,
"num_tokens": 991660.0,
"repeat_count": 0.0,
- "routers_loss": 0.009838113561272621,
+ "routers_loss": 0.010706410743296146,
"skip_count": 0.0,
"step": 614,
"text_loss": 0.4551754891872406
@@ -5850,13 +5850,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.3671875,
"learning_rate": 0.000999683293120706,
- "loss": 0.1009,
+ "loss": 0.1016,
"macro_f1": 0.3333333432674408,
"num_tokens": 994828.0,
"repeat_count": 0.0,
- "routers_loss": 0.005943270865827799,
+ "routers_loss": 0.006676184479147196,
"skip_count": 0.0,
"step": 616,
"text_loss": 0.6212068200111389
@@ -5869,13 +5869,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.38671875,
+ "grad_norm": 0.408203125,
"learning_rate": 0.0009996721826532491,
- "loss": 0.0941,
+ "loss": 0.0976,
"macro_f1": 0.3076923191547394,
"num_tokens": 997951.0,
"repeat_count": 2.0,
- "routers_loss": 0.21597740054130554,
+ "routers_loss": 0.2148125320672989,
"skip_count": 2.0,
"step": 618,
"text_loss": 0.26514527201652527
@@ -5888,13 +5888,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.1904296875,
"learning_rate": 0.000999660880710469,
- "loss": 0.0896,
+ "loss": 0.0909,
"macro_f1": 0.3333333432674408,
"num_tokens": 1001139.0,
"repeat_count": 0.0,
- "routers_loss": 0.023726588115096092,
+ "routers_loss": 0.022332455962896347,
"skip_count": 0.0,
"step": 620,
"text_loss": 0.26131340861320496
@@ -5907,13 +5907,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009996493872966971,
"loss": 0.0732,
"macro_f1": 0.3272727429866791,
"num_tokens": 1003678.0,
"repeat_count": 1.0,
- "routers_loss": 0.08467255532741547,
+ "routers_loss": 0.08348730951547623,
"skip_count": 0.0,
"step": 622,
"text_loss": 0.19151706993579865
@@ -5926,13 +5926,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009996377024163374,
- "loss": 0.0816,
+ "loss": 0.0822,
"macro_f1": 0.3333333432674408,
"num_tokens": 1007082.0,
"repeat_count": 0.0,
- "routers_loss": 0.029468854889273643,
+ "routers_loss": 0.028577150776982307,
"skip_count": 0.0,
"step": 624,
"text_loss": 0.305387407541275
@@ -5945,13 +5945,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.11279296875,
"learning_rate": 0.0009996258260738676,
- "loss": 0.0891,
+ "loss": 0.0892,
"macro_f1": 0.3272727429866791,
"num_tokens": 1010064.0,
"repeat_count": 1.0,
- "routers_loss": 0.09438466280698776,
+ "routers_loss": 0.08312026411294937,
"skip_count": 0.0,
"step": 626,
"text_loss": 0.49436143040657043
@@ -5964,13 +5964,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009996137582738388,
- "loss": 0.0581,
+ "loss": 0.0591,
"macro_f1": 0.3333333432674408,
"num_tokens": 1013462.0,
"repeat_count": 0.0,
- "routers_loss": 0.013679586350917816,
+ "routers_loss": 0.013337327167391777,
"skip_count": 0.0,
"step": 628,
"text_loss": 0.6515294313430786
@@ -5983,13 +5983,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.140625,
"learning_rate": 0.000999601499020875,
- "loss": 0.0528,
+ "loss": 0.0537,
"macro_f1": 0.3333333432674408,
"num_tokens": 1016246.0,
"repeat_count": 0.0,
- "routers_loss": 0.029532987624406815,
+ "routers_loss": 0.029126765206456184,
"skip_count": 0.0,
"step": 630,
"text_loss": 0.18834827840328217
@@ -6002,13 +6002,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009995890483196746,
- "loss": 0.0601,
+ "loss": 0.0602,
"macro_f1": 0.3272727429866791,
"num_tokens": 1019286.0,
"repeat_count": 0.0,
- "routers_loss": 0.05516733601689339,
+ "routers_loss": 0.054844800382852554,
"skip_count": 1.0,
"step": 632,
"text_loss": 0.6988179087638855
@@ -6021,13 +6021,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.357421875,
+ "grad_norm": 0.322265625,
"learning_rate": 0.0009995764061750086,
- "loss": 0.0785,
+ "loss": 0.0767,
"macro_f1": 0.3333333432674408,
"num_tokens": 1022207.0,
"repeat_count": 0.0,
- "routers_loss": 0.010254866443574429,
+ "routers_loss": 0.010095693171024323,
"skip_count": 0.0,
"step": 634,
"text_loss": 0.558451771736145
@@ -6040,13 +6040,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.2890625,
"learning_rate": 0.000999563572591721,
- "loss": 0.0518,
+ "loss": 0.0521,
"macro_f1": 0.32098764181137085,
"num_tokens": 1025319.0,
"repeat_count": 1.0,
- "routers_loss": 0.07528360933065414,
+ "routers_loss": 0.0698433518409729,
"skip_count": 1.0,
"step": 636,
"text_loss": 0.5961872935295105
@@ -6059,13 +6059,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1064453125,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009995505475747302,
- "loss": 0.0844,
+ "loss": 0.0849,
"macro_f1": 0.3272727429866791,
"num_tokens": 1028362.0,
"repeat_count": 0.0,
- "routers_loss": 0.04301584139466286,
+ "routers_loss": 0.040211405605077744,
"skip_count": 1.0,
"step": 638,
"text_loss": 0.546863317489624
@@ -6078,13 +6078,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11572265625,
+ "grad_norm": 0.119140625,
"learning_rate": 0.0009995373311290272,
- "loss": 0.0699,
+ "loss": 0.0709,
"macro_f1": 0.3144654333591461,
"num_tokens": 1032199.0,
"repeat_count": 2.0,
- "routers_loss": 0.14521080255508423,
+ "routers_loss": 0.1457643061876297,
"skip_count": 1.0,
"step": 640,
"text_loss": 0.2137298285961151
@@ -6097,13 +6097,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1328125,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.0009995239232596764,
- "loss": 0.0543,
+ "loss": 0.0545,
"macro_f1": 0.3333333432674408,
"num_tokens": 1035801.0,
"repeat_count": 0.0,
- "routers_loss": 0.01074797473847866,
+ "routers_loss": 0.011394930072128773,
"skip_count": 0.0,
"step": 642,
"text_loss": 0.43054503202438354
@@ -6116,13 +6116,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009995103239718163,
- "loss": 0.0659,
+ "loss": 0.0665,
"macro_f1": 0.3333333432674408,
"num_tokens": 1039223.0,
"repeat_count": 0.0,
- "routers_loss": 0.009271817281842232,
+ "routers_loss": 0.00997432041913271,
"skip_count": 0.0,
"step": 644,
"text_loss": 0.7749615907669067
@@ -6135,13 +6135,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009994965332706573,
- "loss": 0.0737,
+ "loss": 0.0755,
"macro_f1": 0.3144654333591461,
"num_tokens": 1042154.0,
"repeat_count": 3.0,
- "routers_loss": 0.10257050395011902,
+ "routers_loss": 0.10589150339365005,
"skip_count": 0.0,
"step": 646,
"text_loss": 0.7812211513519287
@@ -6154,13 +6154,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.0009994825511614846,
- "loss": 0.0363,
+ "loss": 0.0383,
"macro_f1": 0.3272727429866791,
"num_tokens": 1045250.0,
"repeat_count": 0.0,
- "routers_loss": 0.07091924548149109,
+ "routers_loss": 0.0748734176158905,
"skip_count": 1.0,
"step": 648,
"text_loss": 0.844803512096405
@@ -6173,13 +6173,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11572265625,
+ "grad_norm": 0.1220703125,
"learning_rate": 0.0009994683776496562,
- "loss": 0.0421,
+ "loss": 0.0433,
"macro_f1": 0.3272727429866791,
"num_tokens": 1048446.0,
"repeat_count": 0.0,
- "routers_loss": 0.034446243196725845,
+ "routers_loss": 0.03742415830492973,
"skip_count": 1.0,
"step": 650,
"text_loss": 0.2098839282989502
@@ -6192,13 +6192,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009994540127406034,
- "loss": 0.0593,
+ "loss": 0.0591,
"macro_f1": 0.32098764181137085,
"num_tokens": 1051840.0,
"repeat_count": 0.0,
- "routers_loss": 0.06077485531568527,
+ "routers_loss": 0.06025516986846924,
"skip_count": 2.0,
"step": 652,
"text_loss": 0.27727583050727844
@@ -6211,13 +6211,13 @@
"f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.181640625,
"learning_rate": 0.0009994394564398306,
- "loss": 0.0537,
+ "loss": 0.0519,
"macro_f1": 0.521541953086853,
"num_tokens": 1055142.0,
"repeat_count": 4.0,
- "routers_loss": 0.2382282167673111,
+ "routers_loss": 0.22807340323925018,
"skip_count": 2.0,
"step": 654,
"text_loss": 0.9672397971153259
@@ -6230,13 +6230,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.142578125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009994247087529158,
- "loss": 0.0613,
+ "loss": 0.0618,
"macro_f1": 0.3333333432674408,
"num_tokens": 1057698.0,
"repeat_count": 0.0,
- "routers_loss": 0.011971636675298214,
+ "routers_loss": 0.01348950993269682,
"skip_count": 0.0,
"step": 656,
"text_loss": 0.6375506520271301
@@ -6249,13 +6249,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.212890625,
+ "grad_norm": 0.1953125,
"learning_rate": 0.0009994097696855106,
- "loss": 0.0414,
+ "loss": 0.0412,
"macro_f1": 0.3333333432674408,
"num_tokens": 1060624.0,
"repeat_count": 0.0,
- "routers_loss": 0.010221127420663834,
+ "routers_loss": 0.009649243205785751,
"skip_count": 0.0,
"step": 658,
"text_loss": 0.5315385460853577
@@ -6268,13 +6268,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2265625,
+ "grad_norm": 0.2041015625,
"learning_rate": 0.0009993946392433395,
- "loss": 0.061,
+ "loss": 0.0609,
"macro_f1": 0.307692289352417,
"num_tokens": 1065076.0,
"repeat_count": 0.0,
- "routers_loss": 0.11860335618257523,
+ "routers_loss": 0.1250980943441391,
"skip_count": 3.0,
"step": 660,
"text_loss": 0.25780341029167175
@@ -6287,13 +6287,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009993793174322006,
- "loss": 0.0485,
+ "loss": 0.0471,
"macro_f1": 0.3333333432674408,
"num_tokens": 1068365.0,
"repeat_count": 0.0,
- "routers_loss": 0.011139829643070698,
+ "routers_loss": 0.011544390581548214,
"skip_count": 0.0,
"step": 662,
"text_loss": 0.34876301884651184
@@ -6306,13 +6306,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.166015625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009993638042579654,
- "loss": 0.0478,
+ "loss": 0.0473,
"macro_f1": 0.3272727429866791,
"num_tokens": 1071693.0,
"repeat_count": 0.0,
- "routers_loss": 0.03978770971298218,
+ "routers_loss": 0.03777370601892471,
"skip_count": 1.0,
"step": 664,
"text_loss": 0.21811571717262268
@@ -6327,11 +6327,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.203125,
"learning_rate": 0.0009993480997265783,
- "loss": 0.0481,
+ "loss": 0.0475,
"macro_f1": 0.5492662787437439,
"num_tokens": 1074733.0,
"repeat_count": 0.0,
- "routers_loss": 0.051231011748313904,
+ "routers_loss": 0.049949806183576584,
"skip_count": 2.0,
"step": 666,
"text_loss": 0.38410288095474243
@@ -6344,13 +6344,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.10302734375,
"learning_rate": 0.0009993322038440572,
- "loss": 0.0615,
+ "loss": 0.0605,
"macro_f1": 0.3333333432674408,
"num_tokens": 1077993.0,
"repeat_count": 0.0,
- "routers_loss": 0.024917088449001312,
+ "routers_loss": 0.0247171800583601,
"skip_count": 0.0,
"step": 668,
"text_loss": 0.25576895475387573
@@ -6363,13 +6363,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.216796875,
"learning_rate": 0.000999316116616494,
- "loss": 0.0627,
+ "loss": 0.0619,
"macro_f1": 0.3333333432674408,
"num_tokens": 1080491.0,
"repeat_count": 0.0,
- "routers_loss": 0.008834881708025932,
+ "routers_loss": 0.008118715137243271,
"skip_count": 0.0,
"step": 670,
"text_loss": 0.6269792914390564
@@ -6382,13 +6382,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009992998380500527,
"loss": 0.0462,
"macro_f1": 0.3272727429866791,
"num_tokens": 1083817.0,
"repeat_count": 0.0,
- "routers_loss": 0.033405229449272156,
+ "routers_loss": 0.03366057574748993,
"skip_count": 1.0,
"step": 672,
"text_loss": 0.26891493797302246
@@ -6401,13 +6401,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.13671875,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009992833681509716,
- "loss": 0.0523,
+ "loss": 0.0529,
"macro_f1": 0.3333333432674408,
"num_tokens": 1087368.0,
"repeat_count": 0.0,
- "routers_loss": 0.020753704011440277,
+ "routers_loss": 0.020552074536681175,
"skip_count": 0.0,
"step": 674,
"text_loss": 0.14421936869621277
@@ -6420,13 +6420,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.18359375,
"learning_rate": 0.0009992667069255619,
- "loss": 0.0698,
+ "loss": 0.0696,
"macro_f1": 0.31446540355682373,
"num_tokens": 1090452.0,
"repeat_count": 0.0,
- "routers_loss": 0.06932353973388672,
+ "routers_loss": 0.06937336176633835,
"skip_count": 2.0,
"step": 676,
"text_loss": 0.24999259412288666
@@ -6439,13 +6439,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.0009992498543802085,
- "loss": 0.059,
+ "loss": 0.0588,
"macro_f1": 0.3272727429866791,
"num_tokens": 1093996.0,
"repeat_count": 1.0,
- "routers_loss": 0.032903749495744705,
+ "routers_loss": 0.0380021296441555,
"skip_count": 0.0,
"step": 678,
"text_loss": 0.42473849654197693
@@ -6458,32 +6458,32 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.2099609375,
+ "grad_norm": 0.2119140625,
"learning_rate": 0.0009992328105213688,
- "loss": 0.0417,
+ "loss": 0.0411,
"macro_f1": 0.4400000274181366,
"num_tokens": 1096837.0,
"repeat_count": 1.0,
- "routers_loss": 0.19733747839927673,
+ "routers_loss": 0.20885063707828522,
"skip_count": 4.0,
"step": 680,
"text_loss": 0.3829527199268341
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.2019371881420606,
- "f1_execute": 1.0,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.154296875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009992155753555747,
- "loss": 0.0729,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0722,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1100320.0,
"repeat_count": 0.0,
- "routers_loss": 0.013452666811645031,
+ "routers_loss": 0.018230699002742767,
"skip_count": 2.0,
"step": 682,
"text_loss": 0.6190969944000244
@@ -6496,13 +6496,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.30859375,
"learning_rate": 0.0009991981488894303,
"loss": 0.0681,
"macro_f1": 0.32098767161369324,
"num_tokens": 1103682.0,
"repeat_count": 0.0,
- "routers_loss": 0.05302857980132103,
+ "routers_loss": 0.05550144240260124,
"skip_count": 1.0,
"step": 684,
"text_loss": 0.44418027997016907
@@ -6515,13 +6515,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.0009991805311296133,
- "loss": 0.0527,
+ "loss": 0.0507,
"macro_f1": 0.32098764181137085,
"num_tokens": 1106427.0,
"repeat_count": 0.0,
- "routers_loss": 0.08124994486570358,
+ "routers_loss": 0.07990608364343643,
"skip_count": 2.0,
"step": 686,
"text_loss": 0.5577231645584106
@@ -6534,13 +6534,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009991627220828753,
- "loss": 0.0579,
+ "loss": 0.0568,
"macro_f1": 0.32098764181137085,
"num_tokens": 1109314.0,
"repeat_count": 0.0,
- "routers_loss": 0.058633625507354736,
+ "routers_loss": 0.05167485028505325,
"skip_count": 2.0,
"step": 688,
"text_loss": 0.27325430512428284
@@ -6553,13 +6553,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009991447217560408,
- "loss": 0.0533,
+ "loss": 0.0521,
"macro_f1": 0.5492662787437439,
"num_tokens": 1112748.0,
"repeat_count": 0.0,
- "routers_loss": 0.04703643172979355,
+ "routers_loss": 0.04621964320540428,
"skip_count": 2.0,
"step": 690,
"text_loss": 0.5288321375846863
@@ -6572,13 +6572,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.000999126530156007,
- "loss": 0.0485,
+ "loss": 0.0499,
"macro_f1": 0.307692289352417,
"num_tokens": 1116965.0,
"repeat_count": 1.0,
- "routers_loss": 0.11615128815174103,
+ "routers_loss": 0.11950276792049408,
"skip_count": 2.0,
"step": 692,
"text_loss": 0.14215624332427979
@@ -6591,13 +6591,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2314453125,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.0009991081472897454,
- "loss": 0.0718,
+ "loss": 0.0722,
"macro_f1": 0.3333333432674408,
"num_tokens": 1120570.0,
"repeat_count": 0.0,
- "routers_loss": 0.017403846606612206,
+ "routers_loss": 0.01905500330030918,
"skip_count": 0.0,
"step": 694,
"text_loss": 0.41862696409225464
@@ -6610,13 +6610,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.0009990895731643002,
- "loss": 0.0444,
+ "loss": 0.0464,
"macro_f1": 0.3272727429866791,
"num_tokens": 1124009.0,
"repeat_count": 1.0,
- "routers_loss": 0.07067303359508514,
+ "routers_loss": 0.06974572688341141,
"skip_count": 0.0,
"step": 696,
"text_loss": 0.41160130500793457
@@ -6629,13 +6629,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.000999070807786789,
- "loss": 0.0527,
+ "loss": 0.0531,
"macro_f1": 0.3272727429866791,
"num_tokens": 1127370.0,
"repeat_count": 1.0,
- "routers_loss": 0.07131028175354004,
+ "routers_loss": 0.07055293023586273,
"skip_count": 0.0,
"step": 698,
"text_loss": 0.48068273067474365
@@ -6648,13 +6648,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.18359375,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000999051851164403,
- "loss": 0.0629,
+ "loss": 0.0619,
"macro_f1": 0.32098764181137085,
"num_tokens": 1130234.0,
"repeat_count": 1.0,
- "routers_loss": 0.1152748316526413,
+ "routers_loss": 0.12506946921348572,
"skip_count": 1.0,
"step": 700,
"text_loss": 0.47925490140914917
@@ -6667,13 +6667,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000999032703304406,
- "loss": 0.0663,
+ "loss": 0.0674,
"macro_f1": 0.3333333432674408,
"num_tokens": 1132874.0,
"repeat_count": 0.0,
- "routers_loss": 0.0077212234027683735,
+ "routers_loss": 0.00809287466108799,
"skip_count": 0.0,
"step": 702,
"text_loss": 0.47433632612228394
@@ -6686,13 +6686,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.1064453125,
"learning_rate": 0.0009990133642141358,
- "loss": 0.0494,
+ "loss": 0.0497,
"macro_f1": 0.5492662787437439,
"num_tokens": 1136011.0,
"repeat_count": 0.0,
- "routers_loss": 0.02726336568593979,
+ "routers_loss": 0.0319170281291008,
"skip_count": 2.0,
"step": 704,
"text_loss": 0.6574832201004028
@@ -6705,13 +6705,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.32421875,
+ "grad_norm": 0.33984375,
"learning_rate": 0.000998993833901003,
- "loss": 0.0615,
+ "loss": 0.0619,
"macro_f1": 0.32098764181137085,
"num_tokens": 1139674.0,
"repeat_count": 0.0,
- "routers_loss": 0.0958542674779892,
+ "routers_loss": 0.09850362688302994,
"skip_count": 2.0,
"step": 706,
"text_loss": 0.7660127282142639
@@ -6724,13 +6724,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009989741123724919,
- "loss": 0.0583,
+ "loss": 0.0574,
"macro_f1": 0.3333333432674408,
"num_tokens": 1143558.0,
"repeat_count": 0.0,
- "routers_loss": 0.007100600749254227,
+ "routers_loss": 0.006673311349004507,
"skip_count": 0.0,
"step": 708,
"text_loss": 0.5976111888885498
@@ -6743,13 +6743,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009989541996361594,
- "loss": 0.0445,
+ "loss": 0.045,
"macro_f1": 0.3333333432674408,
"num_tokens": 1146122.0,
"repeat_count": 0.0,
- "routers_loss": 0.0047812811098992825,
+ "routers_loss": 0.004988791421055794,
"skip_count": 0.0,
"step": 710,
"text_loss": 0.5256119966506958
@@ -6762,13 +6762,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009989340956996367,
- "loss": 0.052,
+ "loss": 0.0528,
"macro_f1": 0.3333333432674408,
"num_tokens": 1149546.0,
"repeat_count": 0.0,
- "routers_loss": 0.006643407512456179,
+ "routers_loss": 0.0067769973538815975,
"skip_count": 0.0,
"step": 712,
"text_loss": 0.5040497779846191
@@ -6781,13 +6781,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2890625,
+ "grad_norm": 0.26953125,
"learning_rate": 0.0009989138005706273,
- "loss": 0.0719,
+ "loss": 0.0735,
"macro_f1": 0.32098764181137085,
"num_tokens": 1153195.0,
"repeat_count": 0.0,
- "routers_loss": 0.0910436138510704,
+ "routers_loss": 0.09899546951055527,
"skip_count": 2.0,
"step": 714,
"text_loss": 0.20803412795066833
@@ -6800,13 +6800,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1484375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.000998893314256908,
- "loss": 0.0649,
+ "loss": 0.064,
"macro_f1": 0.3333333432674408,
"num_tokens": 1157081.0,
"repeat_count": 0.0,
- "routers_loss": 0.010978946462273598,
+ "routers_loss": 0.010492355562746525,
"skip_count": 0.0,
"step": 716,
"text_loss": 0.23077639937400818
@@ -6819,13 +6819,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009988726367663298,
- "loss": 0.0543,
+ "loss": 0.0539,
"macro_f1": 0.3333333432674408,
"num_tokens": 1160079.0,
"repeat_count": 0.0,
- "routers_loss": 0.009956461377441883,
+ "routers_loss": 0.01063773687928915,
"skip_count": 0.0,
"step": 718,
"text_loss": 0.6085864901542664
@@ -6838,13 +6838,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009988517681068163,
- "loss": 0.0412,
+ "loss": 0.0421,
"macro_f1": 0.3272727429866791,
"num_tokens": 1163249.0,
"repeat_count": 1.0,
- "routers_loss": 0.057210199534893036,
+ "routers_loss": 0.05981874838471413,
"skip_count": 0.0,
"step": 720,
"text_loss": 0.4047050476074219
@@ -6857,32 +6857,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009988307082863638,
- "loss": 0.0364,
+ "loss": 0.0361,
"macro_f1": 0.3333333432674408,
"num_tokens": 1166259.0,
"repeat_count": 0.0,
- "routers_loss": 0.01035996899008751,
+ "routers_loss": 0.009750043973326683,
"skip_count": 0.0,
"step": 722,
"text_loss": 0.5306474566459656
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.3991781626063986,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.240234375,
"learning_rate": 0.0009988094573130434,
- "loss": 0.0661,
- "macro_f1": 0.3076923191547394,
+ "loss": 0.063,
+ "macro_f1": 0.5359477400779724,
"num_tokens": 1168887.0,
"repeat_count": 2.0,
- "routers_loss": 0.18087820708751678,
+ "routers_loss": 0.18601104617118835,
"skip_count": 2.0,
"step": 724,
"text_loss": 0.53528892993927
@@ -6895,32 +6895,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009987880151949974,
- "loss": 0.0505,
+ "loss": 0.0496,
"macro_f1": 0.3272727429866791,
"num_tokens": 1172625.0,
"repeat_count": 0.0,
- "routers_loss": 0.04720238968729973,
+ "routers_loss": 0.02845010720193386,
"skip_count": 1.0,
"step": 726,
"text_loss": 0.4760453701019287
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.417963017317288,
- "f1_execute": 1.0,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.2216796875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.0009987663819404434,
- "loss": 0.0603,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.06,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1176580.0,
"repeat_count": 0.0,
- "routers_loss": 0.015407778322696686,
+ "routers_loss": 0.017596980556845665,
"skip_count": 2.0,
"step": 728,
"text_loss": 0.5146099328994751
@@ -6933,13 +6933,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.000998744557557671,
- "loss": 0.0489,
+ "loss": 0.0484,
"macro_f1": 0.3272727429866791,
"num_tokens": 1179804.0,
"repeat_count": 0.0,
- "routers_loss": 0.060891781002283096,
+ "routers_loss": 0.0625474750995636,
"skip_count": 1.0,
"step": 730,
"text_loss": 0.27738022804260254
@@ -6947,18 +6947,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.436747872028177,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.203125,
"learning_rate": 0.0009987225420550433,
- "loss": 0.0825,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0796,
+ "macro_f1": 0.307692289352417,
"num_tokens": 1182658.0,
"repeat_count": 1.0,
- "routers_loss": 0.1661442220211029,
+ "routers_loss": 0.16188351809978485,
"skip_count": 2.0,
"step": 732,
"text_loss": 0.23231445252895355
@@ -6966,18 +6966,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.446140299383622,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.2001953125,
"learning_rate": 0.0009987003354409965,
- "loss": 0.0634,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0626,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1185451.0,
"repeat_count": 0.0,
- "routers_loss": 0.02108248695731163,
+ "routers_loss": 0.02391529455780983,
"skip_count": 0.0,
"step": 734,
"text_loss": 0.4496627151966095
@@ -6990,13 +6990,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.248046875,
+ "grad_norm": 0.234375,
"learning_rate": 0.0009986779377240405,
- "loss": 0.0534,
+ "loss": 0.0513,
"macro_f1": 0.32098767161369324,
"num_tokens": 1188666.0,
"repeat_count": 0.0,
- "routers_loss": 0.08318125456571579,
+ "routers_loss": 0.08435963839292526,
"skip_count": 1.0,
"step": 736,
"text_loss": 0.4950787127017975
@@ -7009,13 +7009,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11962890625,
+ "grad_norm": 0.1220703125,
"learning_rate": 0.000998655348912758,
- "loss": 0.0514,
+ "loss": 0.0515,
"macro_f1": 0.3333333432674408,
"num_tokens": 1193035.0,
"repeat_count": 0.0,
- "routers_loss": 0.015889234840869904,
+ "routers_loss": 0.01648722216486931,
"skip_count": 0.0,
"step": 738,
"text_loss": 0.24761848151683807
@@ -7028,13 +7028,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1513671875,
"learning_rate": 0.0009986325690158051,
"loss": 0.0435,
"macro_f1": 0.3333333432674408,
"num_tokens": 1196840.0,
"repeat_count": 0.0,
- "routers_loss": 0.01378484908491373,
+ "routers_loss": 0.013143910095095634,
"skip_count": 0.0,
"step": 740,
"text_loss": 0.15662719309329987
@@ -7047,13 +7047,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1787109375,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009986095980419113,
- "loss": 0.076,
+ "loss": 0.0757,
"macro_f1": 0.3333333432674408,
"num_tokens": 1200573.0,
"repeat_count": 0.0,
- "routers_loss": 0.02673683874309063,
+ "routers_loss": 0.026706280186772346,
"skip_count": 0.0,
"step": 742,
"text_loss": 0.16725164651870728
@@ -7066,13 +7066,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.185546875,
+ "grad_norm": 0.1982421875,
"learning_rate": 0.0009985864359998787,
- "loss": 0.0778,
+ "loss": 0.0795,
"macro_f1": 0.3006536364555359,
"num_tokens": 1203589.0,
"repeat_count": 2.0,
- "routers_loss": 0.27776041626930237,
+ "routers_loss": 0.28607678413391113,
"skip_count": 3.0,
"step": 744,
"text_loss": 0.6350882053375244
@@ -7085,13 +7085,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009985630828985835,
- "loss": 0.0575,
+ "loss": 0.0572,
"macro_f1": 0.3272727429866791,
"num_tokens": 1206422.0,
"repeat_count": 0.0,
- "routers_loss": 0.0575483962893486,
+ "routers_loss": 0.05685260891914368,
"skip_count": 1.0,
"step": 746,
"text_loss": 0.33779552578926086
@@ -7104,13 +7104,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009985395387469742,
- "loss": 0.0478,
+ "loss": 0.0458,
"macro_f1": 0.5492662787437439,
"num_tokens": 1211588.0,
"repeat_count": 0.0,
- "routers_loss": 0.0458797849714756,
+ "routers_loss": 0.0437830351293087,
"skip_count": 2.0,
"step": 748,
"text_loss": 0.28664472699165344
@@ -7123,13 +7123,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.15625,
"learning_rate": 0.0009985158035540735,
- "loss": 0.0701,
+ "loss": 0.0714,
"macro_f1": 0.32098764181137085,
"num_tokens": 1214580.0,
"repeat_count": 2.0,
- "routers_loss": 0.07850238680839539,
+ "routers_loss": 0.07074898481369019,
"skip_count": 0.0,
"step": 750,
"text_loss": 0.3939313292503357
@@ -7142,13 +7142,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.21484375,
"learning_rate": 0.0009984918773289762,
- "loss": 0.0702,
+ "loss": 0.0699,
"macro_f1": 0.3333333432674408,
"num_tokens": 1217388.0,
"repeat_count": 0.0,
- "routers_loss": 0.009507967159152031,
+ "routers_loss": 0.009757856838405132,
"skip_count": 0.0,
"step": 752,
"text_loss": 0.37641215324401855
@@ -7161,13 +7161,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1484375,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009984677600808512,
- "loss": 0.0543,
+ "loss": 0.054,
"macro_f1": 0.3333333432674408,
"num_tokens": 1219960.0,
"repeat_count": 0.0,
- "routers_loss": 0.02620997279882431,
+ "routers_loss": 0.02515069581568241,
"skip_count": 0.0,
"step": 754,
"text_loss": 0.155938982963562
@@ -7180,13 +7180,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3359375,
+ "grad_norm": 0.30078125,
"learning_rate": 0.0009984434518189405,
- "loss": 0.0791,
+ "loss": 0.0764,
"macro_f1": 0.3333333432674408,
"num_tokens": 1223234.0,
"repeat_count": 0.0,
- "routers_loss": 0.02798631228506565,
+ "routers_loss": 0.025766927748918533,
"skip_count": 0.0,
"step": 756,
"text_loss": 0.691118061542511
@@ -7201,11 +7201,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1416015625,
"learning_rate": 0.0009984189525525584,
- "loss": 0.046,
+ "loss": 0.0451,
"macro_f1": 0.5359477400779724,
"num_tokens": 1225764.0,
"repeat_count": 2.0,
- "routers_loss": 0.16614431142807007,
+ "routers_loss": 0.1782722771167755,
"skip_count": 2.0,
"step": 758,
"text_loss": 0.3592209219932556
@@ -7218,13 +7218,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.193359375,
+ "grad_norm": 0.189453125,
"learning_rate": 0.0009983942622910935,
- "loss": 0.0669,
+ "loss": 0.0659,
"macro_f1": 0.3333333432674408,
"num_tokens": 1230097.0,
"repeat_count": 0.0,
- "routers_loss": 0.008541896007955074,
+ "routers_loss": 0.00825568474829197,
"skip_count": 0.0,
"step": 760,
"text_loss": 0.4646475315093994
@@ -7237,13 +7237,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009983693810440074,
- "loss": 0.0478,
+ "loss": 0.0477,
"macro_f1": 0.32098764181137085,
"num_tokens": 1233140.0,
"repeat_count": 0.0,
- "routers_loss": 0.045411624014377594,
+ "routers_loss": 0.04156976938247681,
"skip_count": 2.0,
"step": 762,
"text_loss": 0.298682302236557
@@ -7256,13 +7256,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.380859375,
+ "grad_norm": 0.3515625,
"learning_rate": 0.000998344308820834,
- "loss": 0.0689,
+ "loss": 0.0666,
"macro_f1": 0.3272727429866791,
"num_tokens": 1236305.0,
"repeat_count": 0.0,
- "routers_loss": 0.052299100905656815,
+ "routers_loss": 0.05697929114103317,
"skip_count": 1.0,
"step": 764,
"text_loss": 0.5249121189117432
@@ -7275,13 +7275,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.18359375,
"learning_rate": 0.0009983190456311817,
- "loss": 0.0602,
+ "loss": 0.0592,
"macro_f1": 0.3144654333591461,
"num_tokens": 1239673.0,
"repeat_count": 0.0,
- "routers_loss": 0.09140212833881378,
+ "routers_loss": 0.09547408670186996,
"skip_count": 3.0,
"step": 766,
"text_loss": 0.41277334094047546
@@ -7294,13 +7294,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.201171875,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000998293591484731,
- "loss": 0.0475,
+ "loss": 0.0484,
"macro_f1": 0.5492662787437439,
"num_tokens": 1242292.0,
"repeat_count": 0.0,
- "routers_loss": 0.030750583857297897,
+ "routers_loss": 0.030693158507347107,
"skip_count": 2.0,
"step": 768,
"text_loss": 0.1583656519651413
@@ -7313,13 +7313,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000998267946391236,
- "loss": 0.052,
+ "loss": 0.051,
"macro_f1": 0.3333333432674408,
"num_tokens": 1244661.0,
"repeat_count": 0.0,
- "routers_loss": 0.010202950797975063,
+ "routers_loss": 0.01211300864815712,
"skip_count": 0.0,
"step": 770,
"text_loss": 0.4629349112510681
@@ -7332,13 +7332,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0009982421103605238,
- "loss": 0.0434,
+ "loss": 0.0441,
"macro_f1": 0.32098764181137085,
"num_tokens": 1248688.0,
"repeat_count": 0.0,
- "routers_loss": 0.07364192605018616,
+ "routers_loss": 0.0665968507528305,
"skip_count": 2.0,
"step": 772,
"text_loss": 0.4019293785095215
@@ -7353,11 +7353,11 @@
"f1_skip": 0.0,
"grad_norm": 0.2890625,
"learning_rate": 0.000998216083402495,
- "loss": 0.0606,
+ "loss": 0.0613,
"macro_f1": 0.32098764181137085,
"num_tokens": 1251395.0,
"repeat_count": 0.0,
- "routers_loss": 0.06553081423044205,
+ "routers_loss": 0.07186859846115112,
"skip_count": 2.0,
"step": 774,
"text_loss": 0.4659276604652405
@@ -7370,13 +7370,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.302734375,
"learning_rate": 0.0009981898655271235,
- "loss": 0.0475,
+ "loss": 0.0488,
"macro_f1": 0.3333333432674408,
"num_tokens": 1254888.0,
"repeat_count": 0.0,
- "routers_loss": 0.008751659654080868,
+ "routers_loss": 0.007823926396667957,
"skip_count": 0.0,
"step": 776,
"text_loss": 0.5160359740257263
@@ -7389,13 +7389,13 @@
"f1_execute": 0.9130434989929199,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.11962890625,
"learning_rate": 0.0009981634567444557,
- "loss": 0.0777,
+ "loss": 0.0775,
"macro_f1": 0.590062141418457,
"num_tokens": 1258250.0,
"repeat_count": 3.0,
- "routers_loss": 0.24522721767425537,
+ "routers_loss": 0.24624499678611755,
"skip_count": 4.0,
"step": 778,
"text_loss": 0.29319918155670166
@@ -7408,13 +7408,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.0009981368570646115,
"loss": 0.0885,
"macro_f1": 0.3272727429866791,
"num_tokens": 1260916.0,
"repeat_count": 0.0,
- "routers_loss": 0.03767623379826546,
+ "routers_loss": 0.030730176717042923,
"skip_count": 1.0,
"step": 780,
"text_loss": 0.624981164932251
@@ -7427,13 +7427,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009981100664977838,
- "loss": 0.0708,
+ "loss": 0.0699,
"macro_f1": 0.3333333432674408,
"num_tokens": 1264004.0,
"repeat_count": 0.0,
- "routers_loss": 0.006098059006035328,
+ "routers_loss": 0.006829176563769579,
"skip_count": 0.0,
"step": 782,
"text_loss": 0.6137266159057617
@@ -7446,13 +7446,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009980830850542391,
- "loss": 0.0589,
+ "loss": 0.058,
"macro_f1": 0.3333333432674408,
"num_tokens": 1267130.0,
"repeat_count": 0.0,
- "routers_loss": 0.01731623336672783,
+ "routers_loss": 0.018471000716090202,
"skip_count": 0.0,
"step": 784,
"text_loss": 0.15213175117969513
@@ -7465,13 +7465,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.0009980559127443166,
- "loss": 0.0526,
+ "loss": 0.052,
"macro_f1": 0.3333333432674408,
"num_tokens": 1271129.0,
"repeat_count": 0.0,
- "routers_loss": 0.0076471962966024876,
+ "routers_loss": 0.007903140969574451,
"skip_count": 0.0,
"step": 786,
"text_loss": 0.5768613219261169
@@ -7484,13 +7484,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.130859375,
"learning_rate": 0.000998028549578429,
- "loss": 0.0745,
+ "loss": 0.0719,
"macro_f1": 0.307692289352417,
"num_tokens": 1274232.0,
"repeat_count": 0.0,
- "routers_loss": 0.0637628585100174,
+ "routers_loss": 0.06737866252660751,
"skip_count": 3.0,
"step": 788,
"text_loss": 0.2877073585987091
@@ -7503,13 +7503,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009980009955670615,
- "loss": 0.0699,
+ "loss": 0.0698,
"macro_f1": 0.3144654333591461,
"num_tokens": 1277193.0,
"repeat_count": 0.0,
- "routers_loss": 0.10882514715194702,
+ "routers_loss": 0.10194934904575348,
"skip_count": 3.0,
"step": 790,
"text_loss": 0.11860492825508118
@@ -7522,13 +7522,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.126953125,
"learning_rate": 0.000997973250720773,
- "loss": 0.056,
+ "loss": 0.0552,
"macro_f1": 0.32098764181137085,
"num_tokens": 1280960.0,
"repeat_count": 0.0,
- "routers_loss": 0.10924118757247925,
+ "routers_loss": 0.10297708213329315,
"skip_count": 2.0,
"step": 792,
"text_loss": 0.13477706909179688
@@ -7541,13 +7541,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009979453150501954,
- "loss": 0.0664,
+ "loss": 0.0663,
"macro_f1": 0.32098764181137085,
"num_tokens": 1284611.0,
"repeat_count": 1.0,
- "routers_loss": 0.06571807712316513,
+ "routers_loss": 0.06122037023305893,
"skip_count": 1.0,
"step": 794,
"text_loss": 0.40569379925727844
@@ -7560,13 +7560,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1181640625,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.000997917188566034,
- "loss": 0.0616,
+ "loss": 0.062,
"macro_f1": 0.32098764181137085,
"num_tokens": 1287834.0,
"repeat_count": 0.0,
- "routers_loss": 0.058966971933841705,
+ "routers_loss": 0.061135001480579376,
"skip_count": 2.0,
"step": 796,
"text_loss": 0.2829287648200989
@@ -7579,32 +7579,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.109375,
"learning_rate": 0.0009978888712790664,
- "loss": 0.067,
+ "loss": 0.0654,
"macro_f1": 0.3272727429866791,
"num_tokens": 1291666.0,
"repeat_count": 0.0,
- "routers_loss": 0.04844636470079422,
+ "routers_loss": 0.04841872677206993,
"skip_count": 1.0,
"step": 798,
"text_loss": 1.011757254600525
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.4000000059604645,
- "avg_layers": 26.0,
+ "acc_skip": 0.20000000298023224,
+ "avg_layers": 27.0,
"epoch": 3.756090402113296,
- "f1_execute": 0.9166666865348816,
+ "f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
- "f1_skip": 0.5714285969734192,
- "grad_norm": 0.1416015625,
+ "f1_skip": 0.3333333134651184,
+ "grad_norm": 0.14453125,
"learning_rate": 0.0009978603632001444,
- "loss": 0.0634,
- "macro_f1": 0.4960317611694336,
+ "loss": 0.0636,
+ "macro_f1": 0.4104308485984802,
"num_tokens": 1294627.0,
"repeat_count": 1.0,
- "routers_loss": 0.1591777801513672,
+ "routers_loss": 0.15698759257793427,
"skip_count": 5.0,
"step": 800,
"text_loss": 0.4457623362541199
@@ -7617,13 +7617,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0009978316643401916,
- "loss": 0.0694,
+ "loss": 0.0688,
"macro_f1": 0.3333333432674408,
"num_tokens": 1297711.0,
"repeat_count": 0.0,
- "routers_loss": 0.017735568806529045,
+ "routers_loss": 0.018952010199427605,
"skip_count": 0.0,
"step": 802,
"text_loss": 0.2069481462240219
@@ -7636,13 +7636,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.14453125,
"learning_rate": 0.0009978027747102062,
- "loss": 0.0477,
+ "loss": 0.0479,
"macro_f1": 0.3333333432674408,
"num_tokens": 1300569.0,
"repeat_count": 0.0,
- "routers_loss": 0.012401525862514973,
+ "routers_loss": 0.014538386836647987,
"skip_count": 0.0,
"step": 804,
"text_loss": 0.4983852505683899
@@ -7655,13 +7655,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2080078125,
+ "grad_norm": 0.2109375,
"learning_rate": 0.0009977736943212584,
- "loss": 0.0735,
+ "loss": 0.0721,
"macro_f1": 0.32098764181137085,
"num_tokens": 1303969.0,
"repeat_count": 0.0,
- "routers_loss": 0.10736164450645447,
+ "routers_loss": 0.11164087057113647,
"skip_count": 2.0,
"step": 806,
"text_loss": 0.2910642921924591
@@ -7674,13 +7674,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2001953125,
+ "grad_norm": 0.1826171875,
"learning_rate": 0.000997744423184492,
- "loss": 0.0428,
+ "loss": 0.0424,
"macro_f1": 0.3272727429866791,
"num_tokens": 1307263.0,
"repeat_count": 0.0,
- "routers_loss": 0.0595436617732048,
+ "routers_loss": 0.06073406711220741,
"skip_count": 1.0,
"step": 808,
"text_loss": 0.18831779062747955
@@ -7693,13 +7693,13 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.26171875,
"learning_rate": 0.0009977149613111236,
- "loss": 0.0494,
+ "loss": 0.0486,
"macro_f1": 0.4400000274181366,
"num_tokens": 1309953.0,
"repeat_count": 1.0,
- "routers_loss": 0.12617000937461853,
+ "routers_loss": 0.11035524308681488,
"skip_count": 4.0,
"step": 810,
"text_loss": 0.7872759699821472
@@ -7712,13 +7712,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009976853087124433,
- "loss": 0.0537,
+ "loss": 0.0536,
"macro_f1": 0.3333333432674408,
"num_tokens": 1313243.0,
"repeat_count": 0.0,
- "routers_loss": 0.021242506802082062,
+ "routers_loss": 0.021804286167025566,
"skip_count": 0.0,
"step": 812,
"text_loss": 0.22349292039871216
@@ -7731,13 +7731,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.318359375,
+ "grad_norm": 0.28125,
"learning_rate": 0.0009976554653998138,
- "loss": 0.0617,
+ "loss": 0.0612,
"macro_f1": 0.31446540355682373,
"num_tokens": 1316165.0,
"repeat_count": 0.0,
- "routers_loss": 0.10387415438890457,
+ "routers_loss": 0.10715524107217789,
"skip_count": 2.0,
"step": 814,
"text_loss": 0.18035532534122467
@@ -7750,13 +7750,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.000997625431384671,
- "loss": 0.0565,
+ "loss": 0.0564,
"macro_f1": 0.3333333432674408,
"num_tokens": 1319206.0,
"repeat_count": 0.0,
- "routers_loss": 0.007816939614713192,
+ "routers_loss": 0.007173649035394192,
"skip_count": 0.0,
"step": 816,
"text_loss": 0.48928648233413696
@@ -7769,13 +7769,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.0009975952066785243,
- "loss": 0.0654,
+ "loss": 0.0655,
"macro_f1": 0.3006536364555359,
"num_tokens": 1322549.0,
"repeat_count": 1.0,
- "routers_loss": 0.22526368498802185,
+ "routers_loss": 0.22308112680912018,
"skip_count": 4.0,
"step": 818,
"text_loss": 0.5211259722709656
@@ -7788,13 +7788,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.0009975647912929557,
- "loss": 0.056,
+ "loss": 0.0564,
"macro_f1": 0.3333333432674408,
"num_tokens": 1325213.0,
"repeat_count": 0.0,
- "routers_loss": 0.010998851619660854,
+ "routers_loss": 0.00998698640614748,
"skip_count": 0.0,
"step": 820,
"text_loss": 0.7117052674293518
@@ -7807,13 +7807,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009975341852396205,
- "loss": 0.0712,
+ "loss": 0.0723,
"macro_f1": 0.32098764181137085,
"num_tokens": 1328383.0,
"repeat_count": 0.0,
- "routers_loss": 0.07115054875612259,
+ "routers_loss": 0.07454588264226913,
"skip_count": 2.0,
"step": 822,
"text_loss": 0.34539610147476196
@@ -7826,13 +7826,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009975033885302469,
- "loss": 0.0611,
+ "loss": 0.0604,
"macro_f1": 0.3333333432674408,
"num_tokens": 1331406.0,
"repeat_count": 0.0,
- "routers_loss": 0.008062695153057575,
+ "routers_loss": 0.009157589636743069,
"skip_count": 0.0,
"step": 824,
"text_loss": 0.7484824657440186
@@ -7845,13 +7845,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.0009974724011766363,
- "loss": 0.0496,
+ "loss": 0.0474,
"macro_f1": 0.3272727429866791,
"num_tokens": 1334410.0,
"repeat_count": 1.0,
- "routers_loss": 0.16666285693645477,
+ "routers_loss": 0.17149391770362854,
"skip_count": 0.0,
"step": 826,
"text_loss": 0.5913820266723633
@@ -7864,13 +7864,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1708984375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009974412231906632,
- "loss": 0.0567,
+ "loss": 0.058,
"macro_f1": 0.32098764181137085,
"num_tokens": 1337653.0,
"repeat_count": 1.0,
- "routers_loss": 0.0908689796924591,
+ "routers_loss": 0.09743282198905945,
"skip_count": 1.0,
"step": 828,
"text_loss": 0.2505693733692169
@@ -7883,13 +7883,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16015625,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009974098545842748,
- "loss": 0.0648,
+ "loss": 0.0638,
"macro_f1": 0.3272727429866791,
"num_tokens": 1340860.0,
"repeat_count": 0.0,
- "routers_loss": 0.04364728182554245,
+ "routers_loss": 0.041490405797958374,
"skip_count": 1.0,
"step": 830,
"text_loss": 0.5585370063781738
@@ -7897,18 +7897,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.906369239800411,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2060546875,
+ "grad_norm": 0.193359375,
"learning_rate": 0.0009973782953694918,
- "loss": 0.0772,
- "macro_f1": 0.3076923191547394,
- "num_tokens": 1344232.0,
+ "loss": 0.0746,
+ "macro_f1": 0.3006536066532135,
+ "num_tokens": 1344232.0,
"repeat_count": 1.0,
- "routers_loss": 0.15315109491348267,
+ "routers_loss": 0.16080693900585175,
"skip_count": 3.0,
"step": 832,
"text_loss": 0.4782734513282776
@@ -7921,13 +7921,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.000997346545558408,
- "loss": 0.0527,
+ "loss": 0.0522,
"macro_f1": 0.3333333432674408,
"num_tokens": 1347667.0,
"repeat_count": 0.0,
- "routers_loss": 0.01342768594622612,
+ "routers_loss": 0.01173500344157219,
"skip_count": 0.0,
"step": 834,
"text_loss": 0.25036177039146423
@@ -7940,13 +7940,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1748046875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009973146051631895,
- "loss": 0.0513,
+ "loss": 0.0522,
"macro_f1": 0.3333333432674408,
"num_tokens": 1350707.0,
"repeat_count": 0.0,
- "routers_loss": 0.01158806961029768,
+ "routers_loss": 0.011477196589112282,
"skip_count": 0.0,
"step": 836,
"text_loss": 0.5482863187789917
@@ -7959,13 +7959,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009972824741960764,
- "loss": 0.0549,
+ "loss": 0.0536,
"macro_f1": 0.3333333432674408,
"num_tokens": 1353704.0,
"repeat_count": 0.0,
- "routers_loss": 0.01255605649203062,
+ "routers_loss": 0.010528896935284138,
"skip_count": 0.0,
"step": 838,
"text_loss": 0.6732596158981323
@@ -7978,13 +7978,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12255859375,
+ "grad_norm": 0.1181640625,
"learning_rate": 0.000997250152669381,
- "loss": 0.0578,
+ "loss": 0.0573,
"macro_f1": 0.3333333432674408,
"num_tokens": 1356608.0,
"repeat_count": 0.0,
- "routers_loss": 0.010225459933280945,
+ "routers_loss": 0.010678744874894619,
"skip_count": 0.0,
"step": 840,
"text_loss": 0.5479338765144348
@@ -7997,13 +7997,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.181640625,
"learning_rate": 0.000997217640595489,
- "loss": 0.0633,
+ "loss": 0.0631,
"macro_f1": 0.3333333432674408,
"num_tokens": 1359809.0,
"repeat_count": 0.0,
- "routers_loss": 0.007837744429707527,
+ "routers_loss": 0.00835978239774704,
"skip_count": 0.0,
"step": 842,
"text_loss": 0.42543259263038635
@@ -8016,13 +8016,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.0009971849379868593,
- "loss": 0.0674,
+ "loss": 0.0653,
"macro_f1": 0.3333333432674408,
"num_tokens": 1362201.0,
"repeat_count": 0.0,
- "routers_loss": 0.008631376549601555,
+ "routers_loss": 0.009930923581123352,
"skip_count": 0.0,
"step": 844,
"text_loss": 0.720462441444397
@@ -8035,13 +8035,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009971520448560235,
- "loss": 0.0612,
+ "loss": 0.0615,
"macro_f1": 0.3272727429866791,
"num_tokens": 1365790.0,
"repeat_count": 0.0,
- "routers_loss": 0.06206027418375015,
+ "routers_loss": 0.06344373524188995,
"skip_count": 1.0,
"step": 846,
"text_loss": 0.8423607349395752
@@ -8049,18 +8049,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 3.9815086586439685,
- "f1_execute": 0.9411765336990356,
+ "f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
- "f1_skip": 0.5,
- "grad_norm": 0.16015625,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.16796875,
"learning_rate": 0.000997118961215586,
- "loss": 0.0678,
- "macro_f1": 0.480392187833786,
+ "loss": 0.0674,
+ "macro_f1": 0.4533333480358124,
"num_tokens": 1368387.0,
"repeat_count": 1.0,
- "routers_loss": 0.1463794708251953,
+ "routers_loss": 0.14688406884670258,
"skip_count": 3.0,
"step": 848,
"text_loss": 0.3933577537536621
@@ -8073,13 +8073,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000997085687078225,
- "loss": 0.052,
+ "loss": 0.0518,
"macro_f1": 0.3333333432674408,
"num_tokens": 1371189.0,
"repeat_count": 0.0,
- "routers_loss": 0.01140492781996727,
+ "routers_loss": 0.009953443892300129,
"skip_count": 0.0,
"step": 850,
"text_loss": 0.41469162702560425
@@ -8092,13 +8092,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.15625,
"learning_rate": 0.0009970522224566909,
- "loss": 0.0563,
+ "loss": 0.0555,
"macro_f1": 0.32098767161369324,
"num_tokens": 1374008.0,
"repeat_count": 0.0,
- "routers_loss": 0.05136030167341232,
+ "routers_loss": 0.048870690166950226,
"skip_count": 1.0,
"step": 852,
"text_loss": 0.613615870475769
@@ -8111,32 +8111,32 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0009970185673638075,
- "loss": 0.0627,
+ "loss": 0.0629,
"macro_f1": 0.32098764181137085,
"num_tokens": 1376662.0,
"repeat_count": 1.0,
- "routers_loss": 0.07274381071329117,
+ "routers_loss": 0.06865929812192917,
"skip_count": 1.0,
"step": 854,
"text_loss": 0.4392736256122589
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 4.01878485471089,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.162109375,
"learning_rate": 0.0009969847218124716,
- "loss": 0.0503,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0506,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1380049.0,
"repeat_count": 0.0,
- "routers_loss": 0.024335317313671112,
+ "routers_loss": 0.02382219396531582,
"skip_count": 1.0,
"step": 856,
"text_loss": 0.19115346670150757
@@ -8149,13 +8149,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009969506858156527,
- "loss": 0.0359,
+ "loss": 0.0344,
"macro_f1": 0.3272727429866791,
"num_tokens": 1383008.0,
"repeat_count": 0.0,
- "routers_loss": 0.046614740043878555,
+ "routers_loss": 0.03907281160354614,
"skip_count": 1.0,
"step": 858,
"text_loss": 0.34842637181282043
@@ -8168,13 +8168,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.12060546875,
"learning_rate": 0.0009969164593863935,
- "loss": 0.0372,
+ "loss": 0.0365,
"macro_f1": 0.3333333432674408,
"num_tokens": 1387051.0,
"repeat_count": 0.0,
- "routers_loss": 0.006380240898579359,
+ "routers_loss": 0.007645803038030863,
"skip_count": 0.0,
"step": 860,
"text_loss": 0.3810436725616455
@@ -8187,13 +8187,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009968820425378098,
- "loss": 0.0473,
+ "loss": 0.0463,
"macro_f1": 0.3272727429866791,
"num_tokens": 1390244.0,
"repeat_count": 1.0,
- "routers_loss": 0.04770716652274132,
+ "routers_loss": 0.04435238987207413,
"skip_count": 0.0,
"step": 862,
"text_loss": 0.34853485226631165
@@ -8206,32 +8206,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3359375,
+ "grad_norm": 0.28515625,
"learning_rate": 0.00099684743528309,
- "loss": 0.0434,
+ "loss": 0.0424,
"macro_f1": 0.3333333432674408,
"num_tokens": 1392976.0,
"repeat_count": 0.0,
- "routers_loss": 0.006983708590269089,
+ "routers_loss": 0.006071661598980427,
"skip_count": 0.0,
"step": 864,
"text_loss": 0.6395178437232971
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 4.065746991488113,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.080078125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.0009968126376354958,
- "loss": 0.0476,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0477,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1396061.0,
"repeat_count": 0.0,
- "routers_loss": 0.046313900500535965,
+ "routers_loss": 0.05011235550045967,
"skip_count": 2.0,
"step": 866,
"text_loss": 0.09103966504335403
@@ -8244,32 +8244,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009967776496083616,
"loss": 0.0509,
"macro_f1": 0.3272727429866791,
"num_tokens": 1398993.0,
"repeat_count": 1.0,
- "routers_loss": 0.0401870422065258,
+ "routers_loss": 0.03979124873876572,
"skip_count": 0.0,
"step": 868,
"text_loss": 0.27257058024406433
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 4.084531846199002,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.14453125,
"learning_rate": 0.000996742471215095,
- "loss": 0.0505,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0516,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1402080.0,
"repeat_count": 0.0,
- "routers_loss": 0.03313451260328293,
+ "routers_loss": 0.030823837965726852,
"skip_count": 2.0,
"step": 870,
"text_loss": 0.7047103047370911
@@ -8282,13 +8282,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009967071024691763,
- "loss": 0.0468,
+ "loss": 0.0461,
"macro_f1": 0.3333333432674408,
"num_tokens": 1404890.0,
"repeat_count": 0.0,
- "routers_loss": 0.010118982754647732,
+ "routers_loss": 0.009721715934574604,
"skip_count": 0.0,
"step": 872,
"text_loss": 0.959106981754303
@@ -8301,13 +8301,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.000996671543384159,
- "loss": 0.0498,
+ "loss": 0.05,
"macro_f1": 0.3333333432674408,
"num_tokens": 1407853.0,
"repeat_count": 0.0,
- "routers_loss": 0.005856200121343136,
+ "routers_loss": 0.006025883834809065,
"skip_count": 0.0,
"step": 874,
"text_loss": 0.47571972012519836
@@ -8320,13 +8320,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.09765625,
"learning_rate": 0.0009966357939736692,
- "loss": 0.0417,
+ "loss": 0.0416,
"macro_f1": 0.3272727429866791,
"num_tokens": 1410723.0,
"repeat_count": 0.0,
- "routers_loss": 0.02768322452902794,
+ "routers_loss": 0.025964925065636635,
"skip_count": 0.0,
"step": 876,
"text_loss": 0.4964611530303955
@@ -8339,13 +8339,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1025390625,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009965998542514065,
- "loss": 0.0419,
+ "loss": 0.0415,
"macro_f1": 0.32098764181137085,
"num_tokens": 1414008.0,
"repeat_count": 0.0,
- "routers_loss": 0.09382032603025436,
+ "routers_loss": 0.09509637206792831,
"skip_count": 2.0,
"step": 878,
"text_loss": 0.621494710445404
@@ -8358,32 +8358,32 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009965637242311427,
- "loss": 0.0466,
+ "loss": 0.0472,
"macro_f1": 0.542222261428833,
"num_tokens": 1417447.0,
"repeat_count": 0.0,
- "routers_loss": 0.026867631822824478,
+ "routers_loss": 0.02520318515598774,
"skip_count": 4.0,
"step": 880,
"text_loss": 0.40209758281707764
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6666666865348816,
- "avg_layers": 24.0,
+ "acc_skip": 0.5,
+ "avg_layers": 25.0,
"epoch": 4.14088641033167,
- "f1_execute": 0.95652174949646,
+ "f1_execute": 0.936170220375061,
"f1_repeat": 0.0,
- "f1_skip": 0.800000011920929,
- "grad_norm": 0.26171875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000996527403926723,
- "loss": 0.0496,
- "macro_f1": 0.5855072736740112,
+ "loss": 0.0495,
+ "macro_f1": 0.5342789888381958,
"num_tokens": 1419905.0,
"repeat_count": 0.0,
- "routers_loss": 0.12731307744979858,
+ "routers_loss": 0.13183781504631042,
"skip_count": 6.0,
"step": 882,
"text_loss": 0.642185389995575
@@ -8396,13 +8396,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009964908933520655,
- "loss": 0.039,
+ "loss": 0.0375,
"macro_f1": 0.3333333432674408,
"num_tokens": 1423436.0,
"repeat_count": 0.0,
- "routers_loss": 0.008483970537781715,
+ "routers_loss": 0.009429510682821274,
"skip_count": 0.0,
"step": 884,
"text_loss": 0.48232755064964294
@@ -8415,13 +8415,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.18359375,
+ "grad_norm": 0.1669921875,
"learning_rate": 0.0009964541925211613,
- "loss": 0.0348,
+ "loss": 0.0349,
"macro_f1": 0.32098764181137085,
"num_tokens": 1426842.0,
"repeat_count": 0.0,
- "routers_loss": 0.07847871631383896,
+ "routers_loss": 0.07629609107971191,
"skip_count": 2.0,
"step": 886,
"text_loss": 0.16620934009552002
@@ -8434,13 +8434,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0009964173014480738,
- "loss": 0.036,
+ "loss": 0.0348,
"macro_f1": 0.5492662787437439,
"num_tokens": 1430430.0,
"repeat_count": 0.0,
- "routers_loss": 0.04574459046125412,
+ "routers_loss": 0.036814019083976746,
"skip_count": 2.0,
"step": 888,
"text_loss": 0.4866008758544922
@@ -8453,13 +8453,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10595703125,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009963802201469398,
- "loss": 0.0485,
+ "loss": 0.0476,
"macro_f1": 0.3333333432674408,
"num_tokens": 1433821.0,
"repeat_count": 0.0,
- "routers_loss": 0.004683624487370253,
+ "routers_loss": 0.0041250260546803474,
"skip_count": 0.0,
"step": 890,
"text_loss": 0.578216552734375
@@ -8472,13 +8472,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "grad_norm": 0.2373046875,
"learning_rate": 0.0009963429486319693,
- "loss": 0.0476,
+ "loss": 0.0463,
"macro_f1": 0.32098764181137085,
"num_tokens": 1436976.0,
"repeat_count": 0.0,
- "routers_loss": 0.06499828398227692,
+ "routers_loss": 0.06213559955358505,
"skip_count": 2.0,
"step": 892,
"text_loss": 0.221701517701149
@@ -8486,18 +8486,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
- "avg_layers": 25.0,
+ "avg_layers": 26.0,
"epoch": 4.197240974464338,
- "f1_execute": 0.9411764740943909,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.4000000059604645,
- "grad_norm": 0.310546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.361328125,
"learning_rate": 0.0009963054869174446,
- "loss": 0.0326,
- "macro_f1": 0.44705885648727417,
+ "loss": 0.0313,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 1440397.0,
"repeat_count": 0.0,
- "routers_loss": 0.08285653591156006,
+ "routers_loss": 0.07532428950071335,
"skip_count": 2.0,
"step": 894,
"text_loss": 0.6922838091850281
@@ -8510,13 +8510,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.0009962678350177209,
- "loss": 0.0497,
+ "loss": 0.0472,
"macro_f1": 0.3272727429866791,
"num_tokens": 1443604.0,
"repeat_count": 0.0,
- "routers_loss": 0.04252336546778679,
+ "routers_loss": 0.0419243648648262,
"skip_count": 1.0,
"step": 896,
"text_loss": 0.22092342376708984
@@ -8524,18 +8524,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 4.216025829175227,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009962299929472268,
- "loss": 0.0349,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.034,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 1446257.0,
"repeat_count": 2.0,
- "routers_loss": 0.126711905002594,
+ "routers_loss": 0.10849297791719437,
"skip_count": 0.0,
"step": 898,
"text_loss": 0.26394811272621155
@@ -8548,13 +8548,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.000996191960720463,
- "loss": 0.0392,
+ "loss": 0.0394,
"macro_f1": 0.3333333432674408,
"num_tokens": 1449669.0,
"repeat_count": 0.0,
- "routers_loss": 0.00955706462264061,
+ "routers_loss": 0.0092767970636487,
"skip_count": 0.0,
"step": 900,
"text_loss": 0.5338577628135681
@@ -8567,13 +8567,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009961537383520042,
- "loss": 0.0377,
+ "loss": 0.0354,
"macro_f1": 0.3272727429866791,
"num_tokens": 1452450.0,
"repeat_count": 1.0,
- "routers_loss": 0.03127318620681763,
+ "routers_loss": 0.02985367365181446,
"skip_count": 0.0,
"step": 902,
"text_loss": 0.5875228047370911
@@ -8586,13 +8586,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.0009961153258564966,
- "loss": 0.0389,
+ "loss": 0.0378,
"macro_f1": 0.3144654333591461,
"num_tokens": 1456909.0,
"repeat_count": 0.0,
- "routers_loss": 0.06743519753217697,
+ "routers_loss": 0.06794842332601547,
"skip_count": 3.0,
"step": 904,
"text_loss": 0.40959444642066956
@@ -8605,13 +8605,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009960767232486604,
- "loss": 0.0477,
+ "loss": 0.0476,
"macro_f1": 0.3333333432674408,
"num_tokens": 1461712.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025313226506114006,
+ "routers_loss": 0.0023562447167932987,
"skip_count": 0.0,
"step": 906,
"text_loss": 0.3932875096797943
@@ -8624,13 +8624,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.08203125,
"learning_rate": 0.000996037930543288,
- "loss": 0.052,
+ "loss": 0.0505,
"macro_f1": 0.3272727429866791,
"num_tokens": 1464817.0,
"repeat_count": 0.0,
- "routers_loss": 0.037147488445043564,
+ "routers_loss": 0.03880339860916138,
"skip_count": 1.0,
"step": 908,
"text_loss": 0.17482402920722961
@@ -8643,13 +8643,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2119140625,
"learning_rate": 0.000995998947755245,
- "loss": 0.0501,
+ "loss": 0.0479,
"macro_f1": 0.3272727429866791,
"num_tokens": 1467810.0,
"repeat_count": 0.0,
- "routers_loss": 0.021232586354017258,
+ "routers_loss": 0.01736828312277794,
"skip_count": 1.0,
"step": 910,
"text_loss": 0.4140470325946808
@@ -8662,13 +8662,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009959597748994695,
- "loss": 0.0759,
+ "loss": 0.0752,
"macro_f1": 0.3333333432674408,
"num_tokens": 1470802.0,
"repeat_count": 0.0,
- "routers_loss": 0.010563847608864307,
+ "routers_loss": 0.011824851855635643,
"skip_count": 0.0,
"step": 912,
"text_loss": 0.7153383493423462
@@ -8681,13 +8681,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009959204119909726,
- "loss": 0.0425,
+ "loss": 0.0421,
"macro_f1": 0.3272727429866791,
"num_tokens": 1474539.0,
"repeat_count": 0.0,
- "routers_loss": 0.0267612524330616,
+ "routers_loss": 0.025456594303250313,
"skip_count": 0.0,
"step": 914,
"text_loss": 0.42812058329582214
@@ -8700,13 +8700,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009958808590448385,
- "loss": 0.0501,
+ "loss": 0.0489,
"macro_f1": 0.3333333432674408,
"num_tokens": 1477552.0,
"repeat_count": 0.0,
- "routers_loss": 0.005838244222104549,
+ "routers_loss": 0.006795851048082113,
"skip_count": 0.0,
"step": 916,
"text_loss": 0.5402814149856567
@@ -8719,13 +8719,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009958411160762234,
- "loss": 0.0383,
+ "loss": 0.039,
"macro_f1": 0.3333333432674408,
"num_tokens": 1482547.0,
"repeat_count": 0.0,
- "routers_loss": 0.014642171561717987,
+ "routers_loss": 0.015615932643413544,
"skip_count": 0.0,
"step": 918,
"text_loss": 0.3836168050765991
@@ -8738,32 +8738,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009958011831003577,
- "loss": 0.0457,
+ "loss": 0.0448,
"macro_f1": 0.3272727429866791,
"num_tokens": 1485807.0,
"repeat_count": 0.0,
- "routers_loss": 0.04119620472192764,
+ "routers_loss": 0.043541423976421356,
"skip_count": 1.0,
"step": 920,
"text_loss": 0.4333936274051666
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 4.328734957440563,
- "f1_execute": 0.943396270275116,
- "f1_repeat": 0.0,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.000995761060132543,
- "loss": 0.0433,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0418,
+ "macro_f1": 0.6538461446762085,
"num_tokens": 1488941.0,
"repeat_count": 1.0,
- "routers_loss": 0.06713195145130157,
+ "routers_loss": 0.05866432189941406,
"skip_count": 2.0,
"step": 922,
"text_loss": 0.4106994867324829
@@ -8776,13 +8776,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009957207471881552,
- "loss": 0.0533,
+ "loss": 0.0531,
"macro_f1": 0.5492662787437439,
"num_tokens": 1492026.0,
"repeat_count": 0.0,
- "routers_loss": 0.024023180827498436,
+ "routers_loss": 0.02714901603758335,
"skip_count": 2.0,
"step": 924,
"text_loss": 0.542091429233551
@@ -8795,13 +8795,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.17578125,
+ "grad_norm": 0.1796875,
"learning_rate": 0.0009956802442826415,
- "loss": 0.0373,
+ "loss": 0.0386,
"macro_f1": 0.3272727429866791,
"num_tokens": 1494543.0,
"repeat_count": 1.0,
- "routers_loss": 0.05399841442704201,
+ "routers_loss": 0.0563737191259861,
"skip_count": 0.0,
"step": 926,
"text_loss": 0.47209203243255615
@@ -8814,13 +8814,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009956395514315235,
- "loss": 0.0488,
+ "loss": 0.0496,
"macro_f1": 0.3272727429866791,
"num_tokens": 1497831.0,
"repeat_count": 1.0,
- "routers_loss": 0.0299264844506979,
+ "routers_loss": 0.03285066783428192,
"skip_count": 0.0,
"step": 928,
"text_loss": 0.6628931164741516
@@ -8833,13 +8833,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009955986686503943,
- "loss": 0.0467,
+ "loss": 0.0466,
"macro_f1": 0.3272727429866791,
"num_tokens": 1501375.0,
"repeat_count": 0.0,
- "routers_loss": 0.023478010669350624,
+ "routers_loss": 0.024297121912240982,
"skip_count": 1.0,
"step": 930,
"text_loss": 0.495676189661026
@@ -8852,13 +8852,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 1.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009955575959549202,
- "loss": 0.0447,
+ "loss": 0.0424,
"macro_f1": 0.7795917987823486,
"num_tokens": 1504363.0,
"repeat_count": 1.0,
- "routers_loss": 0.12116194516420364,
+ "routers_loss": 0.12196464836597443,
"skip_count": 4.0,
"step": 932,
"text_loss": 0.26123273372650146
@@ -8871,13 +8871,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.0009955163333608408,
- "loss": 0.053,
+ "loss": 0.0538,
"macro_f1": 0.3333333432674408,
"num_tokens": 1507178.0,
"repeat_count": 0.0,
- "routers_loss": 0.011879723519086838,
+ "routers_loss": 0.012947078794240952,
"skip_count": 0.0,
"step": 934,
"text_loss": 0.32552677392959595
@@ -8890,13 +8890,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009954748808839674,
- "loss": 0.0373,
+ "loss": 0.0379,
"macro_f1": 0.3333333432674408,
"num_tokens": 1509910.0,
"repeat_count": 0.0,
- "routers_loss": 0.009245929308235645,
+ "routers_loss": 0.008946365676820278,
"skip_count": 0.0,
"step": 936,
"text_loss": 0.533141016960144
@@ -8909,13 +8909,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.140625,
"learning_rate": 0.000995433238540185,
- "loss": 0.0461,
+ "loss": 0.0466,
"macro_f1": 0.6538461446762085,
"num_tokens": 1512826.0,
"repeat_count": 1.0,
- "routers_loss": 0.032464127987623215,
+ "routers_loss": 0.029975678771734238,
"skip_count": 1.0,
"step": 938,
"text_loss": 0.2953577935695648
@@ -8928,13 +8928,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009953914063454512,
- "loss": 0.0515,
+ "loss": 0.0497,
"macro_f1": 0.3144654333591461,
"num_tokens": 1517230.0,
"repeat_count": 1.0,
- "routers_loss": 0.08835392445325851,
+ "routers_loss": 0.0889134630560875,
"skip_count": 2.0,
"step": 940,
"text_loss": 0.5368834733963013
@@ -8947,13 +8947,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.193359375,
"learning_rate": 0.000995349384315796,
- "loss": 0.0405,
+ "loss": 0.0413,
"macro_f1": 0.3333333432674408,
"num_tokens": 1519876.0,
"repeat_count": 0.0,
- "routers_loss": 0.014307246543467045,
+ "routers_loss": 0.013458753935992718,
"skip_count": 0.0,
"step": 942,
"text_loss": 0.2005518227815628
@@ -8966,13 +8966,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.000995307172467322,
- "loss": 0.0449,
+ "loss": 0.0444,
"macro_f1": 0.31446540355682373,
"num_tokens": 1522998.0,
"repeat_count": 1.0,
- "routers_loss": 0.10261563211679459,
+ "routers_loss": 0.08850377053022385,
"skip_count": 1.0,
"step": 944,
"text_loss": 0.227926567196846
@@ -8985,13 +8985,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009952647708162054,
- "loss": 0.0507,
+ "loss": 0.0503,
"macro_f1": 0.3272727429866791,
"num_tokens": 1527100.0,
"repeat_count": 0.0,
- "routers_loss": 0.03316422924399376,
+ "routers_loss": 0.03199794515967369,
"skip_count": 1.0,
"step": 946,
"text_loss": 0.4859686493873596
@@ -9004,13 +9004,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009952221793786942,
- "loss": 0.0352,
+ "loss": 0.0354,
"macro_f1": 0.3333333432674408,
"num_tokens": 1530028.0,
"repeat_count": 0.0,
- "routers_loss": 0.00902469176799059,
+ "routers_loss": 0.006507779937237501,
"skip_count": 0.0,
"step": 948,
"text_loss": 0.6855354905128479
@@ -9023,13 +9023,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.0009951793981711097,
- "loss": 0.0581,
+ "loss": 0.0584,
"macro_f1": 0.6538461446762085,
"num_tokens": 1533254.0,
"repeat_count": 1.0,
- "routers_loss": 0.06710167229175568,
+ "routers_loss": 0.06175103038549423,
"skip_count": 1.0,
"step": 950,
"text_loss": 0.7590400576591492
@@ -9042,13 +9042,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009951364272098458,
- "loss": 0.0294,
+ "loss": 0.0295,
"macro_f1": 0.5492662787437439,
"num_tokens": 1536239.0,
"repeat_count": 0.0,
- "routers_loss": 0.04208769276738167,
+ "routers_loss": 0.03773383051156998,
"skip_count": 2.0,
"step": 952,
"text_loss": 0.669784665107727
@@ -9061,13 +9061,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009950932665113688,
- "loss": 0.0505,
+ "loss": 0.0507,
"macro_f1": 0.32098764181137085,
"num_tokens": 1539682.0,
"repeat_count": 0.0,
- "routers_loss": 0.06530380249023438,
+ "routers_loss": 0.07280613481998444,
"skip_count": 2.0,
"step": 954,
"text_loss": 0.3365570902824402
@@ -9080,13 +9080,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009950499160922184,
- "loss": 0.0545,
+ "loss": 0.0541,
"macro_f1": 0.3333333432674408,
"num_tokens": 1542875.0,
"repeat_count": 0.0,
- "routers_loss": 0.01803453080356121,
+ "routers_loss": 0.01770266517996788,
"skip_count": 0.0,
"step": 956,
"text_loss": 0.0921545997262001
@@ -9099,13 +9099,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.09375,
"learning_rate": 0.000995006375969006,
- "loss": 0.0481,
+ "loss": 0.0473,
"macro_f1": 0.3272727429866791,
"num_tokens": 1547135.0,
"repeat_count": 1.0,
- "routers_loss": 0.08461762219667435,
+ "routers_loss": 0.07672002166509628,
"skip_count": 0.0,
"step": 958,
"text_loss": 0.5887606739997864
@@ -9120,11 +9120,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1376953125,
"learning_rate": 0.0009949626461584165,
- "loss": 0.0441,
+ "loss": 0.043,
"macro_f1": 0.3333333432674408,
"num_tokens": 1550100.0,
"repeat_count": 0.0,
- "routers_loss": 0.007111486047506332,
+ "routers_loss": 0.006247182376682758,
"skip_count": 0.0,
"step": 960,
"text_loss": 0.5777931213378906
@@ -9137,13 +9137,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.119140625,
"learning_rate": 0.0009949187266772076,
- "loss": 0.0361,
+ "loss": 0.0366,
"macro_f1": 0.5492662787437439,
"num_tokens": 1553192.0,
"repeat_count": 0.0,
- "routers_loss": 0.029776185750961304,
+ "routers_loss": 0.030319908633828163,
"skip_count": 2.0,
"step": 962,
"text_loss": 0.2370252162218094
@@ -9156,13 +9156,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009948746175422088,
- "loss": 0.0506,
+ "loss": 0.0511,
"macro_f1": 0.3333333432674408,
"num_tokens": 1556318.0,
"repeat_count": 0.0,
- "routers_loss": 0.007108999416232109,
+ "routers_loss": 0.006004320923238993,
"skip_count": 0.0,
"step": 964,
"text_loss": 0.6271032094955444
@@ -9175,13 +9175,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000994830318770323,
- "loss": 0.0498,
+ "loss": 0.0514,
"macro_f1": 0.3333333432674408,
"num_tokens": 1559195.0,
"repeat_count": 0.0,
- "routers_loss": 0.01126947533339262,
+ "routers_loss": 0.011544366367161274,
"skip_count": 0.0,
"step": 966,
"text_loss": 0.47256720066070557
@@ -9194,13 +9194,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009947858303785255,
- "loss": 0.0366,
+ "loss": 0.0374,
"macro_f1": 0.6603773832321167,
"num_tokens": 1561813.0,
"repeat_count": 1.0,
- "routers_loss": 0.05142999067902565,
+ "routers_loss": 0.05258861929178238,
"skip_count": 1.0,
"step": 968,
"text_loss": 0.7703132629394531
@@ -9213,13 +9213,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.0009947411523838648,
- "loss": 0.0461,
+ "loss": 0.0453,
"macro_f1": 0.3333333432674408,
"num_tokens": 1564634.0,
"repeat_count": 0.0,
- "routers_loss": 0.010770819149911404,
+ "routers_loss": 0.011216280050575733,
"skip_count": 0.0,
"step": 970,
"text_loss": 0.4666804075241089
@@ -9232,13 +9232,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009946962848034608,
- "loss": 0.0692,
+ "loss": 0.0696,
"macro_f1": 0.3333333432674408,
"num_tokens": 1567959.0,
"repeat_count": 0.0,
- "routers_loss": 0.008775795809924603,
+ "routers_loss": 0.009387624450027943,
"skip_count": 0.0,
"step": 972,
"text_loss": 0.4067264199256897
@@ -9251,13 +9251,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.203125,
"learning_rate": 0.0009946512276545075,
- "loss": 0.0403,
+ "loss": 0.0397,
"macro_f1": 0.3272727429866791,
"num_tokens": 1571221.0,
"repeat_count": 1.0,
- "routers_loss": 0.05100395902991295,
+ "routers_loss": 0.041713520884513855,
"skip_count": 0.0,
"step": 974,
"text_loss": 0.5242366194725037
@@ -9270,13 +9270,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.228515625,
"learning_rate": 0.0009946059809542705,
- "loss": 0.0503,
+ "loss": 0.0487,
"macro_f1": 0.7644445300102234,
"num_tokens": 1575033.0,
"repeat_count": 2.0,
- "routers_loss": 0.06653711199760437,
+ "routers_loss": 0.05748331546783447,
"skip_count": 2.0,
"step": 976,
"text_loss": 0.5704690217971802
@@ -9284,18 +9284,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 4.591722923393014,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0009945605447200887,
- "loss": 0.0435,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0445,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1579050.0,
"repeat_count": 0.0,
- "routers_loss": 0.009865665808320045,
+ "routers_loss": 0.016765203326940536,
"skip_count": 0.0,
"step": 978,
"text_loss": 0.4804173707962036
@@ -9308,13 +9308,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.0009945149189693732,
- "loss": 0.0399,
+ "loss": 0.0406,
"macro_f1": 0.5492662787437439,
"num_tokens": 1582967.0,
"repeat_count": 0.0,
- "routers_loss": 0.021175632253289223,
+ "routers_loss": 0.021518222987651825,
"skip_count": 2.0,
"step": 980,
"text_loss": 0.4138598144054413
@@ -9327,32 +9327,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0009944691037196078,
- "loss": 0.0472,
+ "loss": 0.0456,
"macro_f1": 0.3333333432674408,
"num_tokens": 1586282.0,
"repeat_count": 0.0,
- "routers_loss": 0.011803832836449146,
+ "routers_loss": 0.012246460653841496,
"skip_count": 0.0,
"step": 982,
"text_loss": 0.22561736404895782
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 0.5,
"acc_skip": 0.800000011920929,
- "avg_layers": 23.0,
+ "avg_layers": 24.0,
"epoch": 4.6199002054593485,
- "f1_execute": 0.9090908765792847,
- "f1_repeat": 0.0,
+ "f1_execute": 0.930232584476471,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 0.8000000715255737,
- "grad_norm": 0.142578125,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009944230989883491,
- "loss": 0.0467,
- "macro_f1": 0.5696970224380493,
+ "loss": 0.0456,
+ "macro_f1": 0.7989664077758789,
"num_tokens": 1589279.0,
"repeat_count": 2.0,
- "routers_loss": 0.08856551349163055,
+ "routers_loss": 0.09344895929098129,
"skip_count": 5.0,
"step": 984,
"text_loss": 0.4416656494140625
@@ -9365,13 +9365,13 @@
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.111328125,
"learning_rate": 0.0009943769047932264,
- "loss": 0.0413,
+ "loss": 0.0404,
"macro_f1": 0.5359477400779724,
"num_tokens": 1592398.0,
"repeat_count": 2.0,
- "routers_loss": 0.08593414723873138,
+ "routers_loss": 0.08916857838630676,
"skip_count": 2.0,
"step": 986,
"text_loss": 0.5536438822746277
@@ -9384,13 +9384,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000994330521151941,
- "loss": 0.0399,
+ "loss": 0.039,
"macro_f1": 0.32098764181137085,
"num_tokens": 1596213.0,
"repeat_count": 1.0,
- "routers_loss": 0.07049509882926941,
+ "routers_loss": 0.06114347651600838,
"skip_count": 1.0,
"step": 988,
"text_loss": 0.5835405588150024
@@ -9403,13 +9403,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.1953125,
"learning_rate": 0.000994283948082267,
- "loss": 0.0595,
+ "loss": 0.0573,
"macro_f1": 0.3333333432674408,
"num_tokens": 1598827.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019258069805800915,
+ "routers_loss": 0.0017335431184619665,
"skip_count": 0.0,
"step": 990,
"text_loss": 0.5857380032539368
@@ -9422,13 +9422,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009942371856020522,
- "loss": 0.0335,
+ "loss": 0.0341,
"macro_f1": 0.3333333432674408,
"num_tokens": 1602915.0,
"repeat_count": 0.0,
- "routers_loss": 0.014094089157879353,
+ "routers_loss": 0.014606470242142677,
"skip_count": 0.0,
"step": 992,
"text_loss": 0.6939892768859863
@@ -9436,18 +9436,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 31.0,
"epoch": 4.666862342236572,
- "f1_execute": 0.9583333134651184,
+ "f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009941902337292155,
- "loss": 0.0603,
- "macro_f1": 0.6527777910232544,
+ "loss": 0.06,
+ "macro_f1": 0.6598639488220215,
"num_tokens": 1605776.0,
"repeat_count": 3.0,
- "routers_loss": 0.06360147893428802,
+ "routers_loss": 0.06297315657138824,
"skip_count": 1.0,
"step": 994,
"text_loss": 0.37616831064224243
@@ -9460,13 +9460,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009941430924817487,
- "loss": 0.0573,
+ "loss": 0.0572,
"macro_f1": 0.5492662787437439,
"num_tokens": 1609856.0,
"repeat_count": 0.0,
- "routers_loss": 0.0326208658516407,
+ "routers_loss": 0.03297794610261917,
"skip_count": 2.0,
"step": 996,
"text_loss": 0.2098303586244583
@@ -9479,13 +9479,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09912109375,
+ "grad_norm": 0.10107421875,
"learning_rate": 0.000994095761877717,
- "loss": 0.0502,
+ "loss": 0.0499,
"macro_f1": 0.3333333432674408,
"num_tokens": 1612904.0,
"repeat_count": 0.0,
- "routers_loss": 0.012660752050578594,
+ "routers_loss": 0.012901155278086662,
"skip_count": 0.0,
"step": 998,
"text_loss": 0.20103533565998077
@@ -9498,13 +9498,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.265625,
+ "grad_norm": 0.259765625,
"learning_rate": 0.000994048241935257,
- "loss": 0.0537,
+ "loss": 0.0535,
"macro_f1": 0.3272727429866791,
"num_tokens": 1615540.0,
"repeat_count": 0.0,
- "routers_loss": 0.021756287664175034,
+ "routers_loss": 0.020434845238924026,
"skip_count": 0.0,
"step": 1000,
"text_loss": 0.32709044218063354
@@ -9512,37 +9512,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 4.70443205165835,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1669921875,
"learning_rate": 0.0009940005326725789,
- "loss": 0.0447,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.0453,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 1618786.0,
"repeat_count": 0.0,
- "routers_loss": 0.07292548567056656,
+ "routers_loss": 0.07831378281116486,
"skip_count": 2.0,
"step": 1002,
"text_loss": 0.5789632797241211
},
{
- "acc_repeat": 0.5,
+ "acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 4.713824479013795,
- "f1_execute": 0.9811320900917053,
- "f1_repeat": 0.6666666865348816,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1787109375,
+ "grad_norm": 0.21875,
"learning_rate": 0.0009939526341079647,
- "loss": 0.0505,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0511,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 1621736.0,
"repeat_count": 2.0,
- "routers_loss": 0.03397528454661369,
+ "routers_loss": 0.04863874986767769,
"skip_count": 0.0,
"step": 1004,
"text_loss": 0.6128849387168884
@@ -9555,13 +9555,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009939045462597693,
- "loss": 0.0544,
+ "loss": 0.0538,
"macro_f1": 0.3333333432674408,
"num_tokens": 1624649.0,
"repeat_count": 0.0,
- "routers_loss": 0.005987613927572966,
+ "routers_loss": 0.00677989237010479,
"skip_count": 0.0,
"step": 1006,
"text_loss": 0.6168264150619507
@@ -9574,13 +9574,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009938562691464202,
- "loss": 0.0522,
+ "loss": 0.0524,
"macro_f1": 0.3333333432674408,
"num_tokens": 1627700.0,
"repeat_count": 0.0,
- "routers_loss": 0.021656684577465057,
+ "routers_loss": 0.019490402191877365,
"skip_count": 0.0,
"step": 1008,
"text_loss": 0.17463822662830353
@@ -9593,32 +9593,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.000993807802786417,
- "loss": 0.0487,
+ "loss": 0.0475,
"macro_f1": 0.3333333432674408,
"num_tokens": 1630714.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014992234064266086,
+ "routers_loss": 0.0019022391643375158,
"skip_count": 0.0,
"step": 1010,
"text_loss": 0.5675593018531799
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.5,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 4.751394188435574,
- "f1_execute": 0.9411764740943909,
- "f1_repeat": 0.0,
+ "f1_execute": 0.9599999785423279,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.158203125,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009937591471983322,
- "loss": 0.0491,
- "macro_f1": 0.5359477400779724,
+ "loss": 0.0501,
+ "macro_f1": 0.7644444704055786,
"num_tokens": 1633770.0,
"repeat_count": 1.0,
- "routers_loss": 0.03448791801929474,
+ "routers_loss": 0.042485643178224564,
"skip_count": 2.0,
"step": 1012,
"text_loss": 0.42387229204177856
@@ -9631,13 +9631,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0009937103024008109,
- "loss": 0.0541,
+ "loss": 0.0545,
"macro_f1": 0.3272727429866791,
"num_tokens": 1637120.0,
"repeat_count": 0.0,
- "routers_loss": 0.08285929262638092,
+ "routers_loss": 0.09427817165851593,
"skip_count": 1.0,
"step": 1014,
"text_loss": 0.49511051177978516
@@ -9650,13 +9650,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.125,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009936612684125702,
- "loss": 0.0515,
+ "loss": 0.0503,
"macro_f1": 0.3333333432674408,
"num_tokens": 1640165.0,
"repeat_count": 0.0,
- "routers_loss": 0.00486504752188921,
+ "routers_loss": 0.005106127820909023,
"skip_count": 0.0,
"step": 1016,
"text_loss": 0.5398799180984497
@@ -9669,13 +9669,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.2734375,
"learning_rate": 0.0009936120452524004,
- "loss": 0.051,
+ "loss": 0.0506,
"macro_f1": 0.3333333432674408,
"num_tokens": 1643251.0,
"repeat_count": 0.0,
- "routers_loss": 0.017805909737944603,
+ "routers_loss": 0.016914300620555878,
"skip_count": 0.0,
"step": 1018,
"text_loss": 0.20882178843021393
@@ -9688,13 +9688,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009935626329391637,
- "loss": 0.0547,
+ "loss": 0.0537,
"macro_f1": 0.32098764181137085,
"num_tokens": 1646560.0,
"repeat_count": 0.0,
- "routers_loss": 0.12958799302577972,
+ "routers_loss": 0.13481520116329193,
"skip_count": 2.0,
"step": 1020,
"text_loss": 0.5719883441925049
@@ -9707,13 +9707,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009935130314917948,
- "loss": 0.0595,
+ "loss": 0.0602,
"macro_f1": 0.5492662787437439,
"num_tokens": 1649538.0,
"repeat_count": 0.0,
- "routers_loss": 0.07447081059217453,
+ "routers_loss": 0.07700438797473907,
"skip_count": 2.0,
"step": 1022,
"text_loss": 0.1303367167711258
@@ -9726,13 +9726,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009934632409293015,
- "loss": 0.0619,
+ "loss": 0.0611,
"macro_f1": 0.32098764181137085,
"num_tokens": 1652397.0,
"repeat_count": 1.0,
- "routers_loss": 0.12529553472995758,
+ "routers_loss": 0.11416907608509064,
"skip_count": 1.0,
"step": 1024,
"text_loss": 0.24076920747756958
@@ -9745,13 +9745,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.306640625,
"learning_rate": 0.0009934132612707631,
- "loss": 0.0491,
+ "loss": 0.0507,
"macro_f1": 0.31446540355682373,
"num_tokens": 1654938.0,
"repeat_count": 0.0,
- "routers_loss": 0.08664281666278839,
+ "routers_loss": 0.09484589844942093,
"skip_count": 2.0,
"step": 1026,
"text_loss": 0.1652517318725586
@@ -9764,13 +9764,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009933630925353324,
- "loss": 0.0394,
+ "loss": 0.0395,
"macro_f1": 0.3333333432674408,
"num_tokens": 1658536.0,
"repeat_count": 0.0,
- "routers_loss": 0.0067965323105454445,
+ "routers_loss": 0.00741987070068717,
"skip_count": 0.0,
"step": 1028,
"text_loss": 0.49296700954437256
@@ -9783,13 +9783,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1845703125,
"learning_rate": 0.0009933127347422337,
- "loss": 0.0607,
+ "loss": 0.0602,
"macro_f1": 0.32098764181137085,
"num_tokens": 1661446.0,
"repeat_count": 0.0,
- "routers_loss": 0.08319470286369324,
+ "routers_loss": 0.08399344235658646,
"skip_count": 2.0,
"step": 1030,
"text_loss": 0.22363591194152832
@@ -9802,13 +9802,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.158203125,
"learning_rate": 0.0009932621879107648,
- "loss": 0.0476,
+ "loss": 0.0475,
"macro_f1": 0.3333333432674408,
"num_tokens": 1664612.0,
"repeat_count": 0.0,
- "routers_loss": 0.002826537238433957,
+ "routers_loss": 0.0031781597062945366,
"skip_count": 0.0,
"step": 1032,
"text_loss": 0.36083245277404785
@@ -9823,11 +9823,11 @@
"f1_skip": 0.0,
"grad_norm": 0.2275390625,
"learning_rate": 0.000993211452060295,
- "loss": 0.0431,
+ "loss": 0.042,
"macro_f1": 0.3272727429866791,
"num_tokens": 1667467.0,
"repeat_count": 0.0,
- "routers_loss": 0.03491095453500748,
+ "routers_loss": 0.03595469892024994,
"skip_count": 1.0,
"step": 1034,
"text_loss": 0.16372856497764587
@@ -9840,13 +9840,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.173828125,
+ "grad_norm": 0.189453125,
"learning_rate": 0.000993160527210266,
- "loss": 0.0616,
+ "loss": 0.061,
"macro_f1": 0.3144654333591461,
"num_tokens": 1670675.0,
"repeat_count": 3.0,
- "routers_loss": 0.1828247457742691,
+ "routers_loss": 0.1597205102443695,
"skip_count": 0.0,
"step": 1036,
"text_loss": 0.6049913763999939
@@ -9859,13 +9859,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2099609375,
+ "grad_norm": 0.2197265625,
"learning_rate": 0.000993109413380193,
- "loss": 0.0563,
+ "loss": 0.0562,
"macro_f1": 0.3333333432674408,
"num_tokens": 1673477.0,
"repeat_count": 0.0,
- "routers_loss": 0.010931054130196571,
+ "routers_loss": 0.009756010957062244,
"skip_count": 0.0,
"step": 1038,
"text_loss": 0.7034620642662048
@@ -9878,13 +9878,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.158203125,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.0009930581105896624,
- "loss": 0.0569,
+ "loss": 0.0559,
"macro_f1": 0.3272727429866791,
"num_tokens": 1676809.0,
"repeat_count": 0.0,
- "routers_loss": 0.023222090676426888,
+ "routers_loss": 0.020718922838568687,
"skip_count": 0.0,
"step": 1040,
"text_loss": 0.2814720571041107
@@ -9897,13 +9897,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.0009930066188583338,
- "loss": 0.0453,
+ "loss": 0.0445,
"macro_f1": 0.32098764181137085,
"num_tokens": 1679398.0,
"repeat_count": 1.0,
- "routers_loss": 0.07085686922073364,
+ "routers_loss": 0.04755603149533272,
"skip_count": 1.0,
"step": 1042,
"text_loss": 0.5445759296417236
@@ -9916,13 +9916,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.126953125,
"learning_rate": 0.0009929549382059388,
- "loss": 0.0515,
+ "loss": 0.0509,
"macro_f1": 0.3333333432674408,
"num_tokens": 1682269.0,
"repeat_count": 0.0,
- "routers_loss": 0.010158216580748558,
+ "routers_loss": 0.01040949858725071,
"skip_count": 0.0,
"step": 1044,
"text_loss": 0.2876914143562317
@@ -9935,13 +9935,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009929030686522816,
- "loss": 0.0372,
+ "loss": 0.0363,
"macro_f1": 0.3333333432674408,
"num_tokens": 1685428.0,
"repeat_count": 0.0,
- "routers_loss": 0.007876895368099213,
+ "routers_loss": 0.008158888667821884,
"skip_count": 0.0,
"step": 1046,
"text_loss": 0.49053525924682617
@@ -9954,13 +9954,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009928510102172386,
- "loss": 0.0501,
+ "loss": 0.0498,
"macro_f1": 0.3333333432674408,
"num_tokens": 1688252.0,
"repeat_count": 0.0,
- "routers_loss": 0.004859173204749823,
+ "routers_loss": 0.005102572031319141,
"skip_count": 0.0,
"step": 1048,
"text_loss": 0.5274341106414795
@@ -9973,13 +9973,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.17578125,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0009927987629207587,
- "loss": 0.0582,
+ "loss": 0.0564,
"macro_f1": 0.3333333432674408,
"num_tokens": 1691289.0,
"repeat_count": 0.0,
- "routers_loss": 0.01798083633184433,
+ "routers_loss": 0.016768503934144974,
"skip_count": 0.0,
"step": 1050,
"text_loss": 0.9935035109519958
@@ -9987,18 +9987,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 4.939242735544467,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009927463267828634,
"loss": 0.0488,
- "macro_f1": 0.3272727429866791,
+ "macro_f1": 0.3333333432674408,
"num_tokens": 1694148.0,
"repeat_count": 0.0,
- "routers_loss": 0.014295363798737526,
+ "routers_loss": 0.010905829258263111,
"skip_count": 0.0,
"step": 1052,
"text_loss": 0.20895758271217346
@@ -10011,13 +10011,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.000992693701823646,
- "loss": 0.0635,
+ "loss": 0.0624,
"macro_f1": 0.3272727429866791,
"num_tokens": 1698543.0,
"repeat_count": 1.0,
- "routers_loss": 0.1038367822766304,
+ "routers_loss": 0.10533971339464188,
"skip_count": 0.0,
"step": 1054,
"text_loss": 0.5776236653327942
@@ -10030,13 +10030,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.255859375,
"learning_rate": 0.0009926408880632726,
- "loss": 0.057,
+ "loss": 0.0556,
"macro_f1": 0.3272727429866791,
"num_tokens": 1702460.0,
"repeat_count": 0.0,
- "routers_loss": 0.029780643060803413,
+ "routers_loss": 0.026313411071896553,
"skip_count": 1.0,
"step": 1056,
"text_loss": 0.34990596771240234
@@ -10049,13 +10049,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10107421875,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0009925878855219818,
- "loss": 0.0398,
+ "loss": 0.0391,
"macro_f1": 0.3333333432674408,
"num_tokens": 1705686.0,
"repeat_count": 0.0,
- "routers_loss": 0.008537676185369492,
+ "routers_loss": 0.007763393223285675,
"skip_count": 0.0,
"step": 1058,
"text_loss": 0.4980163276195526
@@ -10068,13 +10068,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.171875,
+ "grad_norm": 0.177734375,
"learning_rate": 0.000992534694220084,
- "loss": 0.0617,
+ "loss": 0.0613,
"macro_f1": 0.3272727429866791,
"num_tokens": 1708739.0,
"repeat_count": 0.0,
- "routers_loss": 0.03966755419969559,
+ "routers_loss": 0.03998444974422455,
"skip_count": 1.0,
"step": 1060,
"text_loss": 0.29092350602149963
@@ -10087,13 +10087,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1484375,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.000992481314177962,
- "loss": 0.0311,
+ "loss": 0.0312,
"macro_f1": 0.32098764181137085,
"num_tokens": 1711903.0,
"repeat_count": 1.0,
- "routers_loss": 0.06651833653450012,
+ "routers_loss": 0.06966045498847961,
"skip_count": 1.0,
"step": 1062,
"text_loss": 0.6267179250717163
@@ -10106,13 +10106,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2431640625,
+ "grad_norm": 0.244140625,
"learning_rate": 0.0009924277454160717,
- "loss": 0.0557,
+ "loss": 0.0548,
"macro_f1": 0.3272727429866791,
"num_tokens": 1715974.0,
"repeat_count": 0.0,
- "routers_loss": 0.05130369961261749,
+ "routers_loss": 0.05536063387989998,
"skip_count": 1.0,
"step": 1064,
"text_loss": 0.5813798904418945
@@ -10125,13 +10125,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009923739879549402,
- "loss": 0.0435,
+ "loss": 0.0423,
"macro_f1": 0.3333333432674408,
"num_tokens": 1718828.0,
"repeat_count": 0.0,
- "routers_loss": 0.020534176379442215,
+ "routers_loss": 0.020993782207369804,
"skip_count": 0.0,
"step": 1066,
"text_loss": 0.22665327787399292
@@ -10144,13 +10144,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009923200418151677,
- "loss": 0.0305,
+ "loss": 0.0301,
"macro_f1": 0.3333333432674408,
"num_tokens": 1722419.0,
"repeat_count": 0.0,
- "routers_loss": 0.007514918688684702,
+ "routers_loss": 0.007351701147854328,
"skip_count": 0.0,
"step": 1068,
"text_loss": 0.5796169638633728
@@ -10163,13 +10163,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009922659070174264,
- "loss": 0.0461,
+ "loss": 0.0452,
"macro_f1": 0.3272727429866791,
"num_tokens": 1725663.0,
"repeat_count": 1.0,
- "routers_loss": 0.024598751217126846,
+ "routers_loss": 0.026033315807580948,
"skip_count": 0.0,
"step": 1070,
"text_loss": 0.25742828845977783
@@ -10182,32 +10182,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009922115835824612,
- "loss": 0.0408,
+ "loss": 0.041,
"macro_f1": 0.3333333432674408,
"num_tokens": 1729239.0,
"repeat_count": 0.0,
- "routers_loss": 0.011866633780300617,
+ "routers_loss": 0.0118600158020854,
"skip_count": 0.0,
"step": 1072,
"text_loss": 0.21630282700061798
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 5.042265923099501,
- "f1_execute": 0.9818181991577148,
- "f1_repeat": 0.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009921570715310884,
- "loss": 0.036,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0364,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 1732507.0,
"repeat_count": 1.0,
- "routers_loss": 0.01755746826529503,
+ "routers_loss": 0.016118815168738365,
"skip_count": 0.0,
"step": 1074,
"text_loss": 0.5639925003051758
@@ -10220,13 +10220,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009921023708841974,
- "loss": 0.0415,
+ "loss": 0.0407,
"macro_f1": 0.3333333432674408,
"num_tokens": 1736182.0,
"repeat_count": 0.0,
- "routers_loss": 0.003976983483880758,
+ "routers_loss": 0.004275390412658453,
"skip_count": 0.0,
"step": 1076,
"text_loss": 0.5758615136146545
@@ -10239,13 +10239,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009920474816627496,
- "loss": 0.0378,
+ "loss": 0.037,
"macro_f1": 0.3333333432674408,
"num_tokens": 1739559.0,
"repeat_count": 0.0,
- "routers_loss": 0.013548235408961773,
+ "routers_loss": 0.01299292128533125,
"skip_count": 0.0,
"step": 1078,
"text_loss": 0.18221625685691833
@@ -10258,13 +10258,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009919924038877788,
"loss": 0.0343,
"macro_f1": 0.32098764181137085,
"num_tokens": 1742890.0,
"repeat_count": 0.0,
- "routers_loss": 0.03923165053129196,
+ "routers_loss": 0.038295745849609375,
"skip_count": 2.0,
"step": 1080,
"text_loss": 0.17354349792003632
@@ -10277,13 +10277,13 @@
"f1_execute": 0.9583333134651184,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009919371375803905,
- "loss": 0.0464,
+ "loss": 0.0455,
"macro_f1": 0.8194444179534912,
"num_tokens": 1746433.0,
"repeat_count": 2.0,
- "routers_loss": 0.046429626643657684,
+ "routers_loss": 0.04052971675992012,
"skip_count": 3.0,
"step": 1082,
"text_loss": 0.2250112146139145
@@ -10296,13 +10296,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1025390625,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009918816827617632,
- "loss": 0.0346,
+ "loss": 0.0353,
"macro_f1": 0.3333333432674408,
"num_tokens": 1750802.0,
"repeat_count": 0.0,
- "routers_loss": 0.008998732082545757,
+ "routers_loss": 0.009114136919379234,
"skip_count": 0.0,
"step": 1084,
"text_loss": 0.2526719272136688
@@ -10315,13 +10315,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.000991826039453147,
- "loss": 0.0386,
+ "loss": 0.0392,
"macro_f1": 0.3333333432674408,
"num_tokens": 1754272.0,
"repeat_count": 0.0,
- "routers_loss": 0.005173585377633572,
+ "routers_loss": 0.004904678091406822,
"skip_count": 0.0,
"step": 1086,
"text_loss": 0.7308789491653442
@@ -10334,13 +10334,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.138671875,
"learning_rate": 0.000991770207675865,
- "loss": 0.0308,
+ "loss": 0.0327,
"macro_f1": 0.6666666865348816,
"num_tokens": 1757231.0,
"repeat_count": 0.0,
- "routers_loss": 0.024098891764879227,
+ "routers_loss": 0.02129189297556877,
"skip_count": 2.0,
"step": 1088,
"text_loss": 0.21764220297336578
@@ -10353,13 +10353,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009917141874513113,
"loss": 0.0315,
"macro_f1": 0.3333333432674408,
"num_tokens": 1760003.0,
"repeat_count": 0.0,
- "routers_loss": 0.014002764597535133,
+ "routers_loss": 0.01310618408024311,
"skip_count": 0.0,
"step": 1090,
"text_loss": 0.33892181515693665
@@ -10372,32 +10372,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009916579788009537,
- "loss": 0.0462,
+ "loss": 0.0457,
"macro_f1": 0.5492662787437439,
"num_tokens": 1763052.0,
"repeat_count": 0.0,
- "routers_loss": 0.017871137708425522,
+ "routers_loss": 0.02059309557080269,
"skip_count": 2.0,
"step": 1092,
"text_loss": 0.6551769375801086
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.136190196653947,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1044921875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10546875,
"learning_rate": 0.0009916015817463312,
"loss": 0.0385,
- "macro_f1": 0.32098764181137085,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1766655.0,
"repeat_count": 0.0,
- "routers_loss": 0.033123619854450226,
+ "routers_loss": 0.0274797435849905,
"skip_count": 2.0,
"step": 1094,
"text_loss": 0.3984372019767761
@@ -10410,13 +10410,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.000991544996309055,
- "loss": 0.0267,
+ "loss": 0.0271,
"macro_f1": 0.3333333432674408,
"num_tokens": 1769997.0,
"repeat_count": 0.0,
- "routers_loss": 0.01279227901250124,
+ "routers_loss": 0.01437368243932724,
"skip_count": 0.0,
"step": 1096,
"text_loss": 0.4203338921070099
@@ -10429,13 +10429,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.000991488222510809,
- "loss": 0.0295,
+ "loss": 0.0292,
"macro_f1": 0.3333333432674408,
"num_tokens": 1773130.0,
"repeat_count": 0.0,
- "routers_loss": 0.001354650012217462,
+ "routers_loss": 0.001382062560878694,
"skip_count": 0.0,
"step": 1098,
"text_loss": 0.43132516741752625
@@ -10448,13 +10448,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.123046875,
"learning_rate": 0.000991431260373349,
- "loss": 0.0326,
+ "loss": 0.0329,
"macro_f1": 0.3144654333591461,
"num_tokens": 1775682.0,
"repeat_count": 1.0,
- "routers_loss": 0.1097714751958847,
+ "routers_loss": 0.1115434318780899,
"skip_count": 2.0,
"step": 1100,
"text_loss": 0.3218227028846741
@@ -10467,13 +10467,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.111328125,
"learning_rate": 0.000991374109918503,
- "loss": 0.0187,
+ "loss": 0.0185,
"macro_f1": 0.3333333432674408,
"num_tokens": 1778407.0,
"repeat_count": 0.0,
- "routers_loss": 0.009649592451751232,
+ "routers_loss": 0.009529678151011467,
"skip_count": 0.0,
"step": 1102,
"text_loss": 0.17183731496334076
@@ -10486,13 +10486,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.11083984375,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.000991316771168171,
- "loss": 0.0447,
+ "loss": 0.044,
"macro_f1": 0.5492662787437439,
"num_tokens": 1781518.0,
"repeat_count": 0.0,
- "routers_loss": 0.020858706906437874,
+ "routers_loss": 0.018668074160814285,
"skip_count": 2.0,
"step": 1104,
"text_loss": 1.1324785947799683
@@ -10505,13 +10505,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.125,
"learning_rate": 0.0009912592441443258,
- "loss": 0.0428,
+ "loss": 0.0411,
"macro_f1": 0.3272727429866791,
"num_tokens": 1784878.0,
"repeat_count": 0.0,
- "routers_loss": 0.048101235181093216,
+ "routers_loss": 0.04145100712776184,
"skip_count": 1.0,
"step": 1106,
"text_loss": 0.6082063317298889
@@ -10524,13 +10524,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009912015288690112,
- "loss": 0.0435,
+ "loss": 0.0421,
"macro_f1": 0.3272727429866791,
"num_tokens": 1788978.0,
"repeat_count": 0.0,
- "routers_loss": 0.02875671721994877,
+ "routers_loss": 0.021450644358992577,
"skip_count": 1.0,
"step": 1108,
"text_loss": 0.5597621202468872
@@ -10543,13 +10543,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08349609375,
+ "grad_norm": 0.083984375,
"learning_rate": 0.0009911436253643444,
- "loss": 0.0247,
+ "loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 1792321.0,
"repeat_count": 0.0,
- "routers_loss": 0.019005145877599716,
+ "routers_loss": 0.017405325546860695,
"skip_count": 0.0,
"step": 1110,
"text_loss": 0.2560598850250244
@@ -10562,13 +10562,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.0009910855336525137,
- "loss": 0.0393,
+ "loss": 0.0383,
"macro_f1": 0.3333333432674408,
"num_tokens": 1795182.0,
"repeat_count": 0.0,
- "routers_loss": 0.007238700054585934,
+ "routers_loss": 0.007162237539887428,
"skip_count": 0.0,
"step": 1112,
"text_loss": 0.3438240587711334
@@ -10581,13 +10581,13 @@
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.125,
+ "grad_norm": 0.115234375,
"learning_rate": 0.00099102725375578,
"loss": 0.0326,
"macro_f1": 0.480392187833786,
"num_tokens": 1798987.0,
"repeat_count": 1.0,
- "routers_loss": 0.12206140905618668,
+ "routers_loss": 0.11149197816848755,
"skip_count": 3.0,
"step": 1114,
"text_loss": 0.20455503463745117
@@ -10595,18 +10595,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 5.239506897563839,
- "f1_execute": 0.8799999952316284,
+ "f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.0009909687856964767,
- "loss": 0.0366,
- "macro_f1": 0.29333335161209106,
+ "loss": 0.035,
+ "macro_f1": 0.3006536364555359,
"num_tokens": 1802064.0,
"repeat_count": 2.0,
- "routers_loss": 0.15721899271011353,
+ "routers_loss": 0.12679415941238403,
"skip_count": 3.0,
"step": 1116,
"text_loss": 0.11996729671955109
@@ -10619,32 +10619,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.125,
+ "grad_norm": 0.12451171875,
"learning_rate": 0.0009909101294970082,
- "loss": 0.0366,
+ "loss": 0.0365,
"macro_f1": 0.5492662787437439,
"num_tokens": 1805412.0,
"repeat_count": 0.0,
- "routers_loss": 0.05058665946125984,
+ "routers_loss": 0.05108053982257843,
"skip_count": 2.0,
"step": 1118,
"text_loss": 0.13224145770072937
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 5.258291752274729,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.123046875,
"learning_rate": 0.0009908512851798522,
- "loss": 0.0454,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0455,
+ "macro_f1": 0.6603773832321167,
"num_tokens": 1808196.0,
"repeat_count": 1.0,
- "routers_loss": 0.023021472617983818,
+ "routers_loss": 0.02131766639649868,
"skip_count": 1.0,
"step": 1120,
"text_loss": 0.7824069261550903
@@ -10657,13 +10657,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1435546875,
+ "grad_norm": 0.138671875,
"learning_rate": 0.0009907922527675576,
- "loss": 0.0409,
+ "loss": 0.0405,
"macro_f1": 0.3333333432674408,
"num_tokens": 1811622.0,
"repeat_count": 0.0,
- "routers_loss": 0.006660689599812031,
+ "routers_loss": 0.006226244382560253,
"skip_count": 0.0,
"step": 1122,
"text_loss": 0.5419743061065674
@@ -10676,13 +10676,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.12890625,
"learning_rate": 0.000990733032282746,
- "loss": 0.0547,
+ "loss": 0.0535,
"macro_f1": 0.5492662787437439,
"num_tokens": 1814628.0,
"repeat_count": 0.0,
- "routers_loss": 0.031727343797683716,
+ "routers_loss": 0.03088250942528248,
"skip_count": 2.0,
"step": 1124,
"text_loss": 0.37100958824157715
@@ -10695,13 +10695,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.000990673623748111,
- "loss": 0.0351,
+ "loss": 0.0348,
"macro_f1": 0.32098767161369324,
"num_tokens": 1817205.0,
"repeat_count": 0.0,
- "routers_loss": 0.06140992045402527,
+ "routers_loss": 0.05495348572731018,
"skip_count": 1.0,
"step": 1126,
"text_loss": 0.20241330564022064
@@ -10709,18 +10709,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
- "avg_layers": 25.0,
+ "avg_layers": 26.0,
"epoch": 5.295861461696507,
- "f1_execute": 0.9411764740943909,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.4000000059604645,
- "grad_norm": 0.09814453125,
+ "f1_skip": 0.5,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0009906140271864173,
- "loss": 0.0436,
- "macro_f1": 0.44705885648727417,
+ "loss": 0.0433,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 1820141.0,
"repeat_count": 0.0,
- "routers_loss": 0.03872275352478027,
+ "routers_loss": 0.037809282541275024,
"skip_count": 2.0,
"step": 1128,
"text_loss": 0.32965806126594543
@@ -10728,18 +10728,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 5.305253889051952,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09228515625,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009905542426205032,
- "loss": 0.0353,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0348,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 1824011.0,
"repeat_count": 0.0,
- "routers_loss": 0.031013142317533493,
+ "routers_loss": 0.03320181369781494,
"skip_count": 1.0,
"step": 1130,
"text_loss": 0.36329755187034607
@@ -10752,13 +10752,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009904942700732777,
- "loss": 0.0333,
+ "loss": 0.0335,
"macro_f1": 0.3333333432674408,
"num_tokens": 1826873.0,
"repeat_count": 0.0,
- "routers_loss": 0.004357635974884033,
+ "routers_loss": 0.004102326463907957,
"skip_count": 0.0,
"step": 1132,
"text_loss": 0.6692602038383484
@@ -10771,13 +10771,13 @@
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11279296875,
+ "grad_norm": 0.08544921875,
"learning_rate": 0.0009904341095677226,
"loss": 0.03,
"macro_f1": 0.29333335161209106,
"num_tokens": 1830103.0,
"repeat_count": 2.0,
- "routers_loss": 0.2376353144645691,
+ "routers_loss": 0.2376193106174469,
"skip_count": 4.0,
"step": 1134,
"text_loss": 0.19212862849235535
@@ -10790,13 +10790,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10888671875,
+ "grad_norm": 0.119140625,
"learning_rate": 0.0009903737611268919,
- "loss": 0.0446,
+ "loss": 0.0445,
"macro_f1": 0.3333333432674408,
"num_tokens": 1833201.0,
"repeat_count": 0.0,
- "routers_loss": 0.004978097043931484,
+ "routers_loss": 0.005253395065665245,
"skip_count": 0.0,
"step": 1136,
"text_loss": 0.6773360371589661
@@ -10809,13 +10809,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009903132247739107,
- "loss": 0.0309,
+ "loss": 0.0305,
"macro_f1": 0.3076923191547394,
"num_tokens": 1836045.0,
"repeat_count": 1.0,
- "routers_loss": 0.14195409417152405,
+ "routers_loss": 0.14382585883140564,
"skip_count": 3.0,
"step": 1138,
"text_loss": 0.2882297933101654
@@ -10828,13 +10828,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.15234375,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009902525005319766,
- "loss": 0.0403,
+ "loss": 0.04,
"macro_f1": 0.5427350401878357,
"num_tokens": 1839721.0,
"repeat_count": 1.0,
- "routers_loss": 0.04005253314971924,
+ "routers_loss": 0.04033960774540901,
"skip_count": 2.0,
"step": 1140,
"text_loss": 0.7172559499740601
@@ -10847,13 +10847,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.12109375,
"learning_rate": 0.0009901915884243597,
- "loss": 0.0353,
+ "loss": 0.0351,
"macro_f1": 0.6666666865348816,
"num_tokens": 1842614.0,
"repeat_count": 1.0,
- "routers_loss": 0.006839688867330551,
+ "routers_loss": 0.005162308923900127,
"skip_count": 0.0,
"step": 1142,
"text_loss": 0.42892804741859436
@@ -10866,13 +10866,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.0009901304884744014,
- "loss": 0.0396,
+ "loss": 0.0386,
"macro_f1": 0.3144654333591461,
"num_tokens": 1845444.0,
"repeat_count": 1.0,
- "routers_loss": 0.10174567997455597,
+ "routers_loss": 0.10117656737565994,
"skip_count": 2.0,
"step": 1144,
"text_loss": 0.20806430280208588
@@ -10885,13 +10885,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009900692007055152,
- "loss": 0.0365,
+ "loss": 0.0357,
"macro_f1": 0.3333333432674408,
"num_tokens": 1848558.0,
"repeat_count": 0.0,
- "routers_loss": 0.014655748382210732,
+ "routers_loss": 0.014107038266956806,
"skip_count": 0.0,
"step": 1146,
"text_loss": 0.5355974435806274
@@ -10904,13 +10904,13 @@
"f1_execute": 0.9166666865348816,
"f1_repeat": 0.4000000059604645,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.158203125,
+ "grad_norm": 0.16015625,
"learning_rate": 0.000990007725141187,
- "loss": 0.0467,
+ "loss": 0.0449,
"macro_f1": 0.6611111164093018,
"num_tokens": 1852723.0,
"repeat_count": 4.0,
- "routers_loss": 0.16960746049880981,
+ "routers_loss": 0.15537866950035095,
"skip_count": 2.0,
"step": 1148,
"text_loss": 0.6388513445854187
@@ -10923,32 +10923,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1220703125,
+ "grad_norm": 0.1181640625,
"learning_rate": 0.0009899460618049741,
- "loss": 0.0399,
+ "loss": 0.0397,
"macro_f1": 0.3333333432674408,
"num_tokens": 1856181.0,
"repeat_count": 0.0,
- "routers_loss": 0.011591178365051746,
+ "routers_loss": 0.011800912208855152,
"skip_count": 0.0,
"step": 1150,
"text_loss": 0.6113069653511047
},
{
- "acc_repeat": 0.5,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 30.0,
"epoch": 5.408570589961843,
- "f1_execute": 0.9811320900917053,
- "f1_repeat": 0.6666666865348816,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09912109375,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.000989884210720506,
- "loss": 0.0332,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0331,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 1859685.0,
"repeat_count": 2.0,
- "routers_loss": 0.04036068916320801,
+ "routers_loss": 0.022900646552443504,
"skip_count": 0.0,
"step": 1152,
"text_loss": 0.25718021392822266
@@ -10961,13 +10961,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009898221719114844,
- "loss": 0.0366,
+ "loss": 0.0354,
"macro_f1": 0.3272727429866791,
"num_tokens": 1862505.0,
"repeat_count": 0.0,
- "routers_loss": 0.030165785923600197,
+ "routers_loss": 0.026814989745616913,
"skip_count": 1.0,
"step": 1154,
"text_loss": 0.5426549911499023
@@ -10980,13 +10980,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009897599454016823,
- "loss": 0.0421,
+ "loss": 0.0401,
"macro_f1": 0.3333333432674408,
"num_tokens": 1866266.0,
"repeat_count": 0.0,
- "routers_loss": 0.003615695284679532,
+ "routers_loss": 0.0032623792067170143,
"skip_count": 0.0,
"step": 1156,
"text_loss": 0.37752896547317505
@@ -10999,13 +10999,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.07080078125,
"learning_rate": 0.0009896975312149454,
- "loss": 0.0377,
+ "loss": 0.0369,
"macro_f1": 0.3333333432674408,
"num_tokens": 1870216.0,
"repeat_count": 0.0,
- "routers_loss": 0.01679840311408043,
+ "routers_loss": 0.015617577359080315,
"skip_count": 0.0,
"step": 1158,
"text_loss": 0.18207129836082458
@@ -11018,13 +11018,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009896349293751906,
- "loss": 0.0422,
+ "loss": 0.0423,
"macro_f1": 0.3272727429866791,
"num_tokens": 1873338.0,
"repeat_count": 0.0,
- "routers_loss": 0.024936161935329437,
+ "routers_loss": 0.02250153198838234,
"skip_count": 1.0,
"step": 1160,
"text_loss": 0.548884391784668
@@ -11037,13 +11037,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009895721399064072,
- "loss": 0.0407,
+ "loss": 0.0388,
"macro_f1": 0.32098764181137085,
"num_tokens": 1876470.0,
"repeat_count": 1.0,
- "routers_loss": 0.06472968310117722,
+ "routers_loss": 0.055204521864652634,
"skip_count": 1.0,
"step": 1162,
"text_loss": 0.48052409291267395
@@ -11056,13 +11056,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009895091628326564,
- "loss": 0.031,
+ "loss": 0.0293,
"macro_f1": 0.3333333432674408,
"num_tokens": 1879354.0,
"repeat_count": 0.0,
- "routers_loss": 0.009633494541049004,
+ "routers_loss": 0.009093789383769035,
"skip_count": 0.0,
"step": 1164,
"text_loss": 0.3908069431781769
@@ -11075,13 +11075,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.140625,
"learning_rate": 0.000989445998178071,
"loss": 0.0323,
"macro_f1": 0.3272727429866791,
"num_tokens": 1881941.0,
"repeat_count": 0.0,
- "routers_loss": 0.01458993274718523,
+ "routers_loss": 0.015086972154676914,
"skip_count": 1.0,
"step": 1166,
"text_loss": 0.4884725511074066
@@ -11094,13 +11094,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009893826459668558,
- "loss": 0.0389,
+ "loss": 0.0386,
"macro_f1": 0.3144654333591461,
"num_tokens": 1885374.0,
"repeat_count": 0.0,
- "routers_loss": 0.06636982411146164,
+ "routers_loss": 0.06587666273117065,
"skip_count": 3.0,
"step": 1168,
"text_loss": 0.12760137021541595
@@ -11113,13 +11113,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0009893191062232873,
- "loss": 0.0325,
+ "loss": 0.0322,
"macro_f1": 0.3333333432674408,
"num_tokens": 1888612.0,
"repeat_count": 0.0,
- "routers_loss": 0.005644182674586773,
+ "routers_loss": 0.006088624242693186,
"skip_count": 0.0,
"step": 1170,
"text_loss": 0.4821319580078125
@@ -11132,13 +11132,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.0009892553789717143,
- "loss": 0.0402,
+ "loss": 0.0389,
"macro_f1": 0.3333333432674408,
"num_tokens": 1891463.0,
"repeat_count": 0.0,
- "routers_loss": 0.010273848660290241,
+ "routers_loss": 0.010113578289747238,
"skip_count": 0.0,
"step": 1172,
"text_loss": 0.3613642454147339
@@ -11151,13 +11151,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009891914642365573,
- "loss": 0.0415,
+ "loss": 0.0404,
"macro_f1": 0.3333333432674408,
"num_tokens": 1894230.0,
"repeat_count": 0.0,
- "routers_loss": 0.004529652185738087,
+ "routers_loss": 0.004947459790855646,
"skip_count": 0.0,
"step": 1174,
"text_loss": 0.5037549138069153
@@ -11170,13 +11170,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.0009891273620423083,
- "loss": 0.045,
+ "loss": 0.0428,
"macro_f1": 0.3272727429866791,
"num_tokens": 1897294.0,
"repeat_count": 1.0,
- "routers_loss": 0.024671228602528572,
+ "routers_loss": 0.026075217872858047,
"skip_count": 0.0,
"step": 1176,
"text_loss": 0.32558977603912354
@@ -11189,13 +11189,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009890630724135314,
- "loss": 0.0354,
+ "loss": 0.0351,
"macro_f1": 0.3272727429866791,
"num_tokens": 1901553.0,
"repeat_count": 0.0,
- "routers_loss": 0.06466450542211533,
+ "routers_loss": 0.06650999188423157,
"skip_count": 1.0,
"step": 1178,
"text_loss": 0.23473620414733887
@@ -11208,13 +11208,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1767578125,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009889985953748625,
- "loss": 0.0278,
+ "loss": 0.0268,
"macro_f1": 0.6666666865348816,
"num_tokens": 1904556.0,
"repeat_count": 0.0,
- "routers_loss": 0.010566026903688908,
+ "routers_loss": 0.010361116379499435,
"skip_count": 1.0,
"step": 1180,
"text_loss": 0.6927042007446289
@@ -11227,13 +11227,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.103515625,
"learning_rate": 0.0009889339309510094,
- "loss": 0.037,
+ "loss": 0.0351,
"macro_f1": 0.3333333432674408,
"num_tokens": 1908053.0,
"repeat_count": 0.0,
- "routers_loss": 0.013842248357832432,
+ "routers_loss": 0.013286533765494823,
"skip_count": 0.0,
"step": 1182,
"text_loss": 0.19977325201034546
@@ -11246,13 +11246,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.5,
- "grad_norm": 0.07373046875,
+ "grad_norm": 0.058837890625,
"learning_rate": 0.0009888690791667518,
- "loss": 0.0215,
+ "loss": 0.0204,
"macro_f1": 0.7018141150474548,
"num_tokens": 1911754.0,
"repeat_count": 2.0,
- "routers_loss": 0.122759610414505,
+ "routers_loss": 0.11920545995235443,
"skip_count": 3.0,
"step": 1184,
"text_loss": 0.4072858691215515
@@ -11265,32 +11265,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009888040400469408,
- "loss": 0.0402,
+ "loss": 0.0391,
"macro_f1": 0.3272727429866791,
"num_tokens": 1914862.0,
"repeat_count": 0.0,
- "routers_loss": 0.035315629094839096,
+ "routers_loss": 0.03652849420905113,
"skip_count": 1.0,
"step": 1186,
"text_loss": 0.2654043138027191
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.577634282359847,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.0009887388136164996,
- "loss": 0.034,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0336,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1918542.0,
"repeat_count": 0.0,
- "routers_loss": 0.040048226714134216,
+ "routers_loss": 0.03991910070180893,
"skip_count": 2.0,
"step": 1188,
"text_loss": 0.21130657196044922
@@ -11298,18 +11298,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 5.587026709715292,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.09521484375,
"learning_rate": 0.000988673399900423,
- "loss": 0.044,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0429,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1921589.0,
"repeat_count": 0.0,
- "routers_loss": 0.012814820744097233,
+ "routers_loss": 0.014900135807693005,
"skip_count": 0.0,
"step": 1190,
"text_loss": 0.5519335865974426
@@ -11322,13 +11322,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009886077989237777,
- "loss": 0.0407,
+ "loss": 0.0405,
"macro_f1": 0.3272727429866791,
"num_tokens": 1924320.0,
"repeat_count": 0.0,
- "routers_loss": 0.05977959558367729,
+ "routers_loss": 0.06271552294492722,
"skip_count": 1.0,
"step": 1192,
"text_loss": 0.213813915848732
@@ -11341,13 +11341,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.0,
"f1_skip": 0.888888955116272,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.1875,
"learning_rate": 0.000988542010711702,
- "loss": 0.0334,
+ "loss": 0.0342,
"macro_f1": 0.6225374937057495,
"num_tokens": 1927178.0,
"repeat_count": 0.0,
- "routers_loss": 0.031448643654584885,
+ "routers_loss": 0.03081391751766205,
"skip_count": 5.0,
"step": 1194,
"text_loss": 0.7524349093437195
@@ -11360,13 +11360,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.265625,
+ "grad_norm": 0.255859375,
"learning_rate": 0.0009884760352894064,
- "loss": 0.0523,
+ "loss": 0.0518,
"macro_f1": 0.3333333432674408,
"num_tokens": 1930216.0,
"repeat_count": 0.0,
- "routers_loss": 0.008164947852492332,
+ "routers_loss": 0.008556773886084557,
"skip_count": 0.0,
"step": 1196,
"text_loss": 0.28230375051498413
@@ -11379,32 +11379,32 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.5,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.1064453125,
"learning_rate": 0.0009884098726821726,
- "loss": 0.0478,
+ "loss": 0.0472,
"macro_f1": 0.4871794879436493,
"num_tokens": 1933312.0,
"repeat_count": 3.0,
- "routers_loss": 0.04045635461807251,
+ "routers_loss": 0.05344727262854576,
"skip_count": 0.0,
"step": 1198,
"text_loss": 0.5509607195854187
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6666666865348816,
- "avg_layers": 26.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 5.633988846492516,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
- "f1_skip": 0.800000011920929,
- "grad_norm": 0.1240234375,
+ "f1_skip": 0.5,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.000988343522915354,
- "loss": 0.0447,
- "macro_f1": 0.5866667032241821,
+ "loss": 0.0441,
+ "macro_f1": 0.480392187833786,
"num_tokens": 1936160.0,
"repeat_count": 1.0,
- "routers_loss": 0.06872973591089249,
+ "routers_loss": 0.07324771583080292,
"skip_count": 3.0,
"step": 1200,
"text_loss": 0.30565372109413147
@@ -11412,18 +11412,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
- "avg_layers": 24.0,
+ "avg_layers": 25.0,
"epoch": 5.64338127384796,
- "f1_execute": 0.8695651888847351,
+ "f1_execute": 0.8936169743537903,
"f1_repeat": 0.0,
- "f1_skip": 0.4000000059604645,
- "grad_norm": 0.25390625,
+ "f1_skip": 0.444444477558136,
+ "grad_norm": 0.2470703125,
"learning_rate": 0.0009882769860143764,
- "loss": 0.0331,
- "macro_f1": 0.4231884181499481,
+ "loss": 0.0317,
+ "macro_f1": 0.4460204839706421,
"num_tokens": 1939266.0,
"repeat_count": 0.0,
- "routers_loss": 0.20964151620864868,
+ "routers_loss": 0.18620699644088745,
"skip_count": 6.0,
"step": 1202,
"text_loss": 0.976121723651886
@@ -11442,26 +11442,26 @@
"macro_f1": 0.6666666865348816,
"num_tokens": 1942173.0,
"repeat_count": 0.0,
- "routers_loss": 0.00690250750631094,
+ "routers_loss": 0.007703613489866257,
"skip_count": 1.0,
"step": 1204,
"text_loss": 0.5647401809692383
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.66216612855885,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009881433509120036,
- "loss": 0.0372,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0376,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1945071.0,
"repeat_count": 0.0,
- "routers_loss": 0.022315658628940582,
+ "routers_loss": 0.02162683941423893,
"skip_count": 2.0,
"step": 1206,
"text_loss": 0.24229218065738678
@@ -11474,13 +11474,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1083984375,
+ "grad_norm": 0.0966796875,
"learning_rate": 0.0009880762527618176,
- "loss": 0.0388,
+ "loss": 0.0383,
"macro_f1": 0.3333333432674408,
"num_tokens": 1949060.0,
"repeat_count": 0.0,
- "routers_loss": 0.017015069723129272,
+ "routers_loss": 0.017667081207036972,
"skip_count": 0.0,
"step": 1208,
"text_loss": 0.4035970866680145
@@ -11493,13 +11493,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009880089675798908,
- "loss": 0.0372,
+ "loss": 0.0367,
"macro_f1": 0.3333333432674408,
"num_tokens": 1951698.0,
"repeat_count": 0.0,
- "routers_loss": 0.006532609928399324,
+ "routers_loss": 0.006405784282833338,
"skip_count": 0.0,
"step": 1210,
"text_loss": 0.5319879055023193
@@ -11512,13 +11512,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009879414953920071,
- "loss": 0.0301,
+ "loss": 0.0294,
"macro_f1": 0.3333333432674408,
"num_tokens": 1955266.0,
"repeat_count": 0.0,
- "routers_loss": 0.009720963425934315,
+ "routers_loss": 0.009859707206487656,
"skip_count": 0.0,
"step": 1212,
"text_loss": 0.6687407493591309
@@ -11531,32 +11531,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009878738362240219,
- "loss": 0.046,
+ "loss": 0.045,
"macro_f1": 0.5492662787437439,
"num_tokens": 1958538.0,
"repeat_count": 0.0,
- "routers_loss": 0.03176085278391838,
+ "routers_loss": 0.030890554189682007,
"skip_count": 2.0,
"step": 1214,
"text_loss": 0.20820017158985138
},
{
"acc_repeat": 0.5,
- "acc_skip": 0.5,
- "avg_layers": 29.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
"epoch": 5.709128265336073,
- "f1_execute": 0.9387754797935486,
+ "f1_execute": 0.9200000166893005,
"f1_repeat": 0.5,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.2021484375,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.000987805990101862,
- "loss": 0.0323,
- "macro_f1": 0.7018141150474548,
+ "loss": 0.0317,
+ "macro_f1": 0.47333335876464844,
"num_tokens": 1961419.0,
"repeat_count": 2.0,
- "routers_loss": 0.08626245707273483,
+ "routers_loss": 0.10383198410272598,
"skip_count": 2.0,
"step": 1216,
"text_loss": 0.8664976358413696
@@ -11569,13 +11569,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009877379570515268,
- "loss": 0.0374,
+ "loss": 0.0366,
"macro_f1": 0.3333333432674408,
"num_tokens": 1964836.0,
"repeat_count": 0.0,
- "routers_loss": 0.012099343352019787,
+ "routers_loss": 0.013376163318753242,
"skip_count": 0.0,
"step": 1218,
"text_loss": 0.4223395884037018
@@ -11588,13 +11588,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.0859375,
"learning_rate": 0.0009876697370990865,
- "loss": 0.0342,
+ "loss": 0.0343,
"macro_f1": 0.3333333432674408,
"num_tokens": 1967620.0,
"repeat_count": 0.0,
- "routers_loss": 0.007713846862316132,
+ "routers_loss": 0.008577900938689709,
"skip_count": 0.0,
"step": 1220,
"text_loss": 0.4789901375770569
@@ -11607,13 +11607,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009876013302706828,
- "loss": 0.0499,
+ "loss": 0.049,
"macro_f1": 0.3333333432674408,
"num_tokens": 1971100.0,
"repeat_count": 0.0,
- "routers_loss": 0.004629489034414291,
+ "routers_loss": 0.004730266984552145,
"skip_count": 0.0,
"step": 1222,
"text_loss": 0.6799837946891785
@@ -11626,13 +11626,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009875327365925295,
- "loss": 0.035,
+ "loss": 0.0341,
"macro_f1": 0.3333333432674408,
"num_tokens": 1974408.0,
"repeat_count": 0.0,
- "routers_loss": 0.010654795914888382,
+ "routers_loss": 0.010849526152014732,
"skip_count": 0.0,
"step": 1224,
"text_loss": 0.18967926502227783
@@ -11640,18 +11640,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 5.756090402113296,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009874639560909118,
- "loss": 0.0516,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.0498,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 1977046.0,
"repeat_count": 0.0,
- "routers_loss": 0.05963074415922165,
+ "routers_loss": 0.04841252416372299,
"skip_count": 1.0,
"step": 1226,
"text_loss": 0.6133310198783875
@@ -11664,13 +11664,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1328125,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.0009873949887921867,
- "loss": 0.04,
+ "loss": 0.0402,
"macro_f1": 0.3272727429866791,
"num_tokens": 1980330.0,
"repeat_count": 0.0,
- "routers_loss": 0.028920643031597137,
+ "routers_loss": 0.029638588428497314,
"skip_count": 1.0,
"step": 1228,
"text_loss": 0.15649555623531342
@@ -11678,18 +11678,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 5.774875256824186,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10595703125,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009873258347227823,
- "loss": 0.0327,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0331,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1983173.0,
"repeat_count": 0.0,
- "routers_loss": 0.006852717138826847,
+ "routers_loss": 0.009955910965800285,
"skip_count": 0.0,
"step": 1230,
"text_loss": 0.4741005599498749
@@ -11702,13 +11702,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009872564939091989,
- "loss": 0.0346,
+ "loss": 0.0342,
"macro_f1": 0.3333333432674408,
"num_tokens": 1986825.0,
"repeat_count": 0.0,
- "routers_loss": 0.010968753136694431,
+ "routers_loss": 0.010205300524830818,
"skip_count": 0.0,
"step": 1232,
"text_loss": 0.5315462350845337
@@ -11721,13 +11721,13 @@
"f1_execute": 0.9302325248718262,
"f1_repeat": 1.0,
"f1_skip": 0.7272727489471436,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.11865234375,
"learning_rate": 0.0009871869663780077,
- "loss": 0.0344,
+ "loss": 0.0336,
"macro_f1": 0.8858351111412048,
"num_tokens": 1990448.0,
"repeat_count": 1.0,
- "routers_loss": 0.0906950980424881,
+ "routers_loss": 0.09120134264230728,
"skip_count": 7.0,
"step": 1234,
"text_loss": 0.6187508702278137
@@ -11740,13 +11740,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.125,
"learning_rate": 0.0009871172521558522,
- "loss": 0.0484,
+ "loss": 0.0475,
"macro_f1": 0.6666666865348816,
"num_tokens": 1993474.0,
"repeat_count": 0.0,
- "routers_loss": 0.016306072473526,
+ "routers_loss": 0.016188839450478554,
"skip_count": 1.0,
"step": 1236,
"text_loss": 0.20783066749572754
@@ -11759,13 +11759,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.208984375,
+ "grad_norm": 0.216796875,
"learning_rate": 0.0009870473512694465,
- "loss": 0.038,
+ "loss": 0.0373,
"macro_f1": 0.5934640765190125,
"num_tokens": 1996536.0,
"repeat_count": 0.0,
- "routers_loss": 0.05804471671581268,
+ "routers_loss": 0.05046704784035683,
"skip_count": 3.0,
"step": 1238,
"text_loss": 0.247748002409935
@@ -11773,18 +11773,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 5.821837393601409,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.091796875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.0009869772637455772,
- "loss": 0.0256,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0251,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 1999530.0,
"repeat_count": 0.0,
- "routers_loss": 0.045395996421575546,
+ "routers_loss": 0.044926248490810394,
"skip_count": 2.0,
"step": 1240,
"text_loss": 0.26001980900764465
@@ -11797,13 +11797,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11767578125,
+ "grad_norm": 0.1513671875,
"learning_rate": 0.000986906989611102,
- "loss": 0.0438,
+ "loss": 0.0446,
"macro_f1": 0.3272727429866791,
"num_tokens": 2002782.0,
"repeat_count": 0.0,
- "routers_loss": 0.020834850147366524,
+ "routers_loss": 0.025911526754498482,
"skip_count": 0.0,
"step": 1242,
"text_loss": 0.9009982943534851
@@ -11816,13 +11816,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009868365288929492,
- "loss": 0.0377,
+ "loss": 0.0371,
"macro_f1": 0.3333333432674408,
"num_tokens": 2005331.0,
"repeat_count": 0.0,
- "routers_loss": 0.005241698585450649,
+ "routers_loss": 0.0043760035187006,
"skip_count": 0.0,
"step": 1244,
"text_loss": 0.5547386407852173
@@ -11835,13 +11835,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.0009867658816181206,
- "loss": 0.038,
+ "loss": 0.0374,
"macro_f1": 0.3333333432674408,
"num_tokens": 2008115.0,
"repeat_count": 0.0,
- "routers_loss": 0.008387803100049496,
+ "routers_loss": 0.009227181784808636,
"skip_count": 0.0,
"step": 1246,
"text_loss": 1.0067731142044067
@@ -11854,13 +11854,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.126953125,
"learning_rate": 0.000986695047813688,
- "loss": 0.0256,
+ "loss": 0.0261,
"macro_f1": 0.3272727429866791,
"num_tokens": 2011137.0,
"repeat_count": 1.0,
- "routers_loss": 0.02261745184659958,
+ "routers_loss": 0.023822437971830368,
"skip_count": 0.0,
"step": 1248,
"text_loss": 0.30058956146240234
@@ -11873,32 +11873,32 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009866240275067948,
- "loss": 0.0435,
+ "loss": 0.044,
"macro_f1": 0.47333335876464844,
"num_tokens": 2014159.0,
"repeat_count": 2.0,
- "routers_loss": 0.21678555011749268,
+ "routers_loss": 0.21523773670196533,
"skip_count": 3.0,
"step": 1250,
"text_loss": 0.39072203636169434
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.878191957734077,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009865528207246563,
- "loss": 0.0358,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0351,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 2017731.0,
"repeat_count": 0.0,
- "routers_loss": 0.06554054468870163,
+ "routers_loss": 0.06184682995080948,
"skip_count": 2.0,
"step": 1252,
"text_loss": 0.35751575231552124
@@ -11911,13 +11911,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.166015625,
"learning_rate": 0.000986481427494559,
- "loss": 0.0337,
+ "loss": 0.0336,
"macro_f1": 0.3333333432674408,
"num_tokens": 2020485.0,
"repeat_count": 0.0,
- "routers_loss": 0.007237187586724758,
+ "routers_loss": 0.007573372684419155,
"skip_count": 0.0,
"step": 1254,
"text_loss": 0.4061077833175659
@@ -11930,13 +11930,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1845703125,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.000986409847843861,
- "loss": 0.0387,
+ "loss": 0.0382,
"macro_f1": 0.3272727429866791,
"num_tokens": 2024149.0,
"repeat_count": 1.0,
- "routers_loss": 0.08003793656826019,
+ "routers_loss": 0.07447971403598785,
"skip_count": 0.0,
"step": 1256,
"text_loss": 0.41876497864723206
@@ -11949,13 +11949,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000986338081799992,
- "loss": 0.0341,
+ "loss": 0.0351,
"macro_f1": 0.3333333432674408,
"num_tokens": 2026545.0,
"repeat_count": 0.0,
- "routers_loss": 0.006424390245229006,
+ "routers_loss": 0.006609147880226374,
"skip_count": 0.0,
"step": 1258,
"text_loss": 0.4673794209957123
@@ -11968,13 +11968,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009862661293904523,
- "loss": 0.0482,
+ "loss": 0.0498,
"macro_f1": 0.32098764181137085,
"num_tokens": 2029581.0,
"repeat_count": 0.0,
- "routers_loss": 0.10797854512929916,
+ "routers_loss": 0.10624702274799347,
"skip_count": 2.0,
"step": 1260,
"text_loss": 0.3483233153820038
@@ -11987,13 +11987,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.111328125,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009861939906428145,
- "loss": 0.053,
+ "loss": 0.0525,
"macro_f1": 0.3333333432674408,
"num_tokens": 2033936.0,
"repeat_count": 0.0,
- "routers_loss": 0.006734046153724194,
+ "routers_loss": 0.007944886572659016,
"skip_count": 0.0,
"step": 1262,
"text_loss": 0.16362667083740234
@@ -12006,13 +12006,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009861216655847225,
- "loss": 0.0373,
+ "loss": 0.0376,
"macro_f1": 0.6666666865348816,
"num_tokens": 2037876.0,
"repeat_count": 1.0,
- "routers_loss": 0.00564212491735816,
+ "routers_loss": 0.007004092447459698,
"skip_count": 0.0,
"step": 1264,
"text_loss": 0.43228110671043396
@@ -12025,13 +12025,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1044921875,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.0009860491542438912,
- "loss": 0.0472,
+ "loss": 0.047,
"macro_f1": 0.3272727429866791,
"num_tokens": 2040842.0,
"repeat_count": 0.0,
- "routers_loss": 0.026137735694646835,
+ "routers_loss": 0.026916226372122765,
"skip_count": 1.0,
"step": 1266,
"text_loss": 0.5901188850402832
@@ -12044,13 +12044,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.000985976456648107,
- "loss": 0.0343,
+ "loss": 0.0353,
"macro_f1": 0.3333333432674408,
"num_tokens": 2043890.0,
"repeat_count": 0.0,
- "routers_loss": 0.0069669694639742374,
+ "routers_loss": 0.007325216196477413,
"skip_count": 0.0,
"step": 1268,
"text_loss": 0.8780109882354736
@@ -12063,13 +12063,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.000985903572825228,
- "loss": 0.0323,
+ "loss": 0.0306,
"macro_f1": 0.4871794879436493,
"num_tokens": 2048848.0,
"repeat_count": 0.0,
- "routers_loss": 0.05618409812450409,
+ "routers_loss": 0.05007527023553848,
"skip_count": 2.0,
"step": 1270,
"text_loss": 0.5863722562789917
@@ -12084,11 +12084,11 @@
"f1_skip": 0.0,
"grad_norm": 0.173828125,
"learning_rate": 0.000985830502803183,
- "loss": 0.0391,
+ "loss": 0.0396,
"macro_f1": 0.3272727429866791,
"num_tokens": 2051561.0,
"repeat_count": 0.0,
- "routers_loss": 0.025900620967149734,
+ "routers_loss": 0.023995524272322655,
"skip_count": 0.0,
"step": 1272,
"text_loss": 0.7460709810256958
@@ -12101,13 +12101,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.0009857572466099732,
- "loss": 0.0426,
+ "loss": 0.0431,
"macro_f1": 0.3333333432674408,
"num_tokens": 2054752.0,
"repeat_count": 0.0,
- "routers_loss": 0.006236737594008446,
+ "routers_loss": 0.006928362417966127,
"skip_count": 0.0,
"step": 1274,
"text_loss": 0.5130293369293213
@@ -12120,13 +12120,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.171875,
+ "grad_norm": 0.162109375,
"learning_rate": 0.0009856838042736698,
- "loss": 0.0503,
+ "loss": 0.0501,
"macro_f1": 0.3333333432674408,
"num_tokens": 2058151.0,
"repeat_count": 0.0,
- "routers_loss": 0.006367063149809837,
+ "routers_loss": 0.006969396956264973,
"skip_count": 0.0,
"step": 1276,
"text_loss": 0.5911393761634827
@@ -12139,13 +12139,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.0009856101758224166,
- "loss": 0.0442,
+ "loss": 0.0441,
"macro_f1": 0.3333333432674408,
"num_tokens": 2061012.0,
"repeat_count": 0.0,
- "routers_loss": 0.003392914542928338,
+ "routers_loss": 0.003499418031424284,
"skip_count": 0.0,
"step": 1278,
"text_loss": 0.25347545742988586
@@ -12158,13 +12158,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.000985536361284428,
- "loss": 0.0231,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2064597.0,
"repeat_count": 0.0,
- "routers_loss": 0.007376343477517366,
+ "routers_loss": 0.007856054231524467,
"skip_count": 0.0,
"step": 1280,
"text_loss": 0.7476963400840759
@@ -12177,13 +12177,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009854623606879898,
- "loss": 0.0243,
+ "loss": 0.0245,
"macro_f1": 0.3272727429866791,
"num_tokens": 2067972.0,
"repeat_count": 0.0,
- "routers_loss": 0.02773376554250717,
+ "routers_loss": 0.02617792971432209,
"skip_count": 1.0,
"step": 1282,
"text_loss": 0.5775872468948364
@@ -12196,13 +12196,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.000985388174061459,
- "loss": 0.0363,
+ "loss": 0.0356,
"macro_f1": 0.32098767161369324,
"num_tokens": 2071812.0,
"repeat_count": 0.0,
- "routers_loss": 0.03535797819495201,
+ "routers_loss": 0.035979997366666794,
"skip_count": 1.0,
"step": 1284,
"text_loss": 0.2933400869369507
@@ -12215,13 +12215,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08447265625,
"learning_rate": 0.0009853138014332646,
- "loss": 0.0269,
+ "loss": 0.0273,
"macro_f1": 0.3333333432674408,
"num_tokens": 2074868.0,
"repeat_count": 0.0,
- "routers_loss": 0.004910993855446577,
+ "routers_loss": 0.005142854526638985,
"skip_count": 0.0,
"step": 1286,
"text_loss": 0.29085102677345276
@@ -12234,13 +12234,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.0009852392428319058,
- "loss": 0.0301,
+ "loss": 0.0306,
"macro_f1": 0.3333333432674408,
"num_tokens": 2078225.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032444109674543142,
+ "routers_loss": 0.0032799106556922197,
"skip_count": 0.0,
"step": 1288,
"text_loss": 0.7293626070022583
@@ -12253,13 +12253,13 @@
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009851644982859537,
- "loss": 0.0272,
+ "loss": 0.0273,
"macro_f1": 0.480392187833786,
"num_tokens": 2081495.0,
"repeat_count": 1.0,
- "routers_loss": 0.12451831251382828,
+ "routers_loss": 0.12224318832159042,
"skip_count": 3.0,
"step": 1290,
"text_loss": 0.26125892996788025
@@ -12272,13 +12272,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009850895678240508,
- "loss": 0.0289,
+ "loss": 0.0283,
"macro_f1": 0.6666666865348816,
"num_tokens": 2084390.0,
"repeat_count": 1.0,
- "routers_loss": 0.011074979789555073,
+ "routers_loss": 0.010662888176739216,
"skip_count": 0.0,
"step": 1292,
"text_loss": 0.3510764539241791
@@ -12291,13 +12291,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.0009850144514749104,
- "loss": 0.0336,
+ "loss": 0.0332,
"macro_f1": 0.5492662787437439,
"num_tokens": 2087210.0,
"repeat_count": 0.0,
- "routers_loss": 0.01774786226451397,
+ "routers_loss": 0.01979079470038414,
"skip_count": 2.0,
"step": 1294,
"text_loss": 0.40202176570892334
@@ -12310,13 +12310,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.000984939149267317,
- "loss": 0.0251,
+ "loss": 0.0253,
"macro_f1": 0.6666666865348816,
"num_tokens": 2090777.0,
"repeat_count": 0.0,
- "routers_loss": 0.0052874404937028885,
+ "routers_loss": 0.005172552540898323,
"skip_count": 1.0,
"step": 1296,
"text_loss": 0.5275651216506958
@@ -12329,13 +12329,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10107421875,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009848636612301272,
- "loss": 0.031,
+ "loss": 0.0299,
"macro_f1": 0.3333333432674408,
"num_tokens": 2094248.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034106262028217316,
+ "routers_loss": 0.0029599082190543413,
"skip_count": 0.0,
"step": 1298,
"text_loss": 0.4517653286457062
@@ -12348,13 +12348,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2177734375,
+ "grad_norm": 0.23046875,
"learning_rate": 0.0009847879873922675,
"loss": 0.0357,
"macro_f1": 0.3333333432674408,
"num_tokens": 2097139.0,
"repeat_count": 0.0,
- "routers_loss": 0.010383229702711105,
+ "routers_loss": 0.011455860920250416,
"skip_count": 0.0,
"step": 1300,
"text_loss": 0.16888445615768433
@@ -12367,13 +12367,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.09619140625,
"learning_rate": 0.0009847121277827366,
- "loss": 0.0304,
+ "loss": 0.0301,
"macro_f1": 0.3333333432674408,
"num_tokens": 2100415.0,
"repeat_count": 0.0,
- "routers_loss": 0.0076674893498420715,
+ "routers_loss": 0.008091195486485958,
"skip_count": 0.0,
"step": 1302,
"text_loss": 0.40061676502227783
@@ -12386,13 +12386,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.109375,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.000984636082430604,
- "loss": 0.0287,
+ "loss": 0.0285,
"macro_f1": 0.3333333432674408,
"num_tokens": 2103285.0,
"repeat_count": 0.0,
- "routers_loss": 0.010486516170203686,
+ "routers_loss": 0.009593960829079151,
"skip_count": 0.0,
"step": 1304,
"text_loss": 0.7211073637008667
@@ -12405,13 +12405,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.107421875,
"learning_rate": 0.0009845598513650103,
- "loss": 0.0237,
+ "loss": 0.0231,
"macro_f1": 0.3333333432674408,
"num_tokens": 2106255.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023783023934811354,
+ "routers_loss": 0.0023068038281053305,
"skip_count": 0.0,
"step": 1306,
"text_loss": 0.7077119946479797
@@ -12424,13 +12424,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009844834346151674,
- "loss": 0.044,
+ "loss": 0.043,
"macro_f1": 0.3333333432674408,
"num_tokens": 2109305.0,
"repeat_count": 0.0,
- "routers_loss": 0.006714595016092062,
+ "routers_loss": 0.007703019306063652,
"skip_count": 0.0,
"step": 1308,
"text_loss": 0.3534316122531891
@@ -12443,13 +12443,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009844068322103585,
- "loss": 0.0281,
+ "loss": 0.0287,
"macro_f1": 0.3272727429866791,
"num_tokens": 2112216.0,
"repeat_count": 0.0,
- "routers_loss": 0.022373953834176064,
+ "routers_loss": 0.023549847304821014,
"skip_count": 1.0,
"step": 1310,
"text_loss": 0.6792599558830261
@@ -12462,13 +12462,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009843300441799378,
- "loss": 0.0205,
+ "loss": 0.0211,
"macro_f1": 0.3333333432674408,
"num_tokens": 2114925.0,
"repeat_count": 0.0,
- "routers_loss": 0.007452849764376879,
+ "routers_loss": 0.007605871185660362,
"skip_count": 0.0,
"step": 1312,
"text_loss": 0.1571389138698578
@@ -12481,13 +12481,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009842530705533304,
- "loss": 0.0251,
+ "loss": 0.0253,
"macro_f1": 0.3272727429866791,
"num_tokens": 2117744.0,
"repeat_count": 0.0,
- "routers_loss": 0.016413308680057526,
+ "routers_loss": 0.014964760281145573,
"skip_count": 0.0,
"step": 1314,
"text_loss": 0.7840361595153809
@@ -12500,13 +12500,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.000984175911360033,
- "loss": 0.0243,
+ "loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 2120848.0,
"repeat_count": 0.0,
- "routers_loss": 0.004676427226513624,
+ "routers_loss": 0.004663798492401838,
"skip_count": 0.0,
"step": 1316,
"text_loss": 0.536246120929718
@@ -12519,13 +12519,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.000984098566629613,
- "loss": 0.0284,
+ "loss": 0.0288,
"macro_f1": 0.5492662787437439,
"num_tokens": 2123651.0,
"repeat_count": 0.0,
- "routers_loss": 0.024454625323414803,
+ "routers_loss": 0.022852955386042595,
"skip_count": 2.0,
"step": 1318,
"text_loss": 0.43372172117233276
@@ -12538,13 +12538,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0009840210363917087,
- "loss": 0.022,
+ "loss": 0.0216,
"macro_f1": 0.3333333432674408,
"num_tokens": 2128011.0,
"repeat_count": 0.0,
- "routers_loss": 0.013495884835720062,
+ "routers_loss": 0.012578422203660011,
"skip_count": 0.0,
"step": 1320,
"text_loss": 0.28190380334854126
@@ -12557,13 +12557,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.0009839433206760306,
- "loss": 0.0213,
+ "loss": 0.0204,
"macro_f1": 0.3333333432674408,
"num_tokens": 2131035.0,
"repeat_count": 0.0,
- "routers_loss": 0.006397814955562353,
+ "routers_loss": 0.006863643880933523,
"skip_count": 0.0,
"step": 1322,
"text_loss": 0.6340444087982178
@@ -12576,13 +12576,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1796875,
"learning_rate": 0.0009838654195123589,
- "loss": 0.0246,
+ "loss": 0.0243,
"macro_f1": 0.3333333432674408,
"num_tokens": 2133856.0,
"repeat_count": 0.0,
- "routers_loss": 0.00503434706479311,
+ "routers_loss": 0.00468854233622551,
"skip_count": 0.0,
"step": 1324,
"text_loss": 0.5138425827026367
@@ -12595,13 +12595,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009837873329305458,
- "loss": 0.0402,
+ "loss": 0.0396,
"macro_f1": 0.6666666865348816,
"num_tokens": 2136451.0,
"repeat_count": 1.0,
- "routers_loss": 0.005150494631379843,
+ "routers_loss": 0.005731126759201288,
"skip_count": 0.0,
"step": 1326,
"text_loss": 0.742124617099762
@@ -12614,13 +12614,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000983709060960514,
- "loss": 0.041,
+ "loss": 0.0416,
"macro_f1": 0.3333333432674408,
"num_tokens": 2139496.0,
"repeat_count": 0.0,
- "routers_loss": 0.004570818971842527,
+ "routers_loss": 0.0056343949399888515,
"skip_count": 0.0,
"step": 1328,
"text_loss": 0.7317464351654053
@@ -12633,13 +12633,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09326171875,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.0009836306036322576,
- "loss": 0.0314,
+ "loss": 0.0312,
"macro_f1": 0.3333333432674408,
"num_tokens": 2143120.0,
"repeat_count": 0.0,
- "routers_loss": 0.005299333017319441,
+ "routers_loss": 0.005127966403961182,
"skip_count": 0.0,
"step": 1330,
"text_loss": 0.538652241230011
@@ -12652,13 +12652,13 @@
"f1_execute": 0.9130434989929199,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.111328125,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009835519609758415,
- "loss": 0.0303,
+ "loss": 0.0301,
"macro_f1": 0.590062141418457,
"num_tokens": 2145807.0,
"repeat_count": 3.0,
- "routers_loss": 0.168672576546669,
+ "routers_loss": 0.1673707216978073,
"skip_count": 4.0,
"step": 1332,
"text_loss": 0.3498198091983795
@@ -12671,32 +12671,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009834731330214017,
- "loss": 0.0302,
+ "loss": 0.0293,
"macro_f1": 0.3272727429866791,
"num_tokens": 2148397.0,
"repeat_count": 1.0,
- "routers_loss": 0.05187409743666649,
+ "routers_loss": 0.04026653990149498,
"skip_count": 0.0,
"step": 1334,
"text_loss": 0.8153424859046936
},
{
"acc_repeat": 1.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 27.0,
"epoch": 6.272380393307896,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.8999999761581421,
"f1_repeat": 0.6666666865348816,
- "f1_skip": 0.9090909361839294,
- "grad_norm": 0.1669921875,
+ "f1_skip": 0.8000000715255737,
+ "grad_norm": 0.16015625,
"learning_rate": 0.0009833941197991455,
- "loss": 0.0339,
- "macro_f1": 0.8329448699951172,
+ "loss": 0.0329,
+ "macro_f1": 0.7888889312744141,
"num_tokens": 2152226.0,
"repeat_count": 2.0,
- "routers_loss": 0.05786697566509247,
+ "routers_loss": 0.05481519177556038,
"skip_count": 5.0,
"step": 1336,
"text_loss": 0.7802760004997253
@@ -12709,13 +12709,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009833149213393506,
- "loss": 0.0315,
+ "loss": 0.0304,
"macro_f1": 0.3272727429866791,
"num_tokens": 2156023.0,
"repeat_count": 0.0,
- "routers_loss": 0.017055779695510864,
+ "routers_loss": 0.01760484278202057,
"skip_count": 0.0,
"step": 1338,
"text_loss": 0.19721226394176483
@@ -12728,13 +12728,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.000983235537672366,
- "loss": 0.0249,
+ "loss": 0.0256,
"macro_f1": 0.3333333432674408,
"num_tokens": 2160037.0,
"repeat_count": 0.0,
- "routers_loss": 0.011614206247031689,
+ "routers_loss": 0.013206037692725658,
"skip_count": 0.0,
"step": 1340,
"text_loss": 0.5003817081451416
@@ -12747,13 +12747,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.000983155968828612,
- "loss": 0.033,
+ "loss": 0.0315,
"macro_f1": 0.6666666865348816,
"num_tokens": 2163910.0,
"repeat_count": 1.0,
- "routers_loss": 0.012611300684511662,
+ "routers_loss": 0.01256406120955944,
"skip_count": 0.0,
"step": 1342,
"text_loss": 0.5996923446655273
@@ -12766,13 +12766,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.11962890625,
"learning_rate": 0.0009830762148385793,
- "loss": 0.0315,
+ "loss": 0.0313,
"macro_f1": 0.3272727429866791,
"num_tokens": 2166921.0,
"repeat_count": 0.0,
- "routers_loss": 0.018757276237010956,
+ "routers_loss": 0.015086234547197819,
"skip_count": 1.0,
"step": 1344,
"text_loss": 0.45356282591819763
@@ -12785,13 +12785,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08447265625,
"learning_rate": 0.0009829962757328297,
- "loss": 0.0229,
+ "loss": 0.0223,
"macro_f1": 0.32098764181137085,
"num_tokens": 2170135.0,
"repeat_count": 0.0,
- "routers_loss": 0.08197146654129028,
+ "routers_loss": 0.07909081131219864,
"skip_count": 2.0,
"step": 1346,
"text_loss": 0.2874644994735718
@@ -12804,13 +12804,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0009829161515419959,
- "loss": 0.0256,
+ "loss": 0.0246,
"macro_f1": 0.6666666865348816,
"num_tokens": 2173029.0,
"repeat_count": 0.0,
- "routers_loss": 0.014122758992016315,
+ "routers_loss": 0.013569854199886322,
"skip_count": 2.0,
"step": 1348,
"text_loss": 0.25533875823020935
@@ -12823,13 +12823,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06005859375,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0009828358422967823,
- "loss": 0.0221,
+ "loss": 0.0226,
"macro_f1": 0.32098764181137085,
"num_tokens": 2176605.0,
"repeat_count": 1.0,
- "routers_loss": 0.08215996623039246,
+ "routers_loss": 0.08111091703176498,
"skip_count": 1.0,
"step": 1350,
"text_loss": 0.32827726006507874
@@ -12842,13 +12842,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09375,
+ "grad_norm": 0.091796875,
"learning_rate": 0.0009827553480279627,
- "loss": 0.0312,
+ "loss": 0.03,
"macro_f1": 0.5427350401878357,
"num_tokens": 2179406.0,
"repeat_count": 0.0,
- "routers_loss": 0.026304977014660835,
+ "routers_loss": 0.026550088077783585,
"skip_count": 2.0,
"step": 1352,
"text_loss": 0.2966301143169403
@@ -12861,13 +12861,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009826746687663832,
- "loss": 0.0302,
+ "loss": 0.0301,
"macro_f1": 0.3333333432674408,
"num_tokens": 2182353.0,
"repeat_count": 0.0,
- "routers_loss": 0.003616038942709565,
+ "routers_loss": 0.003914554137736559,
"skip_count": 0.0,
"step": 1354,
"text_loss": 0.7596251964569092
@@ -12880,13 +12880,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.0849609375,
+ "grad_norm": 0.0859375,
"learning_rate": 0.0009825938045429602,
- "loss": 0.0323,
+ "loss": 0.0324,
"macro_f1": 0.5866667032241821,
"num_tokens": 2185786.0,
"repeat_count": 1.0,
- "routers_loss": 0.060399893671274185,
+ "routers_loss": 0.059612665325403214,
"skip_count": 3.0,
"step": 1356,
"text_loss": 0.12325898557901382
@@ -12899,13 +12899,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.10009765625,
"learning_rate": 0.0009825127553886807,
- "loss": 0.0384,
+ "loss": 0.0375,
"macro_f1": 0.3333333432674408,
"num_tokens": 2190157.0,
"repeat_count": 0.0,
- "routers_loss": 0.007164204493165016,
+ "routers_loss": 0.0071132429875433445,
"skip_count": 0.0,
"step": 1358,
"text_loss": 0.9287898540496826
@@ -12918,13 +12918,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009824315213346033,
- "loss": 0.0343,
+ "loss": 0.0348,
"macro_f1": 0.3333333432674408,
"num_tokens": 2193077.0,
"repeat_count": 0.0,
- "routers_loss": 0.010965060442686081,
+ "routers_loss": 0.009611099027097225,
"skip_count": 0.0,
"step": 1360,
"text_loss": 0.20427259802818298
@@ -12937,13 +12937,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009823501024118569,
- "loss": 0.0276,
+ "loss": 0.0285,
"macro_f1": 0.3333333432674408,
"num_tokens": 2196494.0,
"repeat_count": 0.0,
- "routers_loss": 0.00784136913716793,
+ "routers_loss": 0.006913455203175545,
"skip_count": 0.0,
"step": 1362,
"text_loss": 0.574759840965271
@@ -12956,13 +12956,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009822684986516411,
- "loss": 0.0251,
+ "loss": 0.0245,
"macro_f1": 0.3333333432674408,
"num_tokens": 2199839.0,
"repeat_count": 0.0,
- "routers_loss": 0.009101065807044506,
+ "routers_loss": 0.009208920411765575,
"skip_count": 0.0,
"step": 1364,
"text_loss": 0.42422571778297424
@@ -12970,37 +12970,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 6.413266803639566,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.000982186710085227,
- "loss": 0.0206,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.0208,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 2203212.0,
"repeat_count": 1.0,
- "routers_loss": 0.05967295169830322,
+ "routers_loss": 0.059975091367959976,
"skip_count": 1.0,
"step": 1366,
"text_loss": 0.29213017225265503
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 26.0,
+ "acc_skip": 0.25,
+ "avg_layers": 27.0,
"epoch": 6.42265923099501,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.1875,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.181640625,
"learning_rate": 0.0009821047367439561,
- "loss": 0.0356,
- "macro_f1": 0.542222261428833,
+ "loss": 0.0358,
+ "macro_f1": 0.44705885648727417,
"num_tokens": 2206240.0,
"repeat_count": 0.0,
- "routers_loss": 0.05016552656888962,
+ "routers_loss": 0.048244867473840714,
"skip_count": 4.0,
"step": 1368,
"text_loss": 0.3072395324707031
@@ -13013,13 +13013,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009820225786592405,
- "loss": 0.038,
+ "loss": 0.0375,
"macro_f1": 0.3272727429866791,
"num_tokens": 2209903.0,
"repeat_count": 1.0,
- "routers_loss": 0.02483060024678707,
+ "routers_loss": 0.026068156585097313,
"skip_count": 0.0,
"step": 1370,
"text_loss": 0.5961400270462036
@@ -13032,13 +13032,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.109375,
"learning_rate": 0.0009819402358625634,
- "loss": 0.0373,
+ "loss": 0.0366,
"macro_f1": 0.3272727429866791,
"num_tokens": 2213439.0,
"repeat_count": 0.0,
- "routers_loss": 0.01982821337878704,
+ "routers_loss": 0.022615568712353706,
"skip_count": 1.0,
"step": 1372,
"text_loss": 0.19375644624233246
@@ -13051,13 +13051,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.000981857708385479,
- "loss": 0.0353,
+ "loss": 0.0346,
"macro_f1": 0.3333333432674408,
"num_tokens": 2216457.0,
"repeat_count": 0.0,
- "routers_loss": 0.004753436427563429,
+ "routers_loss": 0.005855285096913576,
"skip_count": 0.0,
"step": 1374,
"text_loss": 0.5123368501663208
@@ -13070,13 +13070,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09912109375,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009817749962596114,
- "loss": 0.0246,
+ "loss": 0.0249,
"macro_f1": 0.3272727429866791,
"num_tokens": 2219975.0,
"repeat_count": 1.0,
- "routers_loss": 0.06541594862937927,
+ "routers_loss": 0.0651634931564331,
"skip_count": 0.0,
"step": 1376,
"text_loss": 0.5999220609664917
@@ -13089,13 +13089,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009816920995166568,
- "loss": 0.0376,
+ "loss": 0.0371,
"macro_f1": 0.6666666865348816,
"num_tokens": 2222833.0,
"repeat_count": 1.0,
- "routers_loss": 0.01156456395983696,
+ "routers_loss": 0.011408994905650616,
"skip_count": 0.0,
"step": 1378,
"text_loss": 0.5323230624198914
@@ -13108,13 +13108,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2392578125,
+ "grad_norm": 0.205078125,
"learning_rate": 0.0009816090181883807,
- "loss": 0.033,
+ "loss": 0.0313,
"macro_f1": 0.32098764181137085,
"num_tokens": 2225842.0,
"repeat_count": 0.0,
- "routers_loss": 0.05175521597266197,
+ "routers_loss": 0.039720915257930756,
"skip_count": 2.0,
"step": 1380,
"text_loss": 0.23363439738750458
@@ -13127,13 +13127,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009815257523066204,
- "loss": 0.0251,
+ "loss": 0.0249,
"macro_f1": 0.3333333432674408,
"num_tokens": 2229430.0,
"repeat_count": 0.0,
- "routers_loss": 0.002684591803699732,
+ "routers_loss": 0.002765297656878829,
"skip_count": 0.0,
"step": 1382,
"text_loss": 0.718977689743042
@@ -13146,13 +13146,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.12890625,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009814423019032835,
- "loss": 0.0397,
+ "loss": 0.0396,
"macro_f1": 0.5492662787437439,
"num_tokens": 2232594.0,
"repeat_count": 2.0,
- "routers_loss": 0.054509978741407394,
+ "routers_loss": 0.05362323671579361,
"skip_count": 0.0,
"step": 1384,
"text_loss": 0.6392166614532471
@@ -13165,13 +13165,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009813586670103483,
"loss": 0.0426,
"macro_f1": 0.6603773832321167,
"num_tokens": 2236327.0,
"repeat_count": 1.0,
- "routers_loss": 0.04031623527407646,
+ "routers_loss": 0.031728316098451614,
"skip_count": 1.0,
"step": 1386,
"text_loss": 0.5951619148254395
@@ -13184,13 +13184,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.126953125,
"learning_rate": 0.0009812748476598638,
- "loss": 0.0308,
+ "loss": 0.031,
"macro_f1": 0.5492662787437439,
"num_tokens": 2239746.0,
"repeat_count": 0.0,
- "routers_loss": 0.039687711745500565,
+ "routers_loss": 0.03981253132224083,
"skip_count": 2.0,
"step": 1388,
"text_loss": 0.22756551206111908
@@ -13203,13 +13203,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.12451171875,
"learning_rate": 0.0009811908438839498,
- "loss": 0.0329,
+ "loss": 0.0331,
"macro_f1": 0.5492662787437439,
"num_tokens": 2242786.0,
"repeat_count": 0.0,
- "routers_loss": 0.04785723611712456,
+ "routers_loss": 0.04617162421345711,
"skip_count": 2.0,
"step": 1390,
"text_loss": 0.3233799934387207
@@ -13222,13 +13222,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.154296875,
"learning_rate": 0.000981106655714797,
- "loss": 0.0359,
+ "loss": 0.0358,
"macro_f1": 0.3272727429866791,
"num_tokens": 2245696.0,
"repeat_count": 0.0,
- "routers_loss": 0.046765491366386414,
+ "routers_loss": 0.046828847378492355,
"skip_count": 1.0,
"step": 1392,
"text_loss": 0.24273279309272766
@@ -13241,13 +13241,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009810222831846656,
- "loss": 0.0303,
+ "loss": 0.0307,
"macro_f1": 0.5492662787437439,
"num_tokens": 2249326.0,
"repeat_count": 0.0,
- "routers_loss": 0.015151665546000004,
+ "routers_loss": 0.010921589098870754,
"skip_count": 2.0,
"step": 1394,
"text_loss": 0.3921460807323456
@@ -13260,13 +13260,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009809377263258882,
- "loss": 0.0321,
+ "loss": 0.0315,
"macro_f1": 0.32098767161369324,
"num_tokens": 2253393.0,
"repeat_count": 0.0,
- "routers_loss": 0.04431106895208359,
+ "routers_loss": 0.04564022272825241,
"skip_count": 1.0,
"step": 1396,
"text_loss": 0.582602858543396
@@ -13279,13 +13279,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.103515625,
"learning_rate": 0.000980852985170867,
- "loss": 0.0317,
+ "loss": 0.0328,
"macro_f1": 0.3272727429866791,
"num_tokens": 2256626.0,
"repeat_count": 0.0,
- "routers_loss": 0.012700649909675121,
+ "routers_loss": 0.013289985246956348,
"skip_count": 0.0,
"step": 1398,
"text_loss": 0.41031694412231445
@@ -13298,13 +13298,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009807680597520745,
- "loss": 0.0256,
+ "loss": 0.0264,
"macro_f1": 0.3333333432674408,
"num_tokens": 2259326.0,
"repeat_count": 0.0,
- "routers_loss": 0.005919010378420353,
+ "routers_loss": 0.0065213534981012344,
"skip_count": 0.0,
"step": 1400,
"text_loss": 0.2888098657131195
@@ -13317,13 +13317,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.23046875,
"learning_rate": 0.0009806829501020546,
- "loss": 0.0372,
+ "loss": 0.0358,
"macro_f1": 0.3272727429866791,
"num_tokens": 2262344.0,
"repeat_count": 0.0,
- "routers_loss": 0.04717765748500824,
+ "routers_loss": 0.04199840500950813,
"skip_count": 1.0,
"step": 1402,
"text_loss": 0.31973034143447876
@@ -13336,13 +13336,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009805976562534215,
"loss": 0.0317,
"macro_f1": 0.6603773832321167,
"num_tokens": 2266354.0,
"repeat_count": 1.0,
- "routers_loss": 0.015415813773870468,
+ "routers_loss": 0.015434930101037025,
"skip_count": 1.0,
"step": 1404,
"text_loss": 0.508630633354187
@@ -13355,13 +13355,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009805121782388599,
"loss": 0.0339,
"macro_f1": 0.6533333659172058,
"num_tokens": 2269660.0,
"repeat_count": 2.0,
- "routers_loss": 0.06812979280948639,
+ "routers_loss": 0.0720924660563469,
"skip_count": 2.0,
"step": 1406,
"text_loss": 0.40927737951278687
@@ -13374,13 +13374,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05908203125,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009804265160911253,
- "loss": 0.0265,
+ "loss": 0.0266,
"macro_f1": 0.5492662787437439,
"num_tokens": 2273335.0,
"repeat_count": 0.0,
- "routers_loss": 0.025383235886693,
+ "routers_loss": 0.02400495670735836,
"skip_count": 2.0,
"step": 1408,
"text_loss": 0.1777762621641159
@@ -13393,13 +13393,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.2314453125,
"learning_rate": 0.0009803406698430433,
- "loss": 0.0367,
+ "loss": 0.0371,
"macro_f1": 0.3272727429866791,
"num_tokens": 2277107.0,
"repeat_count": 0.0,
- "routers_loss": 0.026493225246667862,
+ "routers_loss": 0.02560107782483101,
"skip_count": 1.0,
"step": 1410,
"text_loss": 0.17955881357192993
@@ -13412,13 +13412,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009802546395275104,
- "loss": 0.0342,
+ "loss": 0.0349,
"macro_f1": 0.3333333432674408,
"num_tokens": 2281638.0,
"repeat_count": 0.0,
- "routers_loss": 0.006616846192628145,
+ "routers_loss": 0.006655813194811344,
"skip_count": 0.0,
"step": 1412,
"text_loss": 0.20882295072078705
@@ -13431,32 +13431,32 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.000980168425177494,
- "loss": 0.0328,
+ "loss": 0.0342,
"macro_f1": 0.8200000524520874,
"num_tokens": 2284876.0,
"repeat_count": 1.0,
- "routers_loss": 0.060631848871707916,
+ "routers_loss": 0.06325097382068634,
"skip_count": 3.0,
"step": 1414,
"text_loss": 0.26035264134407043
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 6.648077487525683,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.138671875,
"learning_rate": 0.000980082026826031,
- "loss": 0.0317,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0315,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2288938.0,
"repeat_count": 1.0,
- "routers_loss": 0.011199389584362507,
+ "routers_loss": 0.013436575420200825,
"skip_count": 0.0,
"step": 1416,
"text_loss": 0.5502325892448425
@@ -13469,13 +13469,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0009799954445062296,
- "loss": 0.0192,
+ "loss": 0.0193,
"macro_f1": 0.6603773832321167,
"num_tokens": 2292317.0,
"repeat_count": 1.0,
- "routers_loss": 0.01120354700833559,
+ "routers_loss": 0.011264479719102383,
"skip_count": 1.0,
"step": 1418,
"text_loss": 0.48075684905052185
@@ -13488,13 +13488,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009799086782512686,
- "loss": 0.0294,
+ "loss": 0.0292,
"macro_f1": 0.5492662787437439,
"num_tokens": 2295935.0,
"repeat_count": 0.0,
- "routers_loss": 0.030204148963093758,
+ "routers_loss": 0.02833271212875843,
"skip_count": 2.0,
"step": 1420,
"text_loss": 0.18221206963062286
@@ -13507,13 +13507,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.09375,
"learning_rate": 0.0009798217280943967,
- "loss": 0.0348,
+ "loss": 0.0356,
"macro_f1": 0.6666666865348816,
"num_tokens": 2298927.0,
"repeat_count": 0.0,
- "routers_loss": 0.008244800381362438,
+ "routers_loss": 0.009208574891090393,
"skip_count": 1.0,
"step": 1422,
"text_loss": 0.48686322569847107
@@ -13526,32 +13526,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009797345940689335,
- "loss": 0.0269,
+ "loss": 0.0267,
"macro_f1": 0.3272727429866791,
"num_tokens": 2301541.0,
"repeat_count": 0.0,
- "routers_loss": 0.015340043231844902,
+ "routers_loss": 0.015011847950518131,
"skip_count": 0.0,
"step": 1424,
"text_loss": 0.49446266889572144
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6000000238418579,
- "avg_layers": 25.0,
+ "acc_skip": 0.4000000059604645,
+ "avg_layers": 26.0,
"epoch": 6.695039624302906,
- "f1_execute": 0.9583333134651184,
+ "f1_execute": 0.9387754797935486,
"f1_repeat": 0.0,
- "f1_skip": 0.75,
- "grad_norm": 0.1318359375,
+ "f1_skip": 0.5714285969734192,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.0009796472762082687,
- "loss": 0.0341,
- "macro_f1": 0.5694444179534912,
+ "loss": 0.0338,
+ "macro_f1": 0.5034013986587524,
"num_tokens": 2304589.0,
"repeat_count": 0.0,
- "routers_loss": 0.058681465685367584,
+ "routers_loss": 0.05912091210484505,
"skip_count": 5.0,
"step": 1426,
"text_loss": 0.23945684731006622
@@ -13564,32 +13564,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.09765625,
"learning_rate": 0.000979559774545863,
- "loss": 0.0423,
+ "loss": 0.0405,
"macro_f1": 0.3272727429866791,
"num_tokens": 2307860.0,
"repeat_count": 0.0,
- "routers_loss": 0.020810559391975403,
+ "routers_loss": 0.021242303773760796,
"skip_count": 1.0,
"step": 1428,
"text_loss": 0.531273365020752
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 6.713824479013795,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.09033203125,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.000979472089115247,
- "loss": 0.0268,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0276,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 2311581.0,
"repeat_count": 0.0,
- "routers_loss": 0.030001837760210037,
+ "routers_loss": 0.02768544852733612,
"skip_count": 2.0,
"step": 1430,
"text_loss": 0.2497459501028061
@@ -13602,13 +13602,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1318359375,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.000979384219950022,
- "loss": 0.034,
+ "loss": 0.0346,
"macro_f1": 0.3333333432674408,
"num_tokens": 2314639.0,
"repeat_count": 0.0,
- "routers_loss": 0.010381575673818588,
+ "routers_loss": 0.008678150363266468,
"skip_count": 0.0,
"step": 1432,
"text_loss": 0.6579355001449585
@@ -13621,32 +13621,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08056640625,
"learning_rate": 0.0009792961670838595,
- "loss": 0.0365,
+ "loss": 0.0362,
"macro_f1": 0.3272727429866791,
"num_tokens": 2317927.0,
"repeat_count": 1.0,
- "routers_loss": 0.03234704211354256,
+ "routers_loss": 0.03325597569346428,
"skip_count": 0.0,
"step": 1434,
"text_loss": 0.5209436416625977
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 6.742001761080129,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009792079305505016,
- "loss": 0.0303,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0306,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2321065.0,
"repeat_count": 1.0,
- "routers_loss": 0.015481291338801384,
+ "routers_loss": 0.019228918477892876,
"skip_count": 0.0,
"step": 1436,
"text_loss": 0.41087067127227783
@@ -13659,13 +13659,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.000979119510383761,
- "loss": 0.0366,
+ "loss": 0.0371,
"macro_f1": 0.3333333432674408,
"num_tokens": 2323714.0,
"repeat_count": 0.0,
- "routers_loss": 0.018170451745390892,
+ "routers_loss": 0.017071325331926346,
"skip_count": 0.0,
"step": 1438,
"text_loss": 0.21490029990673065
@@ -13678,13 +13678,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.2060546875,
"learning_rate": 0.00097903090661752,
- "loss": 0.0306,
+ "loss": 0.0309,
"macro_f1": 0.3333333432674408,
"num_tokens": 2326454.0,
"repeat_count": 0.0,
- "routers_loss": 0.010385681875050068,
+ "routers_loss": 0.00991755723953247,
"skip_count": 0.0,
"step": 1440,
"text_loss": 0.23847346007823944
@@ -13697,13 +13697,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.232421875,
"learning_rate": 0.000978942119285732,
- "loss": 0.0407,
+ "loss": 0.0404,
"macro_f1": 0.3272727429866791,
"num_tokens": 2329462.0,
"repeat_count": 0.0,
- "routers_loss": 0.04976538568735123,
+ "routers_loss": 0.04908733069896698,
"skip_count": 1.0,
"step": 1442,
"text_loss": 0.23343028128147125
@@ -13716,13 +13716,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009788531484224204,
- "loss": 0.0255,
+ "loss": 0.0264,
"macro_f1": 0.3333333432674408,
"num_tokens": 2332146.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030266831163316965,
+ "routers_loss": 0.0032628148328512907,
"skip_count": 0.0,
"step": 1444,
"text_loss": 0.47423800826072693
@@ -13730,18 +13730,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.3333333432674408,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 6.788963897857353,
- "f1_execute": 0.9600000381469727,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 0.5,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009787639940616788,
- "loss": 0.0411,
- "macro_f1": 0.8200000524520874,
+ "loss": 0.0405,
+ "macro_f1": 0.7018141150474548,
"num_tokens": 2335738.0,
"repeat_count": 1.0,
- "routers_loss": 0.13420957326889038,
+ "routers_loss": 0.14336998760700226,
"skip_count": 3.0,
"step": 1446,
"text_loss": 0.21837592124938965
@@ -13754,13 +13754,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.189453125,
"learning_rate": 0.0009786746562376717,
- "loss": 0.0251,
+ "loss": 0.0241,
"macro_f1": 0.6666666865348816,
"num_tokens": 2338488.0,
"repeat_count": 0.0,
- "routers_loss": 0.012779864482581615,
+ "routers_loss": 0.010542908683419228,
"skip_count": 1.0,
"step": 1448,
"text_loss": 1.0614757537841797
@@ -13773,13 +13773,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009785851349846334,
- "loss": 0.0266,
+ "loss": 0.0268,
"macro_f1": 0.3333333432674408,
"num_tokens": 2342074.0,
"repeat_count": 0.0,
- "routers_loss": 0.005545398220419884,
+ "routers_loss": 0.005998016335070133,
"skip_count": 0.0,
"step": 1450,
"text_loss": 0.4269719421863556
@@ -13792,13 +13792,13 @@
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009784954303368686,
- "loss": 0.0395,
+ "loss": 0.0384,
"macro_f1": 0.44705885648727417,
"num_tokens": 2345838.0,
"repeat_count": 0.0,
- "routers_loss": 0.0899835154414177,
+ "routers_loss": 0.0959126204252243,
"skip_count": 3.0,
"step": 1452,
"text_loss": 0.3315916955471039
@@ -13811,13 +13811,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.0009784055423287521,
"loss": 0.0218,
"macro_f1": 0.3333333432674408,
"num_tokens": 2348939.0,
"repeat_count": 0.0,
- "routers_loss": 0.002738836221396923,
+ "routers_loss": 0.0025467623490840197,
"skip_count": 0.0,
"step": 1454,
"text_loss": 0.6162732839584351
@@ -13830,13 +13830,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009783154709947293,
- "loss": 0.0266,
+ "loss": 0.0256,
"macro_f1": 0.3272727429866791,
"num_tokens": 2352232.0,
"repeat_count": 0.0,
- "routers_loss": 0.020522192120552063,
+ "routers_loss": 0.01860538125038147,
"skip_count": 1.0,
"step": 1456,
"text_loss": 0.23928768932819366
@@ -13844,18 +13844,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 6.84531846199002,
- "f1_execute": 0.9629629850387573,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009782252163693158,
- "loss": 0.0197,
- "macro_f1": 0.32098767161369324,
+ "loss": 0.0201,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2355159.0,
"repeat_count": 0.0,
- "routers_loss": 0.04245268926024437,
+ "routers_loss": 0.04412713274359703,
"skip_count": 1.0,
"step": 1458,
"text_loss": 0.3371323347091675
@@ -13868,13 +13868,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.224609375,
+ "grad_norm": 0.21484375,
"learning_rate": 0.0009781347784870973,
- "loss": 0.0376,
+ "loss": 0.0379,
"macro_f1": 0.3333333432674408,
"num_tokens": 2358175.0,
"repeat_count": 0.0,
- "routers_loss": 0.009142685681581497,
+ "routers_loss": 0.006809141952544451,
"skip_count": 0.0,
"step": 1460,
"text_loss": 0.547267735004425
@@ -13887,13 +13887,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009780441573827296,
- "loss": 0.0295,
+ "loss": 0.03,
"macro_f1": 0.3076923191547394,
"num_tokens": 2360991.0,
"repeat_count": 0.0,
- "routers_loss": 0.08038893342018127,
+ "routers_loss": 0.08924390375614166,
"skip_count": 4.0,
"step": 1462,
"text_loss": 0.7026563882827759
@@ -13906,13 +13906,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.1865234375,
"learning_rate": 0.000977953353090939,
- "loss": 0.027,
+ "loss": 0.0272,
"macro_f1": 0.3333333432674408,
"num_tokens": 2363894.0,
"repeat_count": 0.0,
- "routers_loss": 0.02107175625860691,
+ "routers_loss": 0.021858472377061844,
"skip_count": 0.0,
"step": 1464,
"text_loss": 0.2718065083026886
@@ -13925,13 +13925,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0009778623656465219,
- "loss": 0.0349,
+ "loss": 0.0338,
"macro_f1": 0.32098764181137085,
"num_tokens": 2367265.0,
"repeat_count": 0.0,
- "routers_loss": 0.042030055075883865,
+ "routers_loss": 0.044781096279621124,
"skip_count": 0.0,
"step": 1466,
"text_loss": 0.5008095502853394
@@ -13944,13 +13944,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009777711950843448,
- "loss": 0.022,
+ "loss": 0.0212,
"macro_f1": 0.3333333432674408,
"num_tokens": 2370186.0,
"repeat_count": 0.0,
- "routers_loss": 0.004230673424899578,
+ "routers_loss": 0.0040459707379341125,
"skip_count": 0.0,
"step": 1468,
"text_loss": 0.5242461562156677
@@ -13963,13 +13963,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009776798414393446,
- "loss": 0.0284,
+ "loss": 0.0279,
"macro_f1": 0.6598639488220215,
"num_tokens": 2373314.0,
"repeat_count": 1.0,
- "routers_loss": 0.06986775249242783,
+ "routers_loss": 0.0708528608083725,
"skip_count": 3.0,
"step": 1470,
"text_loss": 0.2821732461452484
@@ -13982,13 +13982,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.142578125,
+ "grad_norm": 0.1328125,
"learning_rate": 0.0009775883047465279,
- "loss": 0.0431,
+ "loss": 0.0414,
"macro_f1": 0.31446540355682373,
"num_tokens": 2376435.0,
"repeat_count": 1.0,
- "routers_loss": 0.0439564548432827,
+ "routers_loss": 0.0290578193962574,
"skip_count": 1.0,
"step": 1472,
"text_loss": 0.8438440561294556
@@ -14001,13 +14001,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.10546875,
"learning_rate": 0.000977496585040972,
- "loss": 0.0376,
+ "loss": 0.0373,
"macro_f1": 0.3333333432674408,
"num_tokens": 2380244.0,
"repeat_count": 0.0,
- "routers_loss": 0.011889892630279064,
+ "routers_loss": 0.010360375046730042,
"skip_count": 0.0,
"step": 1474,
"text_loss": 0.4356135427951813
@@ -14020,13 +14020,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.000977404682357824,
- "loss": 0.0295,
+ "loss": 0.0294,
"macro_f1": 0.3272727429866791,
"num_tokens": 2383498.0,
"repeat_count": 0.0,
- "routers_loss": 0.022536326199769974,
+ "routers_loss": 0.023518972098827362,
"skip_count": 0.0,
"step": 1476,
"text_loss": 0.25195425748825073
@@ -14039,13 +14039,13 @@
"f1_execute": 0.9743589162826538,
"f1_repeat": 0.888888955116272,
"f1_skip": 1.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.000977312596732301,
- "loss": 0.0388,
+ "loss": 0.0375,
"macro_f1": 0.9544159770011902,
"num_tokens": 2386414.0,
"repeat_count": 5.0,
- "routers_loss": 0.07959948480129242,
+ "routers_loss": 0.08190606534481049,
"skip_count": 4.0,
"step": 1478,
"text_loss": 0.6586798429489136
@@ -14058,13 +14058,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.10546875,
"learning_rate": 0.0009772203281996905,
- "loss": 0.0341,
+ "loss": 0.0336,
"macro_f1": 1.0,
"num_tokens": 2389399.0,
"repeat_count": 1.0,
- "routers_loss": 0.019112225621938705,
+ "routers_loss": 0.016441475600004196,
"skip_count": 2.0,
"step": 1480,
"text_loss": 0.3671986758708954
@@ -14077,13 +14077,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009771278767953502,
- "loss": 0.0345,
+ "loss": 0.0357,
"macro_f1": 0.3333333432674408,
"num_tokens": 2392400.0,
"repeat_count": 0.0,
- "routers_loss": 0.018750866875052452,
+ "routers_loss": 0.019211363047361374,
"skip_count": 0.0,
"step": 1482,
"text_loss": 0.27418580651283264
@@ -14096,32 +14096,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09228515625,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009770352425547072,
- "loss": 0.0291,
+ "loss": 0.0292,
"macro_f1": 0.3333333432674408,
"num_tokens": 2395123.0,
"repeat_count": 0.0,
- "routers_loss": 0.015407348051667213,
+ "routers_loss": 0.015800386667251587,
"skip_count": 0.0,
"step": 1484,
"text_loss": 0.19896622002124786
},
{
- "acc_repeat": 0.6666666865348816,
+ "acc_repeat": 0.3333333432674408,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 29.0,
"epoch": 6.976812444966246,
- "f1_execute": 0.9803921580314636,
- "f1_repeat": 0.800000011920929,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.5,
"f1_skip": 0.0,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009769424255132596,
- "loss": 0.0258,
- "macro_f1": 0.5934640765190125,
+ "loss": 0.0256,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 2397359.0,
"repeat_count": 3.0,
- "routers_loss": 0.06514479219913483,
+ "routers_loss": 0.06670158356428146,
"skip_count": 0.0,
"step": 1486,
"text_loss": 0.4229799509048462
@@ -14134,13 +14134,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.111328125,
+ "grad_norm": 0.1162109375,
"learning_rate": 0.0009768494257065747,
- "loss": 0.0217,
+ "loss": 0.0218,
"macro_f1": 0.3272727429866791,
"num_tokens": 2400387.0,
"repeat_count": 0.0,
- "routers_loss": 0.013567833229899406,
+ "routers_loss": 0.011144762858748436,
"skip_count": 1.0,
"step": 1488,
"text_loss": 0.4264226257801056
@@ -14153,13 +14153,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12255859375,
+ "grad_norm": 0.12353515625,
"learning_rate": 0.0009767562431702904,
- "loss": 0.0389,
+ "loss": 0.0387,
"macro_f1": 0.3006536364555359,
"num_tokens": 2403241.0,
"repeat_count": 2.0,
- "routers_loss": 0.13762018084526062,
+ "routers_loss": 0.12339717149734497,
"skip_count": 3.0,
"step": 1490,
"text_loss": 0.2850193977355957
@@ -14172,13 +14172,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0009766628779401142,
- "loss": 0.0214,
+ "loss": 0.0215,
"macro_f1": 0.6666666865348816,
"num_tokens": 2406087.0,
"repeat_count": 0.0,
- "routers_loss": 0.008640666492283344,
+ "routers_loss": 0.008174685761332512,
"skip_count": 1.0,
"step": 1492,
"text_loss": 0.6756544709205627
@@ -14191,13 +14191,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05712890625,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.000976569330051824,
- "loss": 0.0182,
+ "loss": 0.0186,
"macro_f1": 0.3333333432674408,
"num_tokens": 2409312.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018257038900628686,
+ "routers_loss": 0.0021256296895444393,
"skip_count": 0.0,
"step": 1494,
"text_loss": 0.4789894223213196
@@ -14210,13 +14210,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0009764755995412677,
"loss": 0.0193,
"macro_f1": 0.3333333432674408,
"num_tokens": 2412758.0,
"repeat_count": 0.0,
- "routers_loss": 0.003656312357634306,
+ "routers_loss": 0.003944927826523781,
"skip_count": 0.0,
"step": 1496,
"text_loss": 0.5157490968704224
@@ -14229,13 +14229,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009763816864443627,
- "loss": 0.0246,
+ "loss": 0.0239,
"macro_f1": 0.3272727429866791,
"num_tokens": 2416079.0,
"repeat_count": 1.0,
- "routers_loss": 0.044268425554037094,
+ "routers_loss": 0.03893325850367546,
"skip_count": 0.0,
"step": 1498,
"text_loss": 0.28045418858528137
@@ -14248,13 +14248,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.0009762875907970968,
- "loss": 0.0207,
+ "loss": 0.0199,
"macro_f1": 0.3333333432674408,
"num_tokens": 2420340.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018966116476804018,
+ "routers_loss": 0.0017725443467497826,
"skip_count": 0.0,
"step": 1500,
"text_loss": 0.35550856590270996
@@ -14267,32 +14267,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0009761933126355277,
- "loss": 0.0249,
+ "loss": 0.0245,
"macro_f1": 0.3272727429866791,
"num_tokens": 2424735.0,
"repeat_count": 0.0,
- "routers_loss": 0.01729201152920723,
+ "routers_loss": 0.01393749937415123,
"skip_count": 1.0,
"step": 1502,
"text_loss": 0.38840189576148987
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 7.06105077781039,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.11962890625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009760988519957828,
- "loss": 0.0248,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0249,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 2428132.0,
"repeat_count": 0.0,
- "routers_loss": 0.01693531684577465,
+ "routers_loss": 0.01687910407781601,
"skip_count": 2.0,
"step": 1504,
"text_loss": 0.3031681478023529
@@ -14305,13 +14305,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0009760042089140598,
- "loss": 0.0197,
+ "loss": 0.0193,
"macro_f1": 0.3144654333591461,
"num_tokens": 2431592.0,
"repeat_count": 1.0,
- "routers_loss": 0.04939094930887222,
+ "routers_loss": 0.04704280197620392,
"skip_count": 2.0,
"step": 1506,
"text_loss": 0.16355200111865997
@@ -14324,13 +14324,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009759093834266259,
- "loss": 0.0213,
+ "loss": 0.0206,
"macro_f1": 0.3333333432674408,
"num_tokens": 2434236.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016892930725589395,
+ "routers_loss": 0.0016075772000476718,
"skip_count": 0.0,
"step": 1508,
"text_loss": 0.6080073118209839
@@ -14343,13 +14343,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009758143755698186,
- "loss": 0.0147,
+ "loss": 0.015,
"macro_f1": 0.3333333432674408,
"num_tokens": 2437170.0,
"repeat_count": 0.0,
- "routers_loss": 0.008671467192471027,
+ "routers_loss": 0.008451299741864204,
"skip_count": 0.0,
"step": 1510,
"text_loss": 0.22100484371185303
@@ -14362,13 +14362,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009757191853800449,
- "loss": 0.0228,
+ "loss": 0.0227,
"macro_f1": 0.5866667032241821,
"num_tokens": 2441187.0,
"repeat_count": 1.0,
- "routers_loss": 0.042682576924562454,
+ "routers_loss": 0.046565692871809006,
"skip_count": 3.0,
"step": 1512,
"text_loss": 0.25098952651023865
@@ -14381,13 +14381,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.11279296875,
"learning_rate": 0.000975623812893782,
- "loss": 0.028,
+ "loss": 0.0276,
"macro_f1": 0.3272727429866791,
"num_tokens": 2444664.0,
"repeat_count": 0.0,
- "routers_loss": 0.02905822917819023,
+ "routers_loss": 0.02872578240931034,
"skip_count": 1.0,
"step": 1514,
"text_loss": 0.4952253997325897
@@ -14400,13 +14400,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09326171875,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.0009755282581475768,
- "loss": 0.0223,
+ "loss": 0.0233,
"macro_f1": 0.3333333432674408,
"num_tokens": 2447748.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018810008186846972,
+ "routers_loss": 0.002055214950814843,
"skip_count": 0.0,
"step": 1516,
"text_loss": 0.7465500831604004
@@ -14419,13 +14419,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.10302734375,
"learning_rate": 0.000975432521178046,
- "loss": 0.0219,
+ "loss": 0.0216,
"macro_f1": 0.3272727429866791,
"num_tokens": 2450834.0,
"repeat_count": 1.0,
- "routers_loss": 0.04308714717626572,
+ "routers_loss": 0.04498551785945892,
"skip_count": 0.0,
"step": 1518,
"text_loss": 0.28144413232803345
@@ -14438,13 +14438,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009753366020218763,
- "loss": 0.0232,
+ "loss": 0.0234,
"macro_f1": 0.3333333432674408,
"num_tokens": 2454233.0,
"repeat_count": 0.0,
- "routers_loss": 0.003754811594262719,
+ "routers_loss": 0.003669742727652192,
"skip_count": 0.0,
"step": 1520,
"text_loss": 0.5667551755905151
@@ -14457,32 +14457,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009752405007158238,
- "loss": 0.0246,
+ "loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 2457331.0,
"repeat_count": 0.0,
- "routers_loss": 0.010853761807084084,
+ "routers_loss": 0.010455607436597347,
"skip_count": 0.0,
"step": 1522,
"text_loss": 0.19575810432434082
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.5,
"acc_skip": 1.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 7.154975051364837,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0009751442172967151,
- "loss": 0.0196,
- "macro_f1": 1.0,
+ "loss": 0.0193,
+ "macro_f1": 0.8823530077934265,
"num_tokens": 2459935.0,
"repeat_count": 2.0,
- "routers_loss": 0.015100379474461079,
+ "routers_loss": 0.025189083069562912,
"skip_count": 1.0,
"step": 1524,
"text_loss": 0.45453405380249023
@@ -14495,13 +14495,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.000975047751801446,
- "loss": 0.0189,
+ "loss": 0.0187,
"macro_f1": 0.3272727429866791,
"num_tokens": 2463008.0,
"repeat_count": 0.0,
- "routers_loss": 0.011991916224360466,
+ "routers_loss": 0.012297490611672401,
"skip_count": 0.0,
"step": 1526,
"text_loss": 0.31437572836875916
@@ -14514,32 +14514,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009749511042669823,
- "loss": 0.0226,
+ "loss": 0.0233,
"macro_f1": 0.3333333432674408,
"num_tokens": 2466475.0,
"repeat_count": 0.0,
- "routers_loss": 0.008201062679290771,
+ "routers_loss": 0.011026266030967236,
"skip_count": 0.0,
"step": 1528,
"text_loss": 0.46604859828948975
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 7.183152333431171,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.1181640625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1376953125,
"learning_rate": 0.0009748542747303595,
- "loss": 0.0174,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0182,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2469320.0,
"repeat_count": 0.0,
- "routers_loss": 0.008513177745044231,
+ "routers_loss": 0.011934996582567692,
"skip_count": 1.0,
"step": 1530,
"text_loss": 0.7764923572540283
@@ -14552,13 +14552,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.0966796875,
"learning_rate": 0.0009747572632286827,
- "loss": 0.02,
+ "loss": 0.0203,
"macro_f1": 0.3333333432674408,
"num_tokens": 2472468.0,
"repeat_count": 0.0,
- "routers_loss": 0.004850955214351416,
+ "routers_loss": 0.005786920432001352,
"skip_count": 0.0,
"step": 1532,
"text_loss": 0.3555782437324524
@@ -14571,32 +14571,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.087890625,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009746600697991271,
- "loss": 0.0206,
+ "loss": 0.02,
"macro_f1": 0.6666666865348816,
"num_tokens": 2475736.0,
"repeat_count": 1.0,
- "routers_loss": 0.0027650354895740747,
+ "routers_loss": 0.0026990731712430716,
"skip_count": 0.0,
"step": 1534,
"text_loss": 0.49561792612075806
},
{
"acc_repeat": 1.0,
- "acc_skip": 0.0,
- "avg_layers": 29.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
"epoch": 7.2113296154975055,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
- "f1_skip": 0.0,
- "grad_norm": 0.0615234375,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0556640625,
"learning_rate": 0.0009745626944789375,
- "loss": 0.0209,
- "macro_f1": 0.6538461446762085,
+ "loss": 0.0204,
+ "macro_f1": 0.8823530077934265,
"num_tokens": 2478887.0,
"repeat_count": 1.0,
- "routers_loss": 0.023268593475222588,
+ "routers_loss": 0.020221207290887833,
"skip_count": 2.0,
"step": 1536,
"text_loss": 0.5375416278839111
@@ -14609,13 +14609,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11669921875,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009744651373054279,
"loss": 0.0286,
"macro_f1": 0.3272727429866791,
"num_tokens": 2481293.0,
"repeat_count": 0.0,
- "routers_loss": 0.031235001981258392,
+ "routers_loss": 0.03131086751818657,
"skip_count": 1.0,
"step": 1538,
"text_loss": 0.5241039395332336
@@ -14628,13 +14628,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009743673983159828,
- "loss": 0.023,
+ "loss": 0.0241,
"macro_f1": 0.6122449040412903,
"num_tokens": 2484403.0,
"repeat_count": 0.0,
- "routers_loss": 0.042398080229759216,
+ "routers_loss": 0.04448170214891434,
"skip_count": 4.0,
"step": 1540,
"text_loss": 0.7465724349021912
@@ -14647,13 +14647,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009742694775480557,
- "loss": 0.0268,
+ "loss": 0.0265,
"macro_f1": 0.6666666865348816,
"num_tokens": 2487952.0,
"repeat_count": 0.0,
- "routers_loss": 0.007361465133726597,
+ "routers_loss": 0.007171491626650095,
"skip_count": 1.0,
"step": 1542,
"text_loss": 0.2877117097377777
@@ -14666,13 +14666,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009741713750391703,
- "loss": 0.0166,
+ "loss": 0.0171,
"macro_f1": 0.6666666865348816,
"num_tokens": 2490815.0,
"repeat_count": 1.0,
- "routers_loss": 0.0052334014326334,
+ "routers_loss": 0.004559285007417202,
"skip_count": 0.0,
"step": 1544,
"text_loss": 0.6097800135612488
@@ -14685,13 +14685,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.06787109375,
"learning_rate": 0.0009740730908269193,
"loss": 0.0174,
"macro_f1": 0.3333333432674408,
"num_tokens": 2494727.0,
"repeat_count": 0.0,
- "routers_loss": 0.004993532784283161,
+ "routers_loss": 0.005271553061902523,
"skip_count": 0.0,
"step": 1546,
"text_loss": 0.5431114435195923
@@ -14704,13 +14704,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009739746249489658,
- "loss": 0.0248,
+ "loss": 0.0239,
"macro_f1": 0.3333333432674408,
"num_tokens": 2499266.0,
"repeat_count": 0.0,
- "routers_loss": 0.001611889572814107,
+ "routers_loss": 0.0015409323386847973,
"skip_count": 0.0,
"step": 1548,
"text_loss": 0.4702678322792053
@@ -14723,13 +14723,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.1171875,
"learning_rate": 0.0009738759774430417,
- "loss": 0.0209,
+ "loss": 0.0216,
"macro_f1": 0.32098764181137085,
"num_tokens": 2502273.0,
"repeat_count": 1.0,
- "routers_loss": 0.03059260919690132,
+ "routers_loss": 0.030183158814907074,
"skip_count": 1.0,
"step": 1550,
"text_loss": 0.3239189088344574
@@ -14742,32 +14742,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056396484375,
+ "grad_norm": 0.0498046875,
"learning_rate": 0.0009737771483469493,
- "loss": 0.0195,
+ "loss": 0.0196,
"macro_f1": 0.3333333432674408,
"num_tokens": 2507624.0,
"repeat_count": 0.0,
- "routers_loss": 0.00508903618901968,
+ "routers_loss": 0.005410848651081324,
"skip_count": 0.0,
"step": 1552,
"text_loss": 0.4014642834663391
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 7.295861461696507,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
+ "f1_skip": 1.0,
"grad_norm": 0.07763671875,
"learning_rate": 0.0009736781376985598,
- "loss": 0.0174,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0168,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 2510366.0,
"repeat_count": 0.0,
- "routers_loss": 0.007860450074076653,
+ "routers_loss": 0.0066976165398955345,
"skip_count": 1.0,
"step": 1554,
"text_loss": 0.5924848914146423
@@ -14780,13 +14780,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11669921875,
+ "grad_norm": 0.13671875,
"learning_rate": 0.0009735789455358144,
- "loss": 0.0217,
+ "loss": 0.022,
"macro_f1": 0.3333333432674408,
"num_tokens": 2513317.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027370608877390623,
+ "routers_loss": 0.002763477386906743,
"skip_count": 0.0,
"step": 1556,
"text_loss": 0.3222943842411041
@@ -14799,13 +14799,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.11767578125,
"learning_rate": 0.0009734795718967237,
- "loss": 0.0276,
+ "loss": 0.0283,
"macro_f1": 0.32098764181137085,
"num_tokens": 2516628.0,
"repeat_count": 0.0,
- "routers_loss": 0.061584725975990295,
+ "routers_loss": 0.061566028743982315,
"skip_count": 2.0,
"step": 1558,
"text_loss": 0.3249334692955017
@@ -14818,13 +14818,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009733800168193679,
"loss": 0.0228,
"macro_f1": 1.0,
"num_tokens": 2519424.0,
"repeat_count": 2.0,
- "routers_loss": 0.01694316789507866,
+ "routers_loss": 0.017976421862840652,
"skip_count": 4.0,
"step": 1560,
"text_loss": 0.3341919481754303
@@ -14837,13 +14837,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1826171875,
"learning_rate": 0.0009732802803418966,
- "loss": 0.0234,
+ "loss": 0.023,
"macro_f1": 0.3333333432674408,
"num_tokens": 2522922.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023331891279667616,
+ "routers_loss": 0.002525332849472761,
"skip_count": 0.0,
"step": 1562,
"text_loss": 0.3176332712173462
@@ -14856,13 +14856,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.07861328125,
"learning_rate": 0.0009731803625025292,
- "loss": 0.0203,
+ "loss": 0.0196,
"macro_f1": 0.3272727429866791,
"num_tokens": 2525811.0,
"repeat_count": 0.0,
- "routers_loss": 0.021300682798027992,
+ "routers_loss": 0.015524424612522125,
"skip_count": 1.0,
"step": 1564,
"text_loss": 0.532774031162262
@@ -14875,13 +14875,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.0009730802633395541,
- "loss": 0.026,
+ "loss": 0.0257,
"macro_f1": 0.6603773832321167,
"num_tokens": 2529157.0,
"repeat_count": 1.0,
- "routers_loss": 0.08335043489933014,
+ "routers_loss": 0.08138631284236908,
"skip_count": 1.0,
"step": 1566,
"text_loss": 0.529487133026123
@@ -14894,13 +14894,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009729799828913298,
- "loss": 0.0224,
+ "loss": 0.0223,
"macro_f1": 0.3333333432674408,
"num_tokens": 2532249.0,
"repeat_count": 0.0,
- "routers_loss": 0.003535634372383356,
+ "routers_loss": 0.0035867292899638414,
"skip_count": 0.0,
"step": 1568,
"text_loss": 0.503160297870636
@@ -14913,13 +14913,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009728795211962838,
"loss": 0.0259,
"macro_f1": 0.5492662787437439,
"num_tokens": 2535904.0,
"repeat_count": 0.0,
- "routers_loss": 0.025729363784193993,
+ "routers_loss": 0.02987455204129219,
"skip_count": 2.0,
"step": 1570,
"text_loss": 0.9170270562171936
@@ -14932,13 +14932,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.11865234375,
"learning_rate": 0.0009727788782929131,
- "loss": 0.0287,
+ "loss": 0.0273,
"macro_f1": 0.3272727429866791,
"num_tokens": 2538943.0,
"repeat_count": 1.0,
- "routers_loss": 0.059166863560676575,
+ "routers_loss": 0.04676021635532379,
"skip_count": 0.0,
"step": 1572,
"text_loss": 0.29146310687065125
@@ -14951,13 +14951,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0009726780542197844,
- "loss": 0.0173,
+ "loss": 0.0169,
"macro_f1": 0.3333333432674408,
"num_tokens": 2541805.0,
"repeat_count": 0.0,
- "routers_loss": 0.002580022206529975,
+ "routers_loss": 0.002127803163602948,
"skip_count": 0.0,
"step": 1574,
"text_loss": 1.0126502513885498
@@ -14970,13 +14970,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009725770490155338,
- "loss": 0.0257,
+ "loss": 0.0262,
"macro_f1": 0.3333333432674408,
"num_tokens": 2546213.0,
"repeat_count": 0.0,
- "routers_loss": 0.007746981456875801,
+ "routers_loss": 0.007609677035361528,
"skip_count": 0.0,
"step": 1576,
"text_loss": 0.190168559551239
@@ -14989,13 +14989,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.083984375,
"learning_rate": 0.0009724758627188665,
- "loss": 0.0344,
+ "loss": 0.0356,
"macro_f1": 0.3272727429866791,
"num_tokens": 2549554.0,
"repeat_count": 0.0,
- "routers_loss": 0.027308562770485878,
+ "routers_loss": 0.033554721623659134,
"skip_count": 1.0,
"step": 1578,
"text_loss": 0.2977406084537506
@@ -15008,13 +15008,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009723744953685572,
- "loss": 0.0277,
+ "loss": 0.028,
"macro_f1": 0.3272727429866791,
"num_tokens": 2552785.0,
"repeat_count": 1.0,
- "routers_loss": 0.029863199219107628,
+ "routers_loss": 0.027864238247275352,
"skip_count": 0.0,
"step": 1580,
"text_loss": 0.2700682580471039
@@ -15027,13 +15027,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.19921875,
"learning_rate": 0.0009722729470034503,
- "loss": 0.0218,
+ "loss": 0.0224,
"macro_f1": 0.3333333432674408,
"num_tokens": 2556550.0,
"repeat_count": 0.0,
- "routers_loss": 0.004019706044346094,
+ "routers_loss": 0.004798175301402807,
"skip_count": 0.0,
"step": 1582,
"text_loss": 0.6559903025627136
@@ -15046,32 +15046,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07177734375,
+ "grad_norm": 0.078125,
"learning_rate": 0.0009721712176624591,
- "loss": 0.0239,
+ "loss": 0.0242,
"macro_f1": 0.3333333432674408,
"num_tokens": 2559862.0,
"repeat_count": 0.0,
- "routers_loss": 0.014162382110953331,
+ "routers_loss": 0.013764148578047752,
"skip_count": 0.0,
"step": 1584,
"text_loss": 0.2257535308599472
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 7.446140299383622,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.0009720693073845667,
- "loss": 0.0338,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.032,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 2562766.0,
"repeat_count": 0.0,
- "routers_loss": 0.023485012352466583,
+ "routers_loss": 0.01937069371342659,
"skip_count": 2.0,
"step": 1586,
"text_loss": 0.178413525223732
@@ -15079,37 +15079,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 7.455532726739067,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009719672162088252,
- "loss": 0.0308,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0306,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 2566583.0,
"repeat_count": 1.0,
- "routers_loss": 0.05822715163230896,
+ "routers_loss": 0.06224144622683525,
"skip_count": 0.0,
"step": 1588,
"text_loss": 0.3992367684841156
},
{
- "acc_repeat": 0.5,
- "acc_skip": 0.5,
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
"avg_layers": 27.0,
"epoch": 7.464925154094511,
- "f1_execute": 0.936170220375061,
- "f1_repeat": 0.6666666865348816,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.189453125,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.185546875,
"learning_rate": 0.0009718649441743559,
- "loss": 0.0243,
- "macro_f1": 0.7565011978149414,
+ "loss": 0.0239,
+ "macro_f1": 0.9449735879898071,
"num_tokens": 2569516.0,
"repeat_count": 2.0,
- "routers_loss": 0.07448136061429977,
+ "routers_loss": 0.06937911361455917,
"skip_count": 4.0,
"step": 1590,
"text_loss": 0.1945122629404068
@@ -15122,13 +15122,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.00097176249132035,
- "loss": 0.0228,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2572418.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038424162194132805,
+ "routers_loss": 0.0034326619934290648,
"skip_count": 0.0,
"step": 1592,
"text_loss": 0.6259906888008118
@@ -15141,13 +15141,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.08642578125,
"learning_rate": 0.0009716598576860676,
- "loss": 0.0277,
+ "loss": 0.0278,
"macro_f1": 0.6666666865348816,
"num_tokens": 2575235.0,
"repeat_count": 1.0,
- "routers_loss": 0.005674343090504408,
+ "routers_loss": 0.004557516425848007,
"skip_count": 0.0,
"step": 1594,
"text_loss": 0.6638736724853516
@@ -15160,13 +15160,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.193359375,
"learning_rate": 0.0009715570433108378,
- "loss": 0.0209,
+ "loss": 0.0198,
"macro_f1": 1.0,
"num_tokens": 2578157.0,
"repeat_count": 1.0,
- "routers_loss": 0.015544800087809563,
+ "routers_loss": 0.015363055281341076,
"skip_count": 1.0,
"step": 1596,
"text_loss": 0.6530464887619019
@@ -15179,13 +15179,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009714540482340595,
- "loss": 0.0279,
+ "loss": 0.0268,
"macro_f1": 0.6666666865348816,
"num_tokens": 2581801.0,
"repeat_count": 1.0,
- "routers_loss": 0.013199405744671822,
+ "routers_loss": 0.01257144846022129,
"skip_count": 0.0,
"step": 1598,
"text_loss": 0.5916110277175903
@@ -15198,13 +15198,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059326171875,
+ "grad_norm": 0.058837890625,
"learning_rate": 0.0009713508724952006,
- "loss": 0.0178,
+ "loss": 0.0177,
"macro_f1": 0.3333333432674408,
"num_tokens": 2585204.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032487998250871897,
+ "routers_loss": 0.003175645601004362,
"skip_count": 0.0,
"step": 1600,
"text_loss": 0.27901601791381836
@@ -15217,13 +15217,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12255859375,
+ "grad_norm": 0.12353515625,
"learning_rate": 0.0009712475161337981,
- "loss": 0.0253,
+ "loss": 0.0261,
"macro_f1": 0.3333333432674408,
"num_tokens": 2588286.0,
"repeat_count": 0.0,
- "routers_loss": 0.0041928659193217754,
+ "routers_loss": 0.004122321493923664,
"skip_count": 0.0,
"step": 1602,
"text_loss": 0.42420244216918945
@@ -15236,13 +15236,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009711439791894585,
- "loss": 0.0343,
+ "loss": 0.0341,
"macro_f1": 0.6666666865348816,
"num_tokens": 2591476.0,
"repeat_count": 0.0,
- "routers_loss": 0.011576149612665176,
+ "routers_loss": 0.011215819045901299,
"skip_count": 1.0,
"step": 1604,
"text_loss": 0.5549933910369873
@@ -15255,13 +15255,13 @@
"f1_execute": 0.9599999785423279,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009710402617018574,
- "loss": 0.0179,
+ "loss": 0.0172,
"macro_f1": 0.8200000524520874,
"num_tokens": 2594336.0,
"repeat_count": 1.0,
- "routers_loss": 0.03026912547647953,
+ "routers_loss": 0.02916567400097847,
"skip_count": 2.0,
"step": 1606,
"text_loss": 0.3263779282569885
@@ -15276,11 +15276,11 @@
"f1_skip": 1.0,
"grad_norm": 0.068359375,
"learning_rate": 0.0009709363637107393,
- "loss": 0.021,
+ "loss": 0.0209,
"macro_f1": 0.6666666865348816,
"num_tokens": 2597462.0,
"repeat_count": 0.0,
- "routers_loss": 0.014957098290324211,
+ "routers_loss": 0.015897957608103752,
"skip_count": 1.0,
"step": 1608,
"text_loss": 0.20917139947414398
@@ -15293,13 +15293,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009708322852559184,
- "loss": 0.0226,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2601543.0,
"repeat_count": 0.0,
- "routers_loss": 0.00254683755338192,
+ "routers_loss": 0.002211357234045863,
"skip_count": 0.0,
"step": 1610,
"text_loss": 0.450550377368927
@@ -15312,13 +15312,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1748046875,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009707280263772776,
- "loss": 0.0286,
+ "loss": 0.0277,
"macro_f1": 0.6666666865348816,
"num_tokens": 2604462.0,
"repeat_count": 0.0,
- "routers_loss": 0.018759876489639282,
+ "routers_loss": 0.01615734025835991,
"skip_count": 2.0,
"step": 1612,
"text_loss": 0.6908381581306458
@@ -15337,7 +15337,7 @@
"macro_f1": 0.5492662787437439,
"num_tokens": 2607484.0,
"repeat_count": 0.0,
- "routers_loss": 0.022694367915391922,
+ "routers_loss": 0.022048067301511765,
"skip_count": 2.0,
"step": 1614,
"text_loss": 0.36691340804100037
@@ -15350,13 +15350,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.10546875,
"learning_rate": 0.0009705189675084138,
- "loss": 0.0181,
+ "loss": 0.0176,
"macro_f1": 0.6666666865348816,
"num_tokens": 2610204.0,
"repeat_count": 0.0,
- "routers_loss": 0.010102321393787861,
+ "routers_loss": 0.008503952994942665,
"skip_count": 1.0,
"step": 1616,
"text_loss": 0.5226598381996155
@@ -15369,13 +15369,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08984375,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009704141675983029,
- "loss": 0.0252,
+ "loss": 0.0248,
"macro_f1": 0.3333333432674408,
"num_tokens": 2613128.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020994991064071655,
+ "routers_loss": 0.0019020626787096262,
"skip_count": 0.0,
"step": 1618,
"text_loss": 0.6465088725090027
@@ -15388,13 +15388,13 @@
"f1_execute": 0.9333333373069763,
"f1_repeat": 0.0,
"f1_skip": 0.7272727489471436,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.107421875,
"learning_rate": 0.0009703091874245956,
- "loss": 0.0323,
+ "loss": 0.032,
"macro_f1": 0.5535354018211365,
"num_tokens": 2616360.0,
"repeat_count": 0.0,
- "routers_loss": 0.11748704314231873,
+ "routers_loss": 0.11837691068649292,
"skip_count": 7.0,
"step": 1620,
"text_loss": 0.2987039089202881
@@ -15407,32 +15407,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009702040270275204,
- "loss": 0.018,
+ "loss": 0.0181,
"macro_f1": 0.3333333432674408,
"num_tokens": 2619606.0,
"repeat_count": 0.0,
- "routers_loss": 0.007642311509698629,
+ "routers_loss": 0.0065958453342318535,
"skip_count": 0.0,
"step": 1622,
"text_loss": 0.6262096166610718
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 7.62459641913707,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.10595703125,
+ "f1_skip": 1.0,
+ "grad_norm": 0.103515625,
"learning_rate": 0.000970098686447375,
- "loss": 0.0258,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0257,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 2622499.0,
"repeat_count": 0.0,
- "routers_loss": 0.016890225932002068,
+ "routers_loss": 0.013632026500999928,
"skip_count": 1.0,
"step": 1624,
"text_loss": 0.2392602562904358
@@ -15445,13 +15445,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.125,
"learning_rate": 0.0009699931657245264,
- "loss": 0.0242,
+ "loss": 0.0245,
"macro_f1": 0.5492662787437439,
"num_tokens": 2626002.0,
"repeat_count": 0.0,
- "routers_loss": 0.010900186374783516,
+ "routers_loss": 0.012147823348641396,
"skip_count": 2.0,
"step": 1626,
"text_loss": 0.4742976129055023
@@ -15464,13 +15464,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0810546875,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009698874648994098,
- "loss": 0.0279,
+ "loss": 0.0285,
"macro_f1": 1.0,
"num_tokens": 2629847.0,
"repeat_count": 1.0,
- "routers_loss": 0.011229799129068851,
+ "routers_loss": 0.010692884214222431,
"skip_count": 3.0,
"step": 1628,
"text_loss": 0.5090685486793518
@@ -15483,13 +15483,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.0009697815840125304,
- "loss": 0.0275,
+ "loss": 0.0265,
"macro_f1": 0.3333333432674408,
"num_tokens": 2633529.0,
"repeat_count": 0.0,
- "routers_loss": 0.0105878422036767,
+ "routers_loss": 0.011442207731306553,
"skip_count": 0.0,
"step": 1630,
"text_loss": 0.1874329298734665
@@ -15502,13 +15502,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.2119140625,
"learning_rate": 0.0009696755231044618,
- "loss": 0.0209,
+ "loss": 0.0207,
"macro_f1": 0.3333333432674408,
"num_tokens": 2636321.0,
"repeat_count": 0.0,
- "routers_loss": 0.002953991526737809,
+ "routers_loss": 0.0026681360322982073,
"skip_count": 0.0,
"step": 1632,
"text_loss": 0.7650400400161743
@@ -15521,13 +15521,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10888671875,
+ "grad_norm": 0.10498046875,
"learning_rate": 0.0009695692822158466,
- "loss": 0.0241,
+ "loss": 0.0242,
"macro_f1": 0.3272727429866791,
"num_tokens": 2638840.0,
"repeat_count": 1.0,
- "routers_loss": 0.04717390984296799,
+ "routers_loss": 0.033965807408094406,
"skip_count": 0.0,
"step": 1634,
"text_loss": 0.6175784468650818
@@ -15540,13 +15540,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0009694628613873968,
- "loss": 0.0179,
+ "loss": 0.018,
"macro_f1": 0.3333333432674408,
"num_tokens": 2641886.0,
"repeat_count": 0.0,
- "routers_loss": 0.0073657832108438015,
+ "routers_loss": 0.007568214554339647,
"skip_count": 0.0,
"step": 1636,
"text_loss": 0.43139931559562683
@@ -15559,13 +15559,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.193359375,
"learning_rate": 0.0009693562606598929,
- "loss": 0.0259,
+ "loss": 0.025,
"macro_f1": 0.3333333432674408,
"num_tokens": 2645028.0,
"repeat_count": 0.0,
- "routers_loss": 0.005212752148509026,
+ "routers_loss": 0.004973865579813719,
"skip_count": 0.0,
"step": 1638,
"text_loss": 0.6430339217185974
@@ -15578,13 +15578,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0009692494800741844,
- "loss": 0.0304,
+ "loss": 0.0313,
"macro_f1": 0.3272727429866791,
"num_tokens": 2648209.0,
"repeat_count": 1.0,
- "routers_loss": 0.04311618581414223,
+ "routers_loss": 0.049863800406455994,
"skip_count": 0.0,
"step": 1640,
"text_loss": 0.28138160705566406
@@ -15597,13 +15597,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08251953125,
+ "grad_norm": 0.08544921875,
"learning_rate": 0.0009691425196711901,
- "loss": 0.039,
+ "loss": 0.0398,
"macro_f1": 0.3272727429866791,
"num_tokens": 2651171.0,
"repeat_count": 0.0,
- "routers_loss": 0.02027471922338009,
+ "routers_loss": 0.02112230286002159,
"skip_count": 0.0,
"step": 1642,
"text_loss": 0.3745322525501251
@@ -15616,13 +15616,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009690353794918971,
- "loss": 0.0279,
+ "loss": 0.0275,
"macro_f1": 0.3333333432674408,
"num_tokens": 2654093.0,
"repeat_count": 0.0,
- "routers_loss": 0.003074956126511097,
+ "routers_loss": 0.0024304776452481747,
"skip_count": 0.0,
"step": 1644,
"text_loss": 0.4275154173374176
@@ -15635,13 +15635,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.0771484375,
"learning_rate": 0.000968928059577362,
- "loss": 0.0241,
+ "loss": 0.0244,
"macro_f1": 0.6666666865348816,
"num_tokens": 2657079.0,
"repeat_count": 0.0,
- "routers_loss": 0.009374706074595451,
+ "routers_loss": 0.009320619516074657,
"skip_count": 1.0,
"step": 1646,
"text_loss": 0.46650025248527527
@@ -15654,13 +15654,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1162109375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009688205599687099,
- "loss": 0.0218,
+ "loss": 0.0209,
"macro_f1": 0.3272727429866791,
"num_tokens": 2660951.0,
"repeat_count": 0.0,
- "routers_loss": 0.01204691268503666,
+ "routers_loss": 0.011913162656128407,
"skip_count": 0.0,
"step": 1648,
"text_loss": 0.46644100546836853
@@ -15673,13 +15673,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009687128807071347,
"loss": 0.0284,
"macro_f1": 0.3333333432674408,
"num_tokens": 2663823.0,
"repeat_count": 0.0,
- "routers_loss": 0.01376053225249052,
+ "routers_loss": 0.013754756189882755,
"skip_count": 0.0,
"step": 1650,
"text_loss": 0.40808847546577454
@@ -15692,13 +15692,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.103515625,
"learning_rate": 0.0009686050218338996,
- "loss": 0.0285,
+ "loss": 0.0286,
"macro_f1": 0.3333333432674408,
"num_tokens": 2667079.0,
"repeat_count": 0.0,
- "routers_loss": 0.009346984326839447,
+ "routers_loss": 0.009099726565182209,
"skip_count": 0.0,
"step": 1652,
"text_loss": 0.2389989197254181
@@ -15711,13 +15711,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.08837890625,
"learning_rate": 0.0009684969833903359,
- "loss": 0.0291,
+ "loss": 0.0283,
"macro_f1": 0.6666666865348816,
"num_tokens": 2670162.0,
"repeat_count": 0.0,
- "routers_loss": 0.002724624238908291,
+ "routers_loss": 0.0034928603563457727,
"skip_count": 1.0,
"step": 1654,
"text_loss": 0.6930749416351318
@@ -15730,13 +15730,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009683887654178445,
- "loss": 0.0271,
+ "loss": 0.0261,
"macro_f1": 0.6666666865348816,
"num_tokens": 2673031.0,
"repeat_count": 0.0,
- "routers_loss": 0.00823777075856924,
+ "routers_loss": 0.008340462110936642,
"skip_count": 1.0,
"step": 1656,
"text_loss": 0.277752548456192
@@ -15749,32 +15749,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07373046875,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009682803679578947,
- "loss": 0.0262,
+ "loss": 0.0259,
"macro_f1": 0.3333333432674408,
"num_tokens": 2676092.0,
"repeat_count": 0.0,
- "routers_loss": 0.004393119364976883,
+ "routers_loss": 0.004337446764111519,
"skip_count": 0.0,
"step": 1658,
"text_loss": 0.5176776051521301
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 7.7936601115350745,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.1513671875,
+ "f1_skip": 0.0,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009681717910520244,
- "loss": 0.024,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0242,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 2679479.0,
"repeat_count": 0.0,
- "routers_loss": 0.031827569007873535,
+ "routers_loss": 0.034611742943525314,
"skip_count": 2.0,
"step": 1660,
"text_loss": 0.21485982835292816
@@ -15789,11 +15789,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.07958984375,
"learning_rate": 0.0009680630347418406,
- "loss": 0.0216,
+ "loss": 0.022,
"macro_f1": 0.5492662787437439,
"num_tokens": 2683289.0,
"repeat_count": 0.0,
- "routers_loss": 0.03329647704958916,
+ "routers_loss": 0.03297121450304985,
"skip_count": 2.0,
"step": 1662,
"text_loss": 0.33801013231277466
@@ -15806,13 +15806,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.000967954099069019,
- "loss": 0.0415,
+ "loss": 0.0411,
"macro_f1": 0.32098764181137085,
"num_tokens": 2685879.0,
"repeat_count": 1.0,
- "routers_loss": 0.047317031770944595,
+ "routers_loss": 0.04551183059811592,
"skip_count": 1.0,
"step": 1664,
"text_loss": 0.41123488545417786
@@ -15827,11 +15827,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1240234375,
"learning_rate": 0.0009678449840753038,
- "loss": 0.0325,
+ "loss": 0.0324,
"macro_f1": 0.32098764181137085,
"num_tokens": 2688910.0,
"repeat_count": 0.0,
- "routers_loss": 0.05649980902671814,
+ "routers_loss": 0.05866450071334839,
"skip_count": 2.0,
"step": 1666,
"text_loss": 0.1740892380475998
@@ -15844,13 +15844,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009677356898025082,
- "loss": 0.0229,
+ "loss": 0.023,
"macro_f1": 0.3333333432674408,
"num_tokens": 2691680.0,
"repeat_count": 0.0,
- "routers_loss": 0.01004624180495739,
+ "routers_loss": 0.009243223816156387,
"skip_count": 0.0,
"step": 1668,
"text_loss": 0.2512350380420685
@@ -15863,13 +15863,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.09619140625,
"learning_rate": 0.000967626216292514,
- "loss": 0.0194,
+ "loss": 0.0195,
"macro_f1": 0.3333333432674408,
"num_tokens": 2694895.0,
"repeat_count": 0.0,
- "routers_loss": 0.0054973396472632885,
+ "routers_loss": 0.005576452240347862,
"skip_count": 0.0,
"step": 1670,
"text_loss": 0.43294376134872437
@@ -15882,13 +15882,13 @@
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.09130859375,
"learning_rate": 0.0009675165635872715,
- "loss": 0.031,
+ "loss": 0.0306,
"macro_f1": 0.44705885648727417,
"num_tokens": 2697806.0,
"repeat_count": 0.0,
- "routers_loss": 0.05615650862455368,
+ "routers_loss": 0.05372785031795502,
"skip_count": 3.0,
"step": 1672,
"text_loss": 0.1614082306623459
@@ -15901,13 +15901,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009674067317288,
- "loss": 0.0301,
+ "loss": 0.0296,
"macro_f1": 0.6666666865348816,
"num_tokens": 2700529.0,
"repeat_count": 1.0,
- "routers_loss": 0.012819192372262478,
+ "routers_loss": 0.018131591379642487,
"skip_count": 0.0,
"step": 1674,
"text_loss": 0.2093173861503601
@@ -15920,13 +15920,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.08203125,
"learning_rate": 0.0009672967207591869,
- "loss": 0.0253,
+ "loss": 0.0257,
"macro_f1": 0.3272727429866791,
"num_tokens": 2703650.0,
"repeat_count": 0.0,
- "routers_loss": 0.07059332728385925,
+ "routers_loss": 0.0673515796661377,
"skip_count": 1.0,
"step": 1676,
"text_loss": 0.3029400110244751
@@ -15939,13 +15939,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009671865307205892,
- "loss": 0.0198,
+ "loss": 0.021,
"macro_f1": 0.32098767161369324,
"num_tokens": 2707615.0,
"repeat_count": 0.0,
- "routers_loss": 0.029778441414237022,
+ "routers_loss": 0.03821169584989548,
"skip_count": 1.0,
"step": 1678,
"text_loss": 0.2262786477804184
@@ -15958,13 +15958,13 @@
"f1_execute": 0.9756097793579102,
"f1_repeat": 1.0,
"f1_skip": 0.9090909361839294,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0009670761616552315,
- "loss": 0.0474,
+ "loss": 0.0465,
"macro_f1": 0.9615669250488281,
"num_tokens": 2710894.0,
"repeat_count": 2.0,
- "routers_loss": 0.04371272772550583,
+ "routers_loss": 0.042625464498996735,
"skip_count": 6.0,
"step": 1680,
"text_loss": 0.29623574018478394
@@ -15977,13 +15977,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009669656136054074,
- "loss": 0.0293,
+ "loss": 0.0289,
"macro_f1": 0.3333333432674408,
"num_tokens": 2714330.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033591394312679768,
+ "routers_loss": 0.0037571541033685207,
"skip_count": 0.0,
"step": 1682,
"text_loss": 0.7510389089584351
@@ -15996,13 +15996,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.07421875,
"learning_rate": 0.0009668548866134795,
- "loss": 0.0259,
+ "loss": 0.0256,
"macro_f1": 0.3333333432674408,
"num_tokens": 2717176.0,
"repeat_count": 0.0,
- "routers_loss": 0.005085585173219442,
+ "routers_loss": 0.004142968449741602,
"skip_count": 0.0,
"step": 1684,
"text_loss": 0.3273485600948334
@@ -16015,13 +16015,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0712890625,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009667439807218783,
- "loss": 0.0243,
+ "loss": 0.0233,
"macro_f1": 0.6666666865348816,
"num_tokens": 2720628.0,
"repeat_count": 0.0,
- "routers_loss": 0.008569681085646152,
+ "routers_loss": 0.008753842674195766,
"skip_count": 2.0,
"step": 1686,
"text_loss": 0.4314708709716797
@@ -16034,32 +16034,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009666328959731033,
- "loss": 0.022,
+ "loss": 0.0211,
"macro_f1": 0.6603773832321167,
"num_tokens": 2723739.0,
"repeat_count": 1.0,
- "routers_loss": 0.024587804451584816,
+ "routers_loss": 0.022674910724163055,
"skip_count": 1.0,
"step": 1688,
"text_loss": 0.25734150409698486
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.3333333432674408,
- "avg_layers": 27.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
"epoch": 7.934546521866745,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
- "f1_skip": 0.5,
- "grad_norm": 0.169921875,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009665216324097222,
- "loss": 0.0332,
- "macro_f1": 0.4871794879436493,
+ "loss": 0.0324,
+ "macro_f1": 0.5934640765190125,
"num_tokens": 2726644.0,
"repeat_count": 0.0,
- "routers_loss": 0.037516288459300995,
+ "routers_loss": 0.03932750225067139,
"skip_count": 3.0,
"step": 1690,
"text_loss": 0.24511034786701202
@@ -16072,13 +16072,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.09765625,
"learning_rate": 0.0009664101900743714,
- "loss": 0.0262,
+ "loss": 0.0255,
"macro_f1": 0.3272727429866791,
"num_tokens": 2729662.0,
"repeat_count": 0.0,
- "routers_loss": 0.01287431176751852,
+ "routers_loss": 0.012672754004597664,
"skip_count": 1.0,
"step": 1692,
"text_loss": 0.39431414008140564
@@ -16091,13 +16091,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.07763671875,
+ "grad_norm": 0.076171875,
"learning_rate": 0.000966298569009756,
- "loss": 0.0227,
+ "loss": 0.0231,
"macro_f1": 0.5492662787437439,
"num_tokens": 2732578.0,
"repeat_count": 0.0,
- "routers_loss": 0.015499880537390709,
+ "routers_loss": 0.01548632513731718,
"skip_count": 2.0,
"step": 1694,
"text_loss": 0.12439999729394913
@@ -16110,13 +16110,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009661867692586494,
- "loss": 0.0144,
+ "loss": 0.0153,
"macro_f1": 0.32098764181137085,
"num_tokens": 2735887.0,
"repeat_count": 0.0,
- "routers_loss": 0.049878787249326706,
+ "routers_loss": 0.05622401833534241,
"skip_count": 2.0,
"step": 1696,
"text_loss": 0.29024389386177063
@@ -16129,13 +16129,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.087890625,
"learning_rate": 0.0009660747908638933,
- "loss": 0.0206,
+ "loss": 0.0205,
"macro_f1": 0.3272727429866791,
"num_tokens": 2739293.0,
"repeat_count": 0.0,
- "routers_loss": 0.04108169302344322,
+ "routers_loss": 0.041060201823711395,
"skip_count": 1.0,
"step": 1698,
"text_loss": 0.39461007714271545
@@ -16148,13 +16148,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.1767578125,
"learning_rate": 0.0009659626338683981,
- "loss": 0.0367,
+ "loss": 0.0369,
"macro_f1": 0.3333333432674408,
"num_tokens": 2742468.0,
"repeat_count": 0.0,
- "routers_loss": 0.007651917636394501,
+ "routers_loss": 0.007251353468745947,
"skip_count": 0.0,
"step": 1700,
"text_loss": 0.2751767635345459
@@ -16167,13 +16167,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.07763671875,
"learning_rate": 0.0009658502983151427,
- "loss": 0.0182,
+ "loss": 0.0186,
"macro_f1": 0.3272727429866791,
"num_tokens": 2745123.0,
"repeat_count": 0.0,
- "routers_loss": 0.015448091551661491,
+ "routers_loss": 0.012847424484789371,
"skip_count": 1.0,
"step": 1702,
"text_loss": 0.4756404757499695
@@ -16186,13 +16186,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.11767578125,
"learning_rate": 0.0009657377842471742,
- "loss": 0.0324,
+ "loss": 0.0313,
"macro_f1": 0.6666666865348816,
"num_tokens": 2748016.0,
"repeat_count": 0.0,
- "routers_loss": 0.009139287285506725,
+ "routers_loss": 0.007060411386191845,
"skip_count": 1.0,
"step": 1704,
"text_loss": 0.9571210145950317
@@ -16205,13 +16205,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.10009765625,
"learning_rate": 0.0009656250917076081,
- "loss": 0.0191,
+ "loss": 0.0188,
"macro_f1": 0.5492662787437439,
"num_tokens": 2750717.0,
"repeat_count": 0.0,
- "routers_loss": 0.015412120148539543,
+ "routers_loss": 0.016748681664466858,
"skip_count": 2.0,
"step": 1706,
"text_loss": 0.14542843401432037
@@ -16224,13 +16224,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.060302734375,
"learning_rate": 0.0009655122207396285,
- "loss": 0.0175,
+ "loss": 0.017,
"macro_f1": 0.3333333432674408,
"num_tokens": 2753635.0,
"repeat_count": 0.0,
- "routers_loss": 0.012735052965581417,
+ "routers_loss": 0.013607042841613293,
"skip_count": 0.0,
"step": 1708,
"text_loss": 0.21836471557617188
@@ -16243,13 +16243,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07177734375,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009653991713864878,
- "loss": 0.0192,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2756643.0,
"repeat_count": 0.0,
- "routers_loss": 0.00114025070797652,
+ "routers_loss": 0.0012097888393327594,
"skip_count": 0.0,
"step": 1710,
"text_loss": 0.635187029838562
@@ -16262,13 +16262,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1171875,
"learning_rate": 0.0009652859436915066,
- "loss": 0.0243,
+ "loss": 0.0231,
"macro_f1": 0.3333333432674408,
"num_tokens": 2759432.0,
"repeat_count": 0.0,
- "routers_loss": 0.006401443853974342,
+ "routers_loss": 0.006196760106831789,
"skip_count": 0.0,
"step": 1712,
"text_loss": 0.5629420876502991
@@ -16281,13 +16281,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0009651725376980743,
- "loss": 0.0185,
+ "loss": 0.0177,
"macro_f1": 0.3333333432674408,
"num_tokens": 2762538.0,
"repeat_count": 0.0,
- "routers_loss": 0.004316259175539017,
+ "routers_loss": 0.0042513771913945675,
"skip_count": 0.0,
"step": 1714,
"text_loss": 0.39522525668144226
@@ -16300,13 +16300,13 @@
"f1_execute": 0.9583333134651184,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.125,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009650589534496479,
- "loss": 0.0201,
+ "loss": 0.0194,
"macro_f1": 0.8194444179534912,
"num_tokens": 2765571.0,
"repeat_count": 2.0,
- "routers_loss": 0.043461959809064865,
+ "routers_loss": 0.03596706688404083,
"skip_count": 3.0,
"step": 1716,
"text_loss": 0.6252416968345642
@@ -16319,13 +16319,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.04833984375,
"learning_rate": 0.0009649451909897532,
"loss": 0.0178,
"macro_f1": 0.3333333432674408,
"num_tokens": 2769206.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024530428927391768,
+ "routers_loss": 0.0025788163766264915,
"skip_count": 0.0,
"step": 1718,
"text_loss": 0.8851634860038757
@@ -16338,13 +16338,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.0009648312503619843,
- "loss": 0.026,
+ "loss": 0.0265,
"macro_f1": 0.3333333432674408,
"num_tokens": 2772488.0,
"repeat_count": 0.0,
- "routers_loss": 0.0046626063995063305,
+ "routers_loss": 0.004443451762199402,
"skip_count": 0.0,
"step": 1720,
"text_loss": 0.8568580746650696
@@ -16357,13 +16357,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009647171316100034,
- "loss": 0.0257,
+ "loss": 0.0265,
"macro_f1": 0.9265305995941162,
"num_tokens": 2776482.0,
"repeat_count": 1.0,
- "routers_loss": 0.02480102889239788,
+ "routers_loss": 0.022948263213038445,
"skip_count": 3.0,
"step": 1722,
"text_loss": 0.13431036472320557
@@ -16376,13 +16376,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.0009646028347775409,
- "loss": 0.02,
+ "loss": 0.0204,
"macro_f1": 0.6666666865348816,
"num_tokens": 2778966.0,
"repeat_count": 0.0,
- "routers_loss": 0.012629947625100613,
+ "routers_loss": 0.011328035034239292,
"skip_count": 1.0,
"step": 1724,
"text_loss": 0.2085491120815277
@@ -16395,13 +16395,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08447265625,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009644883599083958,
"loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 2781968.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024127380456775427,
+ "routers_loss": 0.002208018908277154,
"skip_count": 0.0,
"step": 1726,
"text_loss": 0.4948323965072632
@@ -16414,13 +16414,13 @@
"f1_execute": 0.9411764740943909,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.054443359375,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009643737070464349,
- "loss": 0.0162,
+ "loss": 0.0158,
"macro_f1": 0.6470588445663452,
"num_tokens": 2784666.0,
"repeat_count": 1.0,
- "routers_loss": 0.0415453165769577,
+ "routers_loss": 0.04391832649707794,
"skip_count": 2.0,
"step": 1728,
"text_loss": 0.39060094952583313
@@ -16433,13 +16433,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.046630859375,
"learning_rate": 0.0009642588762355935,
- "loss": 0.0211,
+ "loss": 0.0212,
"macro_f1": 0.6666666865348816,
"num_tokens": 2787558.0,
"repeat_count": 0.0,
- "routers_loss": 0.0056681083515286446,
+ "routers_loss": 0.004497280344367027,
"skip_count": 1.0,
"step": 1730,
"text_loss": 0.34908708930015564
@@ -16452,13 +16452,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009641438675198748,
- "loss": 0.0189,
+ "loss": 0.0175,
"macro_f1": 0.3333333432674408,
"num_tokens": 2790474.0,
"repeat_count": 0.0,
- "routers_loss": 0.006391602102667093,
+ "routers_loss": 0.00583475548774004,
"skip_count": 0.0,
"step": 1732,
"text_loss": 0.5720033049583435
@@ -16471,13 +16471,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0595703125,
+ "grad_norm": 0.08154296875,
"learning_rate": 0.0009640286809433508,
- "loss": 0.0229,
+ "loss": 0.0235,
"macro_f1": 0.3333333432674408,
"num_tokens": 2793272.0,
"repeat_count": 0.0,
- "routers_loss": 0.007466991897672415,
+ "routers_loss": 0.007826375775039196,
"skip_count": 0.0,
"step": 1734,
"text_loss": 0.32181721925735474
@@ -16490,13 +16490,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0009639133165501606,
- "loss": 0.0197,
+ "loss": 0.0192,
"macro_f1": 0.3333333432674408,
"num_tokens": 2797726.0,
"repeat_count": 0.0,
- "routers_loss": 0.001953453291207552,
+ "routers_loss": 0.0019055595621466637,
"skip_count": 0.0,
"step": 1736,
"text_loss": 0.620936393737793
@@ -16509,13 +16509,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009637977743845124,
- "loss": 0.0223,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2800706.0,
"repeat_count": 0.0,
- "routers_loss": 0.003612719476222992,
+ "routers_loss": 0.0028302327264100313,
"skip_count": 0.0,
"step": 1738,
"text_loss": 0.6473138332366943
@@ -16528,13 +16528,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.049072265625,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009636820544906823,
- "loss": 0.0145,
+ "loss": 0.0146,
"macro_f1": 1.0,
"num_tokens": 2803847.0,
"repeat_count": 1.0,
- "routers_loss": 0.009977150708436966,
+ "routers_loss": 0.01105099730193615,
"skip_count": 2.0,
"step": 1740,
"text_loss": 0.4401201903820038
@@ -16547,13 +16547,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009635661569130141,
"loss": 0.0195,
"macro_f1": 0.5934640765190125,
"num_tokens": 2807235.0,
"repeat_count": 0.0,
- "routers_loss": 0.026468059048056602,
+ "routers_loss": 0.02619045600295067,
"skip_count": 3.0,
"step": 1742,
"text_loss": 0.459264874458313
@@ -16566,13 +16566,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0009634500816959202,
- "loss": 0.0165,
+ "loss": 0.0162,
"macro_f1": 0.6666666865348816,
"num_tokens": 2810396.0,
"repeat_count": 0.0,
- "routers_loss": 0.00849854201078415,
+ "routers_loss": 0.007915694266557693,
"skip_count": 2.0,
"step": 1744,
"text_loss": 0.5084020495414734
@@ -16585,13 +16585,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009633338288838805,
- "loss": 0.0275,
+ "loss": 0.0271,
"macro_f1": 0.5492662787437439,
"num_tokens": 2813215.0,
"repeat_count": 2.0,
- "routers_loss": 0.08082596957683563,
+ "routers_loss": 0.08364596217870712,
"skip_count": 0.0,
"step": 1746,
"text_loss": 0.27681824564933777
@@ -16604,13 +16604,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.046142578125,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009632173985214438,
- "loss": 0.015,
+ "loss": 0.0156,
"macro_f1": 0.8817967176437378,
"num_tokens": 2816452.0,
"repeat_count": 3.0,
- "routers_loss": 0.029500717297196388,
+ "routers_loss": 0.028805451467633247,
"skip_count": 2.0,
"step": 1748,
"text_loss": 0.4678419530391693
@@ -16623,13 +16623,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.0625,
"learning_rate": 0.000963100790653226,
- "loss": 0.0183,
+ "loss": 0.0188,
"macro_f1": 0.3272727429866791,
"num_tokens": 2819364.0,
"repeat_count": 0.0,
- "routers_loss": 0.025238536298274994,
+ "routers_loss": 0.03056817688047886,
"skip_count": 1.0,
"step": 1750,
"text_loss": 0.3078109920024872
@@ -16642,13 +16642,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0703125,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009629840053239116,
- "loss": 0.0204,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2823469.0,
"repeat_count": 0.0,
- "routers_loss": 0.002069319598376751,
+ "routers_loss": 0.0019477814203128219,
"skip_count": 0.0,
"step": 1752,
"text_loss": 0.45501336455345154
@@ -16661,13 +16661,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.057373046875,
"learning_rate": 0.000962867042578253,
- "loss": 0.0169,
+ "loss": 0.0173,
"macro_f1": 0.3333333432674408,
"num_tokens": 2826716.0,
"repeat_count": 0.0,
- "routers_loss": 0.002853946527466178,
+ "routers_loss": 0.0032963966950774193,
"skip_count": 0.0,
"step": 1754,
"text_loss": 0.49234694242477417
@@ -16680,13 +16680,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009627499024610707,
- "loss": 0.0236,
+ "loss": 0.0239,
"macro_f1": 0.3272727429866791,
"num_tokens": 2829733.0,
"repeat_count": 0.0,
- "routers_loss": 0.0100983502343297,
+ "routers_loss": 0.010289114899933338,
"skip_count": 1.0,
"step": 1756,
"text_loss": 0.22335539758205414
@@ -16699,13 +16699,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09228515625,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009626325850172527,
- "loss": 0.0173,
+ "loss": 0.0174,
"macro_f1": 0.3272727429866791,
"num_tokens": 2833350.0,
"repeat_count": 0.0,
- "routers_loss": 0.031218983232975006,
+ "routers_loss": 0.03249066323041916,
"skip_count": 1.0,
"step": 1758,
"text_loss": 0.6581931114196777
@@ -16718,13 +16718,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009625150902917555,
- "loss": 0.019,
+ "loss": 0.0185,
"macro_f1": 0.3333333432674408,
"num_tokens": 2836558.0,
"repeat_count": 0.0,
- "routers_loss": 0.010347879491746426,
+ "routers_loss": 0.00870000571012497,
"skip_count": 0.0,
"step": 1760,
"text_loss": 0.22938725352287292
@@ -16737,13 +16737,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1455078125,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009623974183296031,
- "loss": 0.0193,
+ "loss": 0.0192,
"macro_f1": 0.3333333432674408,
"num_tokens": 2840560.0,
"repeat_count": 0.0,
- "routers_loss": 0.007768871728330851,
+ "routers_loss": 0.007767196744680405,
"skip_count": 0.0,
"step": 1762,
"text_loss": 0.24473799765110016
@@ -16756,13 +16756,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009622795691758876,
- "loss": 0.0253,
+ "loss": 0.0244,
"macro_f1": 0.3333333432674408,
"num_tokens": 2843548.0,
"repeat_count": 0.0,
- "routers_loss": 0.002887974726036191,
+ "routers_loss": 0.0021693643648177385,
"skip_count": 0.0,
"step": 1764,
"text_loss": 0.3084608018398285
@@ -16777,11 +16777,11 @@
"f1_skip": 0.0,
"grad_norm": 0.0498046875,
"learning_rate": 0.0009621615428757693,
- "loss": 0.0147,
+ "loss": 0.0149,
"macro_f1": 0.3333333432674408,
"num_tokens": 2847076.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027294005267322063,
+ "routers_loss": 0.0024727333802729845,
"skip_count": 0.0,
"step": 1766,
"text_loss": 0.5251734852790833
@@ -16794,13 +16794,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.000962043339474476,
- "loss": 0.0193,
+ "loss": 0.0194,
"macro_f1": 0.3333333432674408,
"num_tokens": 2849751.0,
"repeat_count": 0.0,
- "routers_loss": 0.00543541694059968,
+ "routers_loss": 0.005174890160560608,
"skip_count": 0.0,
"step": 1768,
"text_loss": 0.4410129189491272
@@ -16813,13 +16813,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.06103515625,
"learning_rate": 0.0009619249590173032,
- "loss": 0.0167,
+ "loss": 0.016,
"macro_f1": 0.6666666865348816,
"num_tokens": 2853916.0,
"repeat_count": 0.0,
- "routers_loss": 0.006514009553939104,
+ "routers_loss": 0.006785830482840538,
"skip_count": 2.0,
"step": 1770,
"text_loss": 0.550076425075531
@@ -16832,13 +16832,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.06591796875,
"learning_rate": 0.0009618064015496149,
- "loss": 0.019,
+ "loss": 0.0192,
"macro_f1": 0.5934640765190125,
"num_tokens": 2857372.0,
"repeat_count": 0.0,
- "routers_loss": 0.02333846502006054,
+ "routers_loss": 0.021370256319642067,
"skip_count": 3.0,
"step": 1772,
"text_loss": 0.1988629847764969
@@ -16851,13 +16851,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.072265625,
"learning_rate": 0.0009616876671168423,
- "loss": 0.0165,
+ "loss": 0.0162,
"macro_f1": 0.6666666865348816,
"num_tokens": 2861028.0,
"repeat_count": 0.0,
- "routers_loss": 0.004471905063837767,
+ "routers_loss": 0.004313841462135315,
"skip_count": 1.0,
"step": 1774,
"text_loss": 0.42581331729888916
@@ -16870,13 +16870,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009615687557644847,
- "loss": 0.0261,
+ "loss": 0.0268,
"macro_f1": 0.3333333432674408,
"num_tokens": 2864847.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024362702388316393,
+ "routers_loss": 0.0025742491707205772,
"skip_count": 0.0,
"step": 1776,
"text_loss": 0.46510905027389526
@@ -16889,13 +16889,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009614496675381093,
- "loss": 0.0116,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 2867392.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021166049409657717,
+ "routers_loss": 0.0016813480760902166,
"skip_count": 0.0,
"step": 1778,
"text_loss": 0.5922174453735352
@@ -16908,13 +16908,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0712890625,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.0009613304024833507,
"loss": 0.0166,
"macro_f1": 0.3333333432674408,
"num_tokens": 2871273.0,
"repeat_count": 0.0,
- "routers_loss": 0.004722296260297298,
+ "routers_loss": 0.004948933608829975,
"skip_count": 0.0,
"step": 1780,
"text_loss": 0.6776977777481079
@@ -16929,11 +16929,11 @@
"f1_skip": 1.0,
"grad_norm": 0.07470703125,
"learning_rate": 0.0009612109606459117,
- "loss": 0.0199,
+ "loss": 0.0186,
"macro_f1": 1.0,
"num_tokens": 2874172.0,
"repeat_count": 1.0,
- "routers_loss": 0.014188882894814014,
+ "routers_loss": 0.016950147226452827,
"skip_count": 2.0,
"step": 1782,
"text_loss": 0.48758944869041443
@@ -16946,13 +16946,13 @@
"f1_execute": 0.9599999785423279,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.076171875,
+ "grad_norm": 0.08251953125,
"learning_rate": 0.0009610913420715623,
- "loss": 0.0241,
+ "loss": 0.0237,
"macro_f1": 0.7644444704055786,
"num_tokens": 2877528.0,
"repeat_count": 2.0,
- "routers_loss": 0.04599560424685478,
+ "routers_loss": 0.04880943149328232,
"skip_count": 1.0,
"step": 1784,
"text_loss": 0.4404778480529785
@@ -16965,13 +16965,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.06201171875,
"learning_rate": 0.0009609715468061411,
- "loss": 0.0216,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2880627.0,
"repeat_count": 0.0,
- "routers_loss": 0.004942454397678375,
+ "routers_loss": 0.004678630735725164,
"skip_count": 0.0,
"step": 1786,
"text_loss": 0.7295402884483337
@@ -16984,13 +16984,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08349609375,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0009608515748955535,
- "loss": 0.021,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2883333.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020542226266115904,
+ "routers_loss": 0.0026695074047893286,
"skip_count": 0.0,
"step": 1788,
"text_loss": 0.9697831273078918
@@ -17003,13 +17003,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.1171875,
+ "grad_norm": 0.107421875,
"learning_rate": 0.000960731426385773,
- "loss": 0.0155,
+ "loss": 0.0157,
"macro_f1": 0.4871794879436493,
"num_tokens": 2887444.0,
"repeat_count": 0.0,
- "routers_loss": 0.0397041030228138,
+ "routers_loss": 0.029743613675236702,
"skip_count": 2.0,
"step": 1790,
"text_loss": 0.4737568199634552
@@ -17022,13 +17022,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.10107421875,
"learning_rate": 0.0009606111013228407,
- "loss": 0.0204,
+ "loss": 0.0207,
"macro_f1": 0.3333333432674408,
"num_tokens": 2890221.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017490010941401124,
+ "routers_loss": 0.0016153788892552257,
"skip_count": 0.0,
"step": 1792,
"text_loss": 0.6693558096885681
@@ -17041,13 +17041,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08251953125,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009604905997528655,
- "loss": 0.021,
+ "loss": 0.02,
"macro_f1": 0.3272727429866791,
"num_tokens": 2893262.0,
"repeat_count": 0.0,
- "routers_loss": 0.023590171709656715,
+ "routers_loss": 0.01965433731675148,
"skip_count": 1.0,
"step": 1794,
"text_loss": 0.45227760076522827
@@ -17060,13 +17060,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.08642578125,
"learning_rate": 0.0009603699217220239,
- "loss": 0.0125,
+ "loss": 0.0117,
"macro_f1": 0.6601307392120361,
"num_tokens": 2896823.0,
"repeat_count": 1.0,
- "routers_loss": 0.02458076737821102,
+ "routers_loss": 0.024017298594117165,
"skip_count": 2.0,
"step": 1796,
"text_loss": 0.48865509033203125
@@ -17079,13 +17079,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.08837890625,
"learning_rate": 0.0009602490672765597,
- "loss": 0.019,
+ "loss": 0.0182,
"macro_f1": 0.3333333432674408,
"num_tokens": 2899707.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014341498026624322,
+ "routers_loss": 0.0012420224957168102,
"skip_count": 0.0,
"step": 1798,
"text_loss": 0.43292415142059326
@@ -17098,13 +17098,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08056640625,
+ "grad_norm": 0.07861328125,
"learning_rate": 0.0009601280364627848,
- "loss": 0.02,
+ "loss": 0.0196,
"macro_f1": 0.3333333432674408,
"num_tokens": 2902795.0,
"repeat_count": 0.0,
- "routers_loss": 0.00213223067112267,
+ "routers_loss": 0.0020389219280332327,
"skip_count": 0.0,
"step": 1800,
"text_loss": 0.41021591424942017
@@ -17117,13 +17117,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07275390625,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009600068293270783,
- "loss": 0.0147,
+ "loss": 0.0142,
"macro_f1": 0.3333333432674408,
"num_tokens": 2905769.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027340995147824287,
+ "routers_loss": 0.002006303984671831,
"skip_count": 0.0,
"step": 1802,
"text_loss": 0.46892106533050537
@@ -17136,32 +17136,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.000959885445915887,
- "loss": 0.0172,
+ "loss": 0.017,
"macro_f1": 0.3333333432674408,
"num_tokens": 2909475.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035587961319833994,
+ "routers_loss": 0.003734810510650277,
"skip_count": 0.0,
"step": 1804,
"text_loss": 0.45364710688591003
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.5,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 8.479013795127678,
- "f1_execute": 0.9615384340286255,
- "f1_repeat": 0.0,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009597638862757254,
- "loss": 0.0187,
- "macro_f1": 0.5427350401878357,
+ "loss": 0.0182,
+ "macro_f1": 0.8823530077934265,
"num_tokens": 2914348.0,
"repeat_count": 1.0,
- "routers_loss": 0.04446055367588997,
+ "routers_loss": 0.038971323519945145,
"skip_count": 2.0,
"step": 1806,
"text_loss": 0.42913779616355896
@@ -17174,13 +17174,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08447265625,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009596421504531751,
- "loss": 0.0244,
+ "loss": 0.0249,
"macro_f1": 0.3272727429866791,
"num_tokens": 2917467.0,
"repeat_count": 1.0,
- "routers_loss": 0.05095123499631882,
+ "routers_loss": 0.04800829663872719,
"skip_count": 0.0,
"step": 1808,
"text_loss": 0.17332297563552856
@@ -17193,13 +17193,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009595202384948858,
- "loss": 0.0232,
+ "loss": 0.0227,
"macro_f1": 0.6666666865348816,
"num_tokens": 2920223.0,
"repeat_count": 1.0,
- "routers_loss": 0.008440068922936916,
+ "routers_loss": 0.009164143353700638,
"skip_count": 0.0,
"step": 1810,
"text_loss": 0.33740702271461487
@@ -17212,13 +17212,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0927734375,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009593981504475742,
- "loss": 0.0273,
+ "loss": 0.0275,
"macro_f1": 0.6666666865348816,
"num_tokens": 2923780.0,
"repeat_count": 0.0,
- "routers_loss": 0.012230116873979568,
+ "routers_loss": 0.011236993595957756,
"skip_count": 2.0,
"step": 1812,
"text_loss": 0.1609916388988495
@@ -17231,13 +17231,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009592758863580248,
- "loss": 0.026,
+ "loss": 0.0259,
"macro_f1": 0.5492662787437439,
"num_tokens": 2926259.0,
"repeat_count": 0.0,
- "routers_loss": 0.017307188361883163,
+ "routers_loss": 0.019026532769203186,
"skip_count": 2.0,
"step": 1814,
"text_loss": 0.6460903882980347
@@ -17250,13 +17250,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009591534462730894,
- "loss": 0.0215,
+ "loss": 0.0206,
"macro_f1": 0.5492662787437439,
"num_tokens": 2929173.0,
"repeat_count": 2.0,
- "routers_loss": 0.07191162556409836,
+ "routers_loss": 0.0608333982527256,
"skip_count": 0.0,
"step": 1816,
"text_loss": 0.476126492023468
@@ -17269,13 +17269,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.06640625,
"learning_rate": 0.000959030830239687,
- "loss": 0.0182,
+ "loss": 0.0175,
"macro_f1": 0.3333333432674408,
"num_tokens": 2932703.0,
"repeat_count": 0.0,
- "routers_loss": 0.008753604255616665,
+ "routers_loss": 0.0093300249427557,
"skip_count": 0.0,
"step": 1818,
"text_loss": 0.5471875667572021
@@ -17288,13 +17288,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19921875,
+ "grad_norm": 0.2001953125,
"learning_rate": 0.0009589080383048048,
- "loss": 0.0233,
+ "loss": 0.0235,
"macro_f1": 0.3333333432674408,
"num_tokens": 2936195.0,
"repeat_count": 0.0,
- "routers_loss": 0.008390828967094421,
+ "routers_loss": 0.010434109717607498,
"skip_count": 0.0,
"step": 1820,
"text_loss": 0.5068115592002869
@@ -17307,13 +17307,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009587850705154964,
"loss": 0.0291,
"macro_f1": 0.3333333432674408,
"num_tokens": 2939412.0,
"repeat_count": 0.0,
- "routers_loss": 0.005617359187453985,
+ "routers_loss": 0.004347751382738352,
"skip_count": 0.0,
"step": 1822,
"text_loss": 0.4241984784603119
@@ -17326,13 +17326,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.0859375,
"learning_rate": 0.0009586619269188836,
- "loss": 0.0227,
+ "loss": 0.0224,
"macro_f1": 0.32098767161369324,
"num_tokens": 2942318.0,
"repeat_count": 0.0,
- "routers_loss": 0.0346846878528595,
+ "routers_loss": 0.034238871186971664,
"skip_count": 1.0,
"step": 1824,
"text_loss": 0.2328975349664688
@@ -17345,32 +17345,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009585386075621553,
"loss": 0.027,
"macro_f1": 0.3333333432674408,
"num_tokens": 2945731.0,
"repeat_count": 0.0,
- "routers_loss": 0.006601692643016577,
+ "routers_loss": 0.006097695790231228,
"skip_count": 0.0,
"step": 1826,
"text_loss": 0.22816994786262512
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 8.582330496037569,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.08837890625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009584151124925676,
- "loss": 0.0207,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0208,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2948944.0,
"repeat_count": 0.0,
- "routers_loss": 0.0065619745291769505,
+ "routers_loss": 0.007790776435285807,
"skip_count": 1.0,
"step": 1828,
"text_loss": 0.5009413361549377
@@ -17383,13 +17383,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009582914417574438,
- "loss": 0.0149,
+ "loss": 0.0145,
"macro_f1": 0.6666666865348816,
"num_tokens": 2951723.0,
"repeat_count": 0.0,
- "routers_loss": 0.011109639890491962,
+ "routers_loss": 0.009144559502601624,
"skip_count": 2.0,
"step": 1830,
"text_loss": 0.1402502954006195
@@ -17404,11 +17404,11 @@
"f1_skip": 0.0,
"grad_norm": 0.06201171875,
"learning_rate": 0.0009581675954041751,
- "loss": 0.0167,
+ "loss": 0.0166,
"macro_f1": 0.6666666865348816,
"num_tokens": 2954726.0,
"repeat_count": 1.0,
- "routers_loss": 0.008432094007730484,
+ "routers_loss": 0.006593191530555487,
"skip_count": 0.0,
"step": 1832,
"text_loss": 0.4871736466884613
@@ -17421,13 +17421,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0859375,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0009580435734802196,
- "loss": 0.0208,
+ "loss": 0.0206,
"macro_f1": 0.3333333432674408,
"num_tokens": 2957853.0,
"repeat_count": 0.0,
- "routers_loss": 0.011518111452460289,
+ "routers_loss": 0.01241068821400404,
"skip_count": 0.0,
"step": 1834,
"text_loss": 0.30100154876708984
@@ -17440,13 +17440,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009579193760331027,
- "loss": 0.0211,
+ "loss": 0.022,
"macro_f1": 0.3333333432674408,
"num_tokens": 2960783.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026744187343865633,
+ "routers_loss": 0.002219218760728836,
"skip_count": 0.0,
"step": 1836,
"text_loss": 0.4961516559123993
@@ -17459,13 +17459,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009577950031104169,
- "loss": 0.0165,
+ "loss": 0.0166,
"macro_f1": 0.6601307392120361,
"num_tokens": 2963328.0,
"repeat_count": 1.0,
- "routers_loss": 0.028107430785894394,
+ "routers_loss": 0.029363535344600677,
"skip_count": 2.0,
"step": 1838,
"text_loss": 0.42814353108406067
@@ -17478,13 +17478,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 1.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009576704547598226,
- "loss": 0.0263,
+ "loss": 0.0257,
"macro_f1": 0.7795917987823486,
"num_tokens": 2966108.0,
"repeat_count": 1.0,
- "routers_loss": 0.060007549822330475,
+ "routers_loss": 0.0579402856528759,
"skip_count": 4.0,
"step": 1840,
"text_loss": 0.20523512363433838
@@ -17497,13 +17497,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009575457310290463,
"loss": 0.0121,
"macro_f1": 0.3272727429866791,
"num_tokens": 2969137.0,
"repeat_count": 0.0,
- "routers_loss": 0.01074182614684105,
+ "routers_loss": 0.008810589089989662,
"skip_count": 0.0,
"step": 1842,
"text_loss": 0.6199528574943542
@@ -17516,13 +17516,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009574208319658831,
- "loss": 0.0213,
+ "loss": 0.0208,
"macro_f1": 0.6666666865348816,
"num_tokens": 2972407.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019638657104223967,
+ "routers_loss": 0.0012295129708945751,
"skip_count": 1.0,
"step": 1844,
"text_loss": 0.66938316822052
@@ -17535,13 +17535,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.000957295757618194,
- "loss": 0.0156,
+ "loss": 0.0152,
"macro_f1": 0.4871794879436493,
"num_tokens": 2976045.0,
"repeat_count": 0.0,
- "routers_loss": 0.06953249871730804,
+ "routers_loss": 0.06162935495376587,
"skip_count": 2.0,
"step": 1846,
"text_loss": 0.5381782650947571
@@ -17554,13 +17554,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009571705080339079,
- "loss": 0.0154,
+ "loss": 0.0144,
"macro_f1": 0.3333333432674408,
"num_tokens": 2979025.0,
"repeat_count": 0.0,
- "routers_loss": 0.003563052974641323,
+ "routers_loss": 0.003950524143874645,
"skip_count": 0.0,
"step": 1848,
"text_loss": 0.5831671357154846
@@ -17573,13 +17573,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.11376953125,
"learning_rate": 0.0009570450832610208,
- "loss": 0.0216,
+ "loss": 0.0209,
"macro_f1": 0.3333333432674408,
"num_tokens": 2982276.0,
"repeat_count": 0.0,
- "routers_loss": 0.010409255512058735,
+ "routers_loss": 0.010354886762797832,
"skip_count": 0.0,
"step": 1850,
"text_loss": 0.27448201179504395
@@ -17592,13 +17592,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0009569194833475956,
- "loss": 0.0195,
+ "loss": 0.0199,
"macro_f1": 0.3272727429866791,
"num_tokens": 2985691.0,
"repeat_count": 0.0,
- "routers_loss": 0.009769548662006855,
+ "routers_loss": 0.010167439468204975,
"skip_count": 0.0,
"step": 1852,
"text_loss": 0.5264663696289062
@@ -17611,13 +17611,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1181640625,
+ "grad_norm": 0.1328125,
"learning_rate": 0.0009567937083417624,
- "loss": 0.0184,
+ "loss": 0.0194,
"macro_f1": 0.3272727429866791,
"num_tokens": 2989126.0,
"repeat_count": 0.0,
- "routers_loss": 0.036616452038288116,
+ "routers_loss": 0.0371871180832386,
"skip_count": 1.0,
"step": 1854,
"text_loss": 0.2008018046617508
@@ -17630,13 +17630,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.0009566677582917185,
- "loss": 0.0192,
+ "loss": 0.0184,
"macro_f1": 0.3333333432674408,
"num_tokens": 2992814.0,
"repeat_count": 0.0,
- "routers_loss": 0.009581349790096283,
+ "routers_loss": 0.010190588422119617,
"skip_count": 0.0,
"step": 1856,
"text_loss": 0.749717116355896
@@ -17649,13 +17649,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009565416332457282,
- "loss": 0.0138,
+ "loss": 0.0132,
"macro_f1": 0.6538461446762085,
"num_tokens": 2995729.0,
"repeat_count": 1.0,
- "routers_loss": 0.02330300398170948,
+ "routers_loss": 0.022285036742687225,
"skip_count": 1.0,
"step": 1858,
"text_loss": 0.5870219469070435
@@ -17668,13 +17668,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009564153332521228,
- "loss": 0.0226,
+ "loss": 0.0224,
"macro_f1": 0.3272727429866791,
"num_tokens": 2998812.0,
"repeat_count": 0.0,
- "routers_loss": 0.011985735036432743,
+ "routers_loss": 0.011050296947360039,
"skip_count": 1.0,
"step": 1860,
"text_loss": 0.8444408774375916
@@ -17687,13 +17687,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.06005859375,
"learning_rate": 0.0009562888583593005,
- "loss": 0.0162,
+ "loss": 0.0163,
"macro_f1": 0.3333333432674408,
"num_tokens": 3001799.0,
"repeat_count": 0.0,
- "routers_loss": 0.005997250322252512,
+ "routers_loss": 0.007125461008399725,
"skip_count": 0.0,
"step": 1862,
"text_loss": 0.41510361433029175
@@ -17706,13 +17706,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009561622086157272,
- "loss": 0.0243,
+ "loss": 0.0236,
"macro_f1": 0.3333333432674408,
"num_tokens": 3005088.0,
"repeat_count": 0.0,
- "routers_loss": 0.004814761225134134,
+ "routers_loss": 0.0049054501578211784,
"skip_count": 0.0,
"step": 1864,
"text_loss": 0.3801248073577881
@@ -17725,13 +17725,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.056884765625,
+ "grad_norm": 0.054443359375,
"learning_rate": 0.000956035384069935,
- "loss": 0.0242,
+ "loss": 0.0238,
"macro_f1": 1.0,
"num_tokens": 3008178.0,
"repeat_count": 1.0,
- "routers_loss": 0.004750931169837713,
+ "routers_loss": 0.005162427201867104,
"skip_count": 1.0,
"step": 1866,
"text_loss": 0.2687684893608093
@@ -17744,13 +17744,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.10400390625,
"learning_rate": 0.0009559083847705233,
- "loss": 0.0216,
+ "loss": 0.0214,
"macro_f1": 0.3272727429866791,
"num_tokens": 3010923.0,
"repeat_count": 0.0,
- "routers_loss": 0.038251202553510666,
+ "routers_loss": 0.028984658420085907,
"skip_count": 1.0,
"step": 1868,
"text_loss": 0.6277349591255188
@@ -17763,13 +17763,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009557812107661584,
- "loss": 0.0204,
+ "loss": 0.0208,
"macro_f1": 1.0,
"num_tokens": 3015030.0,
"repeat_count": 1.0,
- "routers_loss": 0.010951942764222622,
+ "routers_loss": 0.012200530618429184,
"skip_count": 1.0,
"step": 1870,
"text_loss": 0.6293368339538574
@@ -17782,13 +17782,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.11962890625,
"learning_rate": 0.0009556538621055739,
- "loss": 0.0265,
+ "loss": 0.0268,
"macro_f1": 0.3272727429866791,
"num_tokens": 3019067.0,
"repeat_count": 0.0,
- "routers_loss": 0.06582094728946686,
+ "routers_loss": 0.06365182995796204,
"skip_count": 1.0,
"step": 1872,
"text_loss": 0.39046618342399597
@@ -17796,18 +17796,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 8.798356325212797,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.12353515625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009555263388375699,
- "loss": 0.0143,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.014,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3022166.0,
"repeat_count": 0.0,
- "routers_loss": 0.008920271880924702,
+ "routers_loss": 0.0041703456081449986,
"skip_count": 1.0,
"step": 1874,
"text_loss": 0.42232340574264526
@@ -17820,13 +17820,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1220703125,
+ "grad_norm": 0.11572265625,
"learning_rate": 0.0009553986410110134,
"loss": 0.016,
"macro_f1": 0.3333333432674408,
"num_tokens": 3025865.0,
"repeat_count": 0.0,
- "routers_loss": 0.006444344762712717,
+ "routers_loss": 0.005841755773872137,
"skip_count": 0.0,
"step": 1876,
"text_loss": 0.37600573897361755
@@ -17839,13 +17839,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009552707686748388,
- "loss": 0.022,
+ "loss": 0.0219,
"macro_f1": 0.3272727429866791,
"num_tokens": 3029950.0,
"repeat_count": 0.0,
- "routers_loss": 0.05197767913341522,
+ "routers_loss": 0.05165952071547508,
"skip_count": 1.0,
"step": 1878,
"text_loss": 0.33717799186706543
@@ -17858,13 +17858,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009551427218780467,
- "loss": 0.0224,
+ "loss": 0.0219,
"macro_f1": 0.6666666865348816,
"num_tokens": 3033649.0,
"repeat_count": 0.0,
- "routers_loss": 0.017570581287145615,
+ "routers_loss": 0.020680008456110954,
"skip_count": 2.0,
"step": 1880,
"text_loss": 0.5011783838272095
@@ -17877,13 +17877,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.173828125,
+ "grad_norm": 0.15625,
"learning_rate": 0.0009550145006697048,
- "loss": 0.0225,
+ "loss": 0.0217,
"macro_f1": 0.32098764181137085,
"num_tokens": 3036847.0,
"repeat_count": 0.0,
- "routers_loss": 0.07106777280569077,
+ "routers_loss": 0.07626450061798096,
"skip_count": 2.0,
"step": 1882,
"text_loss": 0.3066408336162567
@@ -17896,13 +17896,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0009548861050989482,
- "loss": 0.0139,
+ "loss": 0.0136,
"macro_f1": 1.0,
"num_tokens": 3040353.0,
"repeat_count": 1.0,
- "routers_loss": 0.009862381964921951,
+ "routers_loss": 0.010884666815400124,
"skip_count": 1.0,
"step": 1884,
"text_loss": 0.49779415130615234
@@ -17915,13 +17915,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009547575352149778,
- "loss": 0.0209,
+ "loss": 0.0213,
"macro_f1": 0.6666666865348816,
"num_tokens": 3043504.0,
"repeat_count": 0.0,
- "routers_loss": 0.006928981747478247,
+ "routers_loss": 0.006704333238303661,
"skip_count": 2.0,
"step": 1886,
"text_loss": 0.12284614145755768
@@ -17934,13 +17934,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09423828125,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0009546287910670621,
"loss": 0.0211,
"macro_f1": 0.5427350401878357,
"num_tokens": 3046422.0,
"repeat_count": 1.0,
- "routers_loss": 0.04788029566407204,
+ "routers_loss": 0.04799000173807144,
"skip_count": 2.0,
"step": 1888,
"text_loss": 0.1824081838130951
@@ -17953,13 +17953,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009544998727045361,
- "loss": 0.0299,
+ "loss": 0.0306,
"macro_f1": 0.3333333432674408,
"num_tokens": 3049819.0,
"repeat_count": 0.0,
- "routers_loss": 0.008282946422696114,
+ "routers_loss": 0.008139612153172493,
"skip_count": 0.0,
"step": 1890,
"text_loss": 0.18929053843021393
@@ -17972,32 +17972,32 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.09375,
"learning_rate": 0.0009543707801768015,
- "loss": 0.0181,
+ "loss": 0.0175,
"macro_f1": 0.5934640765190125,
"num_tokens": 3052766.0,
"repeat_count": 0.0,
- "routers_loss": 0.03251546248793602,
+ "routers_loss": 0.02966771461069584,
"skip_count": 3.0,
"step": 1892,
"text_loss": 0.247748002409935
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 24.0,
+ "acc_skip": 0.5,
+ "avg_layers": 25.0,
"epoch": 8.892280598767243,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.06640625,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009542415135333267,
- "loss": 0.0195,
- "macro_f1": 0.542222261428833,
+ "loss": 0.0193,
+ "macro_f1": 0.44705885648727417,
"num_tokens": 3056427.0,
"repeat_count": 0.0,
- "routers_loss": 0.03368280455470085,
+ "routers_loss": 0.03637036308646202,
"skip_count": 2.0,
"step": 1894,
"text_loss": 0.2583999037742615
@@ -18010,13 +18010,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.0009541120728236472,
- "loss": 0.0133,
+ "loss": 0.0136,
"macro_f1": 0.3333333432674408,
"num_tokens": 3059497.0,
"repeat_count": 0.0,
- "routers_loss": 0.0069940583780407906,
+ "routers_loss": 0.007026574574410915,
"skip_count": 0.0,
"step": 1896,
"text_loss": 0.5222375988960266
@@ -18029,13 +18029,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0810546875,
+ "grad_norm": 0.076171875,
"learning_rate": 0.0009539824580973646,
- "loss": 0.0221,
+ "loss": 0.0219,
"macro_f1": 0.3333333432674408,
"num_tokens": 3062187.0,
"repeat_count": 0.0,
- "routers_loss": 0.004268508404493332,
+ "routers_loss": 0.003449335927143693,
"skip_count": 0.0,
"step": 1898,
"text_loss": 0.5736427307128906
@@ -18048,13 +18048,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0009538526694041477,
- "loss": 0.0159,
+ "loss": 0.0163,
"macro_f1": 0.3333333432674408,
"num_tokens": 3066100.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032616283278912306,
+ "routers_loss": 0.0035463871899992228,
"skip_count": 0.0,
"step": 1900,
"text_loss": 0.5471583604812622
@@ -18067,13 +18067,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.08056640625,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009537227067937318,
- "loss": 0.023,
+ "loss": 0.0233,
"macro_f1": 1.0,
"num_tokens": 3068737.0,
"repeat_count": 3.0,
- "routers_loss": 0.005389219615608454,
+ "routers_loss": 0.00597514258697629,
"skip_count": 3.0,
"step": 1902,
"text_loss": 0.36644190549850464
@@ -18086,13 +18086,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.166015625,
"learning_rate": 0.0009535925703159186,
- "loss": 0.0311,
+ "loss": 0.0301,
"macro_f1": 0.32098764181137085,
"num_tokens": 3071686.0,
"repeat_count": 0.0,
- "routers_loss": 0.024814991280436516,
+ "routers_loss": 0.025420479476451874,
"skip_count": 2.0,
"step": 1904,
"text_loss": 0.535789966583252
@@ -18107,11 +18107,11 @@
"f1_skip": 0.0,
"grad_norm": 0.07568359375,
"learning_rate": 0.0009534622600205769,
- "loss": 0.0151,
+ "loss": 0.0145,
"macro_f1": 0.3333333432674408,
"num_tokens": 3074954.0,
"repeat_count": 0.0,
- "routers_loss": 0.013415839523077011,
+ "routers_loss": 0.014377486892044544,
"skip_count": 0.0,
"step": 1906,
"text_loss": 0.19009549915790558
@@ -18124,13 +18124,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009533317759576416,
- "loss": 0.019,
+ "loss": 0.0197,
"macro_f1": 0.3333333432674408,
"num_tokens": 3077540.0,
"repeat_count": 0.0,
- "routers_loss": 0.005814475007355213,
+ "routers_loss": 0.004848944488912821,
"skip_count": 0.0,
"step": 1908,
"text_loss": 0.5022001266479492
@@ -18143,13 +18143,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009532011181771148,
- "loss": 0.0218,
+ "loss": 0.0217,
"macro_f1": 0.6666666865348816,
"num_tokens": 3080445.0,
"repeat_count": 0.0,
- "routers_loss": 0.007621586322784424,
+ "routers_loss": 0.009480170905590057,
"skip_count": 2.0,
"step": 1910,
"text_loss": 0.35135936737060547
@@ -18162,13 +18162,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.10400390625,
"learning_rate": 0.0009530702867290644,
- "loss": 0.0178,
+ "loss": 0.0185,
"macro_f1": 0.3333333432674408,
"num_tokens": 3083657.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020917020738124847,
+ "routers_loss": 0.0019353039097040892,
"skip_count": 0.0,
"step": 1912,
"text_loss": 0.5123994946479797
@@ -18181,13 +18181,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009529392816636256,
- "loss": 0.025,
+ "loss": 0.0249,
"macro_f1": 0.3333333432674408,
"num_tokens": 3086837.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010824954370036721,
+ "routers_loss": 0.0010921972570940852,
"skip_count": 0.0,
"step": 1914,
"text_loss": 0.44477662444114685
@@ -18200,13 +18200,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.19140625,
"learning_rate": 0.0009528081030309995,
- "loss": 0.0353,
+ "loss": 0.0351,
"macro_f1": 0.3333333432674408,
"num_tokens": 3089892.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018075350672006607,
+ "routers_loss": 0.0018027103506028652,
"skip_count": 0.0,
"step": 1916,
"text_loss": 0.7356183528900146
@@ -18219,13 +18219,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07958984375,
+ "grad_norm": 0.07568359375,
"learning_rate": 0.0009526767508814542,
- "loss": 0.0235,
+ "loss": 0.0236,
"macro_f1": 0.3333333432674408,
"num_tokens": 3093058.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032930250745266676,
+ "routers_loss": 0.003243023296818137,
"skip_count": 0.0,
"step": 1918,
"text_loss": 0.48823556303977966
@@ -18238,13 +18238,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009525452252653239,
- "loss": 0.0184,
+ "loss": 0.0175,
"macro_f1": 0.3333333432674408,
"num_tokens": 3096404.0,
"repeat_count": 0.0,
- "routers_loss": 0.009042349644005299,
+ "routers_loss": 0.009360014460980892,
"skip_count": 0.0,
"step": 1920,
"text_loss": 0.21498437225818634
@@ -18257,13 +18257,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009524135262330098,
- "loss": 0.022,
+ "loss": 0.0224,
"macro_f1": 0.9265305995941162,
"num_tokens": 3099520.0,
"repeat_count": 1.0,
- "routers_loss": 0.016776500269770622,
+ "routers_loss": 0.017444295808672905,
"skip_count": 3.0,
"step": 1922,
"text_loss": 0.27608850598335266
@@ -18276,13 +18276,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.0009522816538349789,
- "loss": 0.016,
+ "loss": 0.0162,
"macro_f1": 0.5492662787437439,
"num_tokens": 3102956.0,
"repeat_count": 0.0,
- "routers_loss": 0.06579705327749252,
+ "routers_loss": 0.06424452364444733,
"skip_count": 2.0,
"step": 1924,
"text_loss": 0.21558666229248047
@@ -18295,13 +18295,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.058349609375,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0009521496081217651,
- "loss": 0.0113,
+ "loss": 0.0112,
"macro_f1": 0.6666666865348816,
"num_tokens": 3106565.0,
"repeat_count": 1.0,
- "routers_loss": 0.0022786022163927555,
+ "routers_loss": 0.002270506462082267,
"skip_count": 0.0,
"step": 1926,
"text_loss": 0.5641813278198242
@@ -18314,13 +18314,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.09033203125,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009520173891439684,
"loss": 0.0216,
"macro_f1": 0.6666666865348816,
"num_tokens": 3109314.0,
"repeat_count": 0.0,
- "routers_loss": 0.01074281521141529,
+ "routers_loss": 0.011512448079884052,
"skip_count": 1.0,
"step": 1928,
"text_loss": 0.6351624727249146
@@ -18333,13 +18333,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009518849969522556,
- "loss": 0.0201,
+ "loss": 0.0198,
"macro_f1": 0.3333333432674408,
"num_tokens": 3112956.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032052614260464907,
+ "routers_loss": 0.003883908037096262,
"skip_count": 0.0,
"step": 1930,
"text_loss": 0.35160085558891296
@@ -18352,32 +18352,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009517524315973595,
- "loss": 0.0186,
+ "loss": 0.019,
"macro_f1": 1.0,
"num_tokens": 3115593.0,
"repeat_count": 1.0,
- "routers_loss": 0.008593574166297913,
+ "routers_loss": 0.009479222819209099,
"skip_count": 3.0,
"step": 1932,
"text_loss": 0.2900560200214386
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 9.079835632521279,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.07373046875,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0771484375,
"learning_rate": 0.0009516196931300794,
- "loss": 0.0152,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3118516.0,
"repeat_count": 0.0,
- "routers_loss": 0.0201246440410614,
+ "routers_loss": 0.017834696918725967,
"skip_count": 2.0,
"step": 1934,
"text_loss": 0.20094378292560577
@@ -18390,13 +18390,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009514867816012809,
- "loss": 0.0199,
+ "loss": 0.02,
"macro_f1": 0.3333333432674408,
"num_tokens": 3122242.0,
"repeat_count": 0.0,
- "routers_loss": 0.001721356064081192,
+ "routers_loss": 0.0017964740982279181,
"skip_count": 0.0,
"step": 1936,
"text_loss": 0.6498590707778931
@@ -18409,13 +18409,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.049072265625,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0009513536970618961,
- "loss": 0.0135,
+ "loss": 0.013,
"macro_f1": 0.6666666865348816,
"num_tokens": 3125645.0,
"repeat_count": 0.0,
- "routers_loss": 0.010442634113132954,
+ "routers_loss": 0.007437168620526791,
"skip_count": 2.0,
"step": 1938,
"text_loss": 0.25863033533096313
@@ -18428,13 +18428,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.058349609375,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009512204395629232,
- "loss": 0.019,
+ "loss": 0.0184,
"macro_f1": 0.6666666865348816,
"num_tokens": 3128740.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009493798715993762,
+ "routers_loss": 0.0008759932243265212,
"skip_count": 1.0,
"step": 1940,
"text_loss": 0.5638351440429688
@@ -18447,13 +18447,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05517578125,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009510870091554264,
- "loss": 0.0149,
+ "loss": 0.0153,
"macro_f1": 0.3272727429866791,
"num_tokens": 3131742.0,
"repeat_count": 1.0,
- "routers_loss": 0.022104881703853607,
+ "routers_loss": 0.019906625151634216,
"skip_count": 0.0,
"step": 1942,
"text_loss": 0.8410717844963074
@@ -18466,13 +18466,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009509534058905369,
- "loss": 0.0164,
+ "loss": 0.016,
"macro_f1": 0.3333333432674408,
"num_tokens": 3134407.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009013625676743686,
+ "routers_loss": 0.0009229081333614886,
"skip_count": 0.0,
"step": 1944,
"text_loss": 0.47506049275398254
@@ -18485,13 +18485,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06103515625,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0009508196298194517,
- "loss": 0.0121,
+ "loss": 0.0123,
"macro_f1": 0.3333333432674408,
"num_tokens": 3137053.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028069843538105488,
+ "routers_loss": 0.003630586201325059,
"skip_count": 0.0,
"step": 1946,
"text_loss": 0.32225799560546875
@@ -18504,13 +18504,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009506856809934338,
- "loss": 0.0116,
+ "loss": 0.0119,
"macro_f1": 0.3333333432674408,
"num_tokens": 3140943.0,
"repeat_count": 0.0,
- "routers_loss": 0.006877045147120953,
+ "routers_loss": 0.007580445148050785,
"skip_count": 0.0,
"step": 1948,
"text_loss": 0.3120577931404114
@@ -18523,13 +18523,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0009505515594638127,
- "loss": 0.0127,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 3144298.0,
"repeat_count": 0.0,
- "routers_loss": 0.004543667659163475,
+ "routers_loss": 0.004471861757338047,
"skip_count": 0.0,
"step": 1950,
"text_loss": 0.22052447497844696
@@ -18542,13 +18542,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.09130859375,
"learning_rate": 0.0009504172652819843,
- "loss": 0.0232,
+ "loss": 0.023,
"macro_f1": 1.0,
"num_tokens": 3147069.0,
"repeat_count": 1.0,
- "routers_loss": 0.007053609937429428,
+ "routers_loss": 0.009606664068996906,
"skip_count": 1.0,
"step": 1952,
"text_loss": 0.34773921966552734
@@ -18561,13 +18561,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0537109375,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009502827984994099,
- "loss": 0.0146,
+ "loss": 0.0148,
"macro_f1": 0.6666666865348816,
"num_tokens": 3149992.0,
"repeat_count": 0.0,
- "routers_loss": 0.006783280987292528,
+ "routers_loss": 0.006443799939006567,
"skip_count": 1.0,
"step": 1954,
"text_loss": 0.6442171335220337
@@ -18580,13 +18580,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.0009501481591676177,
- "loss": 0.0181,
+ "loss": 0.0188,
"macro_f1": 0.3333333432674408,
"num_tokens": 3153167.0,
"repeat_count": 0.0,
- "routers_loss": 0.002531677018851042,
+ "routers_loss": 0.003219039412215352,
"skip_count": 0.0,
"step": 1956,
"text_loss": 0.43369221687316895
@@ -18599,32 +18599,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.000950013347338202,
- "loss": 0.0154,
+ "loss": 0.0152,
"macro_f1": 0.3272727429866791,
"num_tokens": 3156590.0,
"repeat_count": 0.0,
- "routers_loss": 0.027040868997573853,
+ "routers_loss": 0.025551019236445427,
"skip_count": 1.0,
"step": 1958,
"text_loss": 0.294479101896286
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 1.0,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 9.201937188142061,
- "f1_execute": 0.9803921580314636,
- "f1_repeat": 0.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009498783630628225,
- "loss": 0.0154,
- "macro_f1": 0.6601307392120361,
+ "loss": 0.0158,
+ "macro_f1": 1.0,
"num_tokens": 3159451.0,
"repeat_count": 1.0,
- "routers_loss": 0.01573321223258972,
+ "routers_loss": 0.013802438974380493,
"skip_count": 2.0,
"step": 1960,
"text_loss": 0.20888492465019226
@@ -18637,13 +18637,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06689453125,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009497432063932057,
- "loss": 0.0135,
+ "loss": 0.0137,
"macro_f1": 0.6601307392120361,
"num_tokens": 3162889.0,
"repeat_count": 1.0,
- "routers_loss": 0.02442278526723385,
+ "routers_loss": 0.02852988988161087,
"skip_count": 2.0,
"step": 1962,
"text_loss": 0.5027125477790833
@@ -18656,13 +18656,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.046630859375,
+ "grad_norm": 0.045166015625,
"learning_rate": 0.0009496078773811437,
- "loss": 0.0142,
+ "loss": 0.0136,
"macro_f1": 0.6666666865348816,
"num_tokens": 3165979.0,
"repeat_count": 0.0,
- "routers_loss": 0.018267054110765457,
+ "routers_loss": 0.01784522272646427,
"skip_count": 2.0,
"step": 1964,
"text_loss": 0.1696339100599289
@@ -18675,13 +18675,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.060302734375,
"learning_rate": 0.000949472376078495,
- "loss": 0.0162,
+ "loss": 0.016,
"macro_f1": 0.3333333432674408,
"num_tokens": 3168683.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016024474753066897,
+ "routers_loss": 0.0017019887454807758,
"skip_count": 0.0,
"step": 1966,
"text_loss": 0.48905447125434875
@@ -18694,13 +18694,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.052978515625,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.000949336702537184,
- "loss": 0.011,
+ "loss": 0.0108,
"macro_f1": 0.6666666865348816,
"num_tokens": 3171968.0,
"repeat_count": 0.0,
- "routers_loss": 0.004668849054723978,
+ "routers_loss": 0.004817947279661894,
"skip_count": 2.0,
"step": 1968,
"text_loss": 0.20984773337841034
@@ -18713,13 +18713,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0009492008568092007,
- "loss": 0.0098,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 3175947.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011657609138637781,
+ "routers_loss": 0.0012963006738573313,
"skip_count": 0.0,
"step": 1970,
"text_loss": 0.5215106010437012
@@ -18732,13 +18732,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.04248046875,
+ "grad_norm": 0.044921875,
"learning_rate": 0.0009490648389466019,
- "loss": 0.0133,
+ "loss": 0.0135,
"macro_f1": 0.4871794879436493,
"num_tokens": 3179348.0,
"repeat_count": 0.0,
- "routers_loss": 0.03806794434785843,
+ "routers_loss": 0.03950481489300728,
"skip_count": 2.0,
"step": 1972,
"text_loss": 0.24640929698944092
@@ -18751,13 +18751,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.09326171875,
"learning_rate": 0.0009489286490015097,
- "loss": 0.0189,
+ "loss": 0.0183,
"macro_f1": 0.6666666865348816,
"num_tokens": 3182640.0,
"repeat_count": 0.0,
- "routers_loss": 0.005107097327709198,
+ "routers_loss": 0.0043345349840819836,
"skip_count": 2.0,
"step": 1974,
"text_loss": 0.6362852454185486
@@ -18770,13 +18770,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0009487922870261122,
- "loss": 0.0156,
+ "loss": 0.0155,
"macro_f1": 0.3333333432674408,
"num_tokens": 3185657.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013696947135031223,
+ "routers_loss": 0.0015687479171901941,
"skip_count": 0.0,
"step": 1976,
"text_loss": 0.8977144360542297
@@ -18789,13 +18789,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0009486557530726638,
- "loss": 0.0136,
+ "loss": 0.0139,
"macro_f1": 0.3333333432674408,
"num_tokens": 3188772.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012224154779687524,
+ "routers_loss": 0.0010977238416671753,
"skip_count": 0.0,
"step": 1978,
"text_loss": 0.38512736558914185
@@ -18808,13 +18808,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09423828125,
+ "grad_norm": 0.11279296875,
"learning_rate": 0.0009485190471934844,
"loss": 0.0196,
"macro_f1": 0.6666666865348816,
"num_tokens": 3193131.0,
"repeat_count": 2.0,
- "routers_loss": 0.0030119111761450768,
+ "routers_loss": 0.002264744369313121,
"skip_count": 0.0,
"step": 1980,
"text_loss": 0.4171289801597595
@@ -18827,13 +18827,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.00094838216944096,
- "loss": 0.0222,
+ "loss": 0.0219,
"macro_f1": 0.3272727429866791,
"num_tokens": 3196668.0,
"repeat_count": 0.0,
- "routers_loss": 0.04286033287644386,
+ "routers_loss": 0.042320676147937775,
"skip_count": 1.0,
"step": 1982,
"text_loss": 0.19008000195026398
@@ -18846,32 +18846,32 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.053466796875,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0009482451198675424,
- "loss": 0.0158,
+ "loss": 0.0151,
"macro_f1": 0.32098767161369324,
"num_tokens": 3200282.0,
"repeat_count": 0.0,
- "routers_loss": 0.019988590851426125,
+ "routers_loss": 0.01796630397439003,
"skip_count": 1.0,
"step": 1984,
"text_loss": 0.5009249448776245
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 9.324038743762841,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.0634765625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061767578125,
"learning_rate": 0.0009481078985257494,
- "loss": 0.0154,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0147,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3204439.0,
"repeat_count": 0.0,
- "routers_loss": 0.012215938419103622,
+ "routers_loss": 0.01052347756922245,
"skip_count": 1.0,
"step": 1986,
"text_loss": 0.15319275856018066
@@ -18884,13 +18884,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009479705054681644,
- "loss": 0.0149,
+ "loss": 0.015,
"macro_f1": 0.3076923191547394,
"num_tokens": 3207590.0,
"repeat_count": 1.0,
- "routers_loss": 0.10747655481100082,
+ "routers_loss": 0.09640293568372726,
"skip_count": 3.0,
"step": 1988,
"text_loss": 0.3654652535915375
@@ -18903,13 +18903,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009478329407474366,
- "loss": 0.0186,
+ "loss": 0.0183,
"macro_f1": 0.5492662787437439,
"num_tokens": 3211172.0,
"repeat_count": 0.0,
- "routers_loss": 0.016109853982925415,
+ "routers_loss": 0.012670112773776054,
"skip_count": 1.0,
"step": 1990,
"text_loss": 0.5817596316337585
@@ -18922,13 +18922,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.05859375,
"learning_rate": 0.000947695204416281,
- "loss": 0.0116,
+ "loss": 0.0121,
"macro_f1": 0.6666666865348816,
"num_tokens": 3214050.0,
"repeat_count": 1.0,
- "routers_loss": 0.006929324474185705,
+ "routers_loss": 0.005263707600533962,
"skip_count": 0.0,
"step": 1992,
"text_loss": 0.5985888242721558
@@ -18941,13 +18941,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009475572965274787,
- "loss": 0.0147,
+ "loss": 0.0144,
"macro_f1": 0.3272727429866791,
"num_tokens": 3217318.0,
"repeat_count": 1.0,
- "routers_loss": 0.0715102106332779,
+ "routers_loss": 0.0682850033044815,
"skip_count": 0.0,
"step": 1994,
"text_loss": 0.316506564617157
@@ -18960,13 +18960,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.052490234375,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.000947419217133876,
- "loss": 0.0187,
+ "loss": 0.019,
"macro_f1": 0.6666666865348816,
"num_tokens": 3220012.0,
"repeat_count": 0.0,
- "routers_loss": 0.008499355986714363,
+ "routers_loss": 0.008508823812007904,
"skip_count": 2.0,
"step": 1996,
"text_loss": 0.09665893763303757
@@ -18979,13 +18979,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.048583984375,
+ "grad_norm": 0.053466796875,
"learning_rate": 0.0009472809662883852,
- "loss": 0.0162,
+ "loss": 0.0155,
"macro_f1": 1.0,
"num_tokens": 3223019.0,
"repeat_count": 1.0,
- "routers_loss": 0.012003371492028236,
+ "routers_loss": 0.01100847590714693,
"skip_count": 2.0,
"step": 1998,
"text_loss": 0.4938808083534241
@@ -18998,13 +18998,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0009471425440439844,
- "loss": 0.0137,
+ "loss": 0.0135,
"macro_f1": 0.8817967176437378,
"num_tokens": 3226013.0,
"repeat_count": 2.0,
- "routers_loss": 0.0529167577624321,
+ "routers_loss": 0.04953207075595856,
"skip_count": 3.0,
"step": 2000,
"text_loss": 0.22258254885673523
@@ -19017,13 +19017,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.076171875,
+ "grad_norm": 0.07568359375,
"learning_rate": 0.0009470039504537173,
- "loss": 0.0185,
+ "loss": 0.0186,
"macro_f1": 0.31446540355682373,
"num_tokens": 3230031.0,
"repeat_count": 0.0,
- "routers_loss": 0.05719539523124695,
+ "routers_loss": 0.052884332835674286,
"skip_count": 2.0,
"step": 2002,
"text_loss": 0.1741616576910019
@@ -19038,11 +19038,11 @@
"f1_skip": 0.0,
"grad_norm": 0.0869140625,
"learning_rate": 0.0009468651855706931,
- "loss": 0.0205,
+ "loss": 0.0204,
"macro_f1": 0.6666666865348816,
"num_tokens": 3232991.0,
"repeat_count": 1.0,
- "routers_loss": 0.007613501511514187,
+ "routers_loss": 0.008056716993451118,
"skip_count": 0.0,
"step": 2004,
"text_loss": 0.3173636198043823
@@ -19055,13 +19055,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0009467262494480868,
- "loss": 0.014,
+ "loss": 0.0136,
"macro_f1": 0.3333333432674408,
"num_tokens": 3236390.0,
"repeat_count": 0.0,
- "routers_loss": 0.005654903594404459,
+ "routers_loss": 0.0053409393876791,
"skip_count": 0.0,
"step": 2006,
"text_loss": 0.5806330442428589
@@ -19074,13 +19074,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07958984375,
+ "grad_norm": 0.068359375,
"learning_rate": 0.000946587142139139,
- "loss": 0.0152,
+ "loss": 0.0147,
"macro_f1": 0.3333333432674408,
"num_tokens": 3239267.0,
"repeat_count": 0.0,
- "routers_loss": 0.001680699409916997,
+ "routers_loss": 0.0015652200672775507,
"skip_count": 0.0,
"step": 2008,
"text_loss": 0.6214317679405212
@@ -19093,13 +19093,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.11376953125,
"learning_rate": 0.000946447863697156,
- "loss": 0.0171,
+ "loss": 0.0151,
"macro_f1": 0.6601307392120361,
"num_tokens": 3242569.0,
"repeat_count": 1.0,
- "routers_loss": 0.014179535210132599,
+ "routers_loss": 0.011673987843096256,
"skip_count": 2.0,
"step": 2010,
"text_loss": 0.532565712928772
@@ -19112,13 +19112,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.04345703125,
"learning_rate": 0.0009463084141755093,
- "loss": 0.0157,
+ "loss": 0.0159,
"macro_f1": 0.3272727429866791,
"num_tokens": 3245669.0,
"repeat_count": 0.0,
- "routers_loss": 0.026209332048892975,
+ "routers_loss": 0.028480790555477142,
"skip_count": 1.0,
"step": 2012,
"text_loss": 0.25210800766944885
@@ -19131,13 +19131,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08349609375,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0009461687936276364,
- "loss": 0.0134,
+ "loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 3248751.0,
"repeat_count": 0.0,
- "routers_loss": 0.008315940387547016,
+ "routers_loss": 0.007234727032482624,
"skip_count": 0.0,
"step": 2014,
"text_loss": 0.35922971367836
@@ -19150,13 +19150,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.06689453125,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0009460290021070402,
- "loss": 0.0197,
+ "loss": 0.0195,
"macro_f1": 0.6666666865348816,
"num_tokens": 3252614.0,
"repeat_count": 1.0,
- "routers_loss": 0.01872348040342331,
+ "routers_loss": 0.014691276475787163,
"skip_count": 0.0,
"step": 2016,
"text_loss": 0.2747853398323059
@@ -19169,13 +19169,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05126953125,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0009458890396672888,
"loss": 0.0186,
"macro_f1": 0.3333333432674408,
"num_tokens": 3256374.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024314222391694784,
+ "routers_loss": 0.002385235857218504,
"skip_count": 0.0,
"step": 2018,
"text_loss": 0.5268719792366028
@@ -19188,13 +19188,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.052978515625,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0009457489063620164,
- "loss": 0.0137,
+ "loss": 0.0133,
"macro_f1": 0.8823530077934265,
"num_tokens": 3259792.0,
"repeat_count": 1.0,
- "routers_loss": 0.04815426841378212,
+ "routers_loss": 0.047268565744161606,
"skip_count": 2.0,
"step": 2020,
"text_loss": 0.7785539627075195
@@ -19207,13 +19207,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.13671875,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009456086022449221,
- "loss": 0.0209,
+ "loss": 0.0218,
"macro_f1": 0.3272727429866791,
"num_tokens": 3262833.0,
"repeat_count": 0.0,
- "routers_loss": 0.015121756121516228,
+ "routers_loss": 0.015878718346357346,
"skip_count": 1.0,
"step": 2022,
"text_loss": 0.42270028591156006
@@ -19226,32 +19226,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009454681273697711,
- "loss": 0.0122,
+ "loss": 0.0117,
"macro_f1": 0.3272727429866791,
"num_tokens": 3265718.0,
"repeat_count": 1.0,
- "routers_loss": 0.030219297856092453,
+ "routers_loss": 0.030749641358852386,
"skip_count": 0.0,
"step": 2024,
"text_loss": 0.18668225407600403
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 9.511887290871735,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.05419921875,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0009453274817903931,
- "loss": 0.0132,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.012,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3268158.0,
"repeat_count": 0.0,
- "routers_loss": 0.013256299309432507,
+ "routers_loss": 0.011538166552782059,
"skip_count": 1.0,
"step": 2026,
"text_loss": 0.34090787172317505
@@ -19264,13 +19264,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11572265625,
+ "grad_norm": 0.099609375,
"learning_rate": 0.000945186665560684,
- "loss": 0.0232,
+ "loss": 0.0218,
"macro_f1": 0.3333333432674408,
"num_tokens": 3271082.0,
"repeat_count": 0.0,
- "routers_loss": 0.009389489889144897,
+ "routers_loss": 0.009527760557830334,
"skip_count": 0.0,
"step": 2028,
"text_loss": 0.2110334187746048
@@ -19283,13 +19283,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.119140625,
"learning_rate": 0.000945045678734605,
- "loss": 0.0178,
+ "loss": 0.0175,
"macro_f1": 0.3144654333591461,
"num_tokens": 3273488.0,
"repeat_count": 0.0,
- "routers_loss": 0.03916877508163452,
+ "routers_loss": 0.03317151218652725,
"skip_count": 3.0,
"step": 2030,
"text_loss": 0.2233227640390396
@@ -19302,13 +19302,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.12451171875,
"learning_rate": 0.0009449045213661822,
- "loss": 0.0215,
+ "loss": 0.0201,
"macro_f1": 0.3272727429866791,
"num_tokens": 3276646.0,
"repeat_count": 0.0,
- "routers_loss": 0.019781047478318214,
+ "routers_loss": 0.018510591238737106,
"skip_count": 1.0,
"step": 2032,
"text_loss": 0.16100332140922546
@@ -19321,13 +19321,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.0009447631935095077,
- "loss": 0.0193,
+ "loss": 0.0185,
"macro_f1": 0.9452888369560242,
"num_tokens": 3279441.0,
"repeat_count": 1.0,
- "routers_loss": 0.02645993046462536,
+ "routers_loss": 0.028113311156630516,
"skip_count": 4.0,
"step": 2034,
"text_loss": 0.29208317399024963
@@ -19340,13 +19340,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009446216952187384,
- "loss": 0.0168,
+ "loss": 0.0164,
"macro_f1": 0.3333333432674408,
"num_tokens": 3282697.0,
"repeat_count": 0.0,
- "routers_loss": 0.008575125597417355,
+ "routers_loss": 0.008379172533750534,
"skip_count": 0.0,
"step": 2036,
"text_loss": 0.16026398539543152
@@ -19359,13 +19359,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0009444800265480967,
- "loss": 0.0184,
+ "loss": 0.0178,
"macro_f1": 0.3333333432674408,
"num_tokens": 3285574.0,
"repeat_count": 0.0,
- "routers_loss": 0.01042154710739851,
+ "routers_loss": 0.00941354501992464,
"skip_count": 0.0,
"step": 2038,
"text_loss": 0.29523080587387085
@@ -19378,13 +19378,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.8571428656578064,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.076171875,
"learning_rate": 0.0009443381875518703,
- "loss": 0.0206,
+ "loss": 0.0197,
"macro_f1": 0.8600732684135437,
"num_tokens": 3289159.0,
"repeat_count": 4.0,
- "routers_loss": 0.05496715381741524,
+ "routers_loss": 0.04974055662751198,
"skip_count": 6.0,
"step": 2040,
"text_loss": 0.23033179342746735
@@ -19397,13 +19397,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.0537109375,
"learning_rate": 0.0009441961782844123,
- "loss": 0.0149,
+ "loss": 0.0146,
"macro_f1": 0.3272727429866791,
"num_tokens": 3293598.0,
"repeat_count": 0.0,
- "routers_loss": 0.021722445264458656,
+ "routers_loss": 0.022241825237870216,
"skip_count": 1.0,
"step": 2042,
"text_loss": 0.8299165368080139
@@ -19416,13 +19416,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.0009440539988001408,
- "loss": 0.0161,
+ "loss": 0.0159,
"macro_f1": 0.3333333432674408,
"num_tokens": 3296648.0,
"repeat_count": 0.0,
- "routers_loss": 0.011090370826423168,
+ "routers_loss": 0.011019332334399223,
"skip_count": 0.0,
"step": 2044,
"text_loss": 0.18207129836082458
@@ -19435,13 +19435,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.041259765625,
"learning_rate": 0.0009439116491535394,
- "loss": 0.0123,
+ "loss": 0.0118,
"macro_f1": 0.3333333432674408,
"num_tokens": 3300058.0,
"repeat_count": 0.0,
- "routers_loss": 0.00327755743637681,
+ "routers_loss": 0.002889640862122178,
"skip_count": 0.0,
"step": 2046,
"text_loss": 0.7051978707313538
@@ -19454,13 +19454,13 @@
"f1_execute": 0.9333333373069763,
"f1_repeat": 0.5,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.078125,
"learning_rate": 0.0009437691293991563,
- "loss": 0.0198,
+ "loss": 0.0192,
"macro_f1": 0.7634921073913574,
"num_tokens": 3303296.0,
"repeat_count": 3.0,
- "routers_loss": 0.0807223841547966,
+ "routers_loss": 0.07741832733154297,
"skip_count": 4.0,
"step": 2048,
"text_loss": 0.15563532710075378
@@ -19473,13 +19473,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.09521484375,
"learning_rate": 0.0009436264395916061,
- "loss": 0.0218,
+ "loss": 0.0209,
"macro_f1": 0.6666666865348816,
"num_tokens": 3306204.0,
"repeat_count": 0.0,
- "routers_loss": 0.014681774191558361,
+ "routers_loss": 0.014225383289158344,
"skip_count": 2.0,
"step": 2050,
"text_loss": 0.18117287755012512
@@ -19492,13 +19492,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09326171875,
+ "grad_norm": 0.1416015625,
"learning_rate": 0.0009434835797855672,
- "loss": 0.0166,
+ "loss": 0.0165,
"macro_f1": 0.3333333432674408,
"num_tokens": 3309444.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025602662935853004,
+ "routers_loss": 0.0023932650219649076,
"skip_count": 0.0,
"step": 2052,
"text_loss": 0.4645874798297882
@@ -19511,13 +19511,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05810546875,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0009433405500357839,
- "loss": 0.0148,
+ "loss": 0.0153,
"macro_f1": 0.3272727429866791,
"num_tokens": 3312488.0,
"repeat_count": 0.0,
- "routers_loss": 0.03283753618597984,
+ "routers_loss": 0.03193361684679985,
"skip_count": 1.0,
"step": 2054,
"text_loss": 0.5291082859039307
@@ -19530,13 +19530,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.062255859375,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0009431973503970655,
- "loss": 0.0138,
+ "loss": 0.0134,
"macro_f1": 0.3333333432674408,
"num_tokens": 3315765.0,
"repeat_count": 0.0,
- "routers_loss": 0.002137230010703206,
+ "routers_loss": 0.0020529816392809153,
"skip_count": 0.0,
"step": 2056,
"text_loss": 0.5877931118011475
@@ -19549,13 +19549,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08251953125,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009430539809242864,
- "loss": 0.0199,
+ "loss": 0.0185,
"macro_f1": 0.32098764181137085,
"num_tokens": 3318877.0,
"repeat_count": 2.0,
- "routers_loss": 0.07938452064990997,
+ "routers_loss": 0.07907948642969131,
"skip_count": 0.0,
"step": 2058,
"text_loss": 0.3836737871170044
@@ -19568,13 +19568,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009429104416723862,
- "loss": 0.0164,
+ "loss": 0.0163,
"macro_f1": 0.6666666865348816,
"num_tokens": 3322576.0,
"repeat_count": 2.0,
- "routers_loss": 0.003832251997664571,
+ "routers_loss": 0.003006070153787732,
"skip_count": 0.0,
"step": 2060,
"text_loss": 0.3480920195579529
@@ -19587,13 +19587,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.045166015625,
"learning_rate": 0.0009427667326963689,
- "loss": 0.0131,
+ "loss": 0.0127,
"macro_f1": 0.3333333432674408,
"num_tokens": 3325974.0,
"repeat_count": 0.0,
- "routers_loss": 0.006192604545503855,
+ "routers_loss": 0.005013179033994675,
"skip_count": 0.0,
"step": 2062,
"text_loss": 0.931358814239502
@@ -19606,13 +19606,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09375,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009426228540513047,
"loss": 0.0206,
"macro_f1": 0.3333333432674408,
"num_tokens": 3329398.0,
"repeat_count": 0.0,
- "routers_loss": 0.008115313947200775,
+ "routers_loss": 0.0059848143719136715,
"skip_count": 0.0,
"step": 2064,
"text_loss": 0.47568953037261963
@@ -19625,13 +19625,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009424788057923277,
- "loss": 0.0127,
+ "loss": 0.0131,
"macro_f1": 0.3333333432674408,
"num_tokens": 3332029.0,
"repeat_count": 0.0,
- "routers_loss": 0.007599714212119579,
+ "routers_loss": 0.00783882662653923,
"skip_count": 0.0,
"step": 2066,
"text_loss": 0.22887596487998962
@@ -19644,13 +19644,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0009423345879746376,
- "loss": 0.0126,
+ "loss": 0.0128,
"macro_f1": 0.5492662787437439,
"num_tokens": 3334858.0,
"repeat_count": 0.0,
- "routers_loss": 0.016804348677396774,
+ "routers_loss": 0.01866884157061577,
"skip_count": 2.0,
"step": 2068,
"text_loss": 0.17724967002868652
@@ -19663,13 +19663,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.06591796875,
"learning_rate": 0.000942190200653499,
- "loss": 0.0164,
+ "loss": 0.0162,
"macro_f1": 0.32098764181137085,
"num_tokens": 3338094.0,
"repeat_count": 0.0,
- "routers_loss": 0.02686731517314911,
+ "routers_loss": 0.028636593371629715,
"skip_count": 2.0,
"step": 2070,
"text_loss": 0.34344956278800964
@@ -19682,13 +19682,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.07568359375,
"learning_rate": 0.0009420456438842413,
- "loss": 0.0172,
+ "loss": 0.0165,
"macro_f1": 0.5492662787437439,
"num_tokens": 3340526.0,
"repeat_count": 0.0,
- "routers_loss": 0.025320913642644882,
+ "routers_loss": 0.023245645686984062,
"skip_count": 2.0,
"step": 2072,
"text_loss": 0.7276164293289185
@@ -19701,13 +19701,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.11328125,
"learning_rate": 0.000941900917722259,
- "loss": 0.0145,
+ "loss": 0.0143,
"macro_f1": 0.3272727429866791,
"num_tokens": 3343303.0,
"repeat_count": 1.0,
- "routers_loss": 0.014900023117661476,
+ "routers_loss": 0.01565689593553543,
"skip_count": 0.0,
"step": 2074,
"text_loss": 0.5665070414543152
@@ -19720,13 +19720,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009417560222230115,
- "loss": 0.0244,
+ "loss": 0.0245,
"macro_f1": 0.3333333432674408,
"num_tokens": 3346409.0,
"repeat_count": 0.0,
- "routers_loss": 0.003426895011216402,
+ "routers_loss": 0.0035056080669164658,
"skip_count": 0.0,
"step": 2076,
"text_loss": 0.5112795233726501
@@ -19739,13 +19739,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0712890625,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0009416109574420229,
- "loss": 0.0136,
+ "loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 3349220.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031935563310980797,
+ "routers_loss": 0.0027565446216613054,
"skip_count": 0.0,
"step": 2078,
"text_loss": 0.5240910053253174
@@ -19758,13 +19758,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.08203125,
"learning_rate": 0.0009414657234348823,
- "loss": 0.0183,
+ "loss": 0.0186,
"macro_f1": 1.0,
"num_tokens": 3352627.0,
"repeat_count": 3.0,
- "routers_loss": 0.016454946249723434,
+ "routers_loss": 0.01652451977133751,
"skip_count": 2.0,
"step": 2080,
"text_loss": 1.0217112302780151
@@ -19777,13 +19777,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009413203202572438,
- "loss": 0.0174,
+ "loss": 0.0179,
"macro_f1": 0.32098764181137085,
"num_tokens": 3355392.0,
"repeat_count": 0.0,
- "routers_loss": 0.1056143268942833,
+ "routers_loss": 0.1012420505285263,
"skip_count": 2.0,
"step": 2082,
"text_loss": 0.4085482358932495
@@ -19796,13 +19796,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07373046875,
+ "grad_norm": 0.08251953125,
"learning_rate": 0.000941174747964826,
- "loss": 0.016,
+ "loss": 0.0154,
"macro_f1": 0.3333333432674408,
"num_tokens": 3358425.0,
"repeat_count": 0.0,
- "routers_loss": 0.003626141929998994,
+ "routers_loss": 0.004962718114256859,
"skip_count": 0.0,
"step": 2084,
"text_loss": 0.5833504796028137
@@ -19810,18 +19810,18 @@
{
"acc_repeat": 0.5,
"acc_skip": 0.6666666865348816,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 9.793660111535075,
- "f1_execute": 0.936170220375061,
+ "f1_execute": 0.9583333134651184,
"f1_repeat": 0.6666666865348816,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.107421875,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.11376953125,
"learning_rate": 0.0009410290066134124,
- "loss": 0.0216,
- "macro_f1": 0.7565011978149414,
+ "loss": 0.0211,
+ "macro_f1": 0.8083333373069763,
"num_tokens": 3361925.0,
"repeat_count": 2.0,
- "routers_loss": 0.08091846853494644,
+ "routers_loss": 0.07889176905155182,
"skip_count": 3.0,
"step": 2086,
"text_loss": 0.38126569986343384
@@ -19834,13 +19834,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.056884765625,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0009408830962588517,
- "loss": 0.0197,
+ "loss": 0.0195,
"macro_f1": 0.6601307392120361,
"num_tokens": 3365963.0,
"repeat_count": 1.0,
- "routers_loss": 0.035208042711019516,
+ "routers_loss": 0.033715736120939255,
"skip_count": 2.0,
"step": 2088,
"text_loss": 0.23213914036750793
@@ -19853,13 +19853,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07958984375,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009407370169570567,
- "loss": 0.0167,
+ "loss": 0.0169,
"macro_f1": 0.3333333432674408,
"num_tokens": 3369422.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018934847321361303,
+ "routers_loss": 0.0014188943896442652,
"skip_count": 0.0,
"step": 2090,
"text_loss": 0.4648318886756897
@@ -19872,13 +19872,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0009405907687640054,
- "loss": 0.0132,
+ "loss": 0.013,
"macro_f1": 0.3272727429866791,
"num_tokens": 3372506.0,
"repeat_count": 0.0,
- "routers_loss": 0.016075141727924347,
+ "routers_loss": 0.015339684672653675,
"skip_count": 1.0,
"step": 2092,
"text_loss": 0.2563800811767578
@@ -19891,13 +19891,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.054443359375,
"learning_rate": 0.0009404443517357404,
"loss": 0.0146,
"macro_f1": 0.542222261428833,
"num_tokens": 3375653.0,
"repeat_count": 4.0,
- "routers_loss": 0.06333976984024048,
+ "routers_loss": 0.06562861055135727,
"skip_count": 0.0,
"step": 2094,
"text_loss": 0.797835111618042
@@ -19910,13 +19910,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.060546875,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.000940297765928369,
- "loss": 0.0133,
+ "loss": 0.0136,
"macro_f1": 0.3333333432674408,
"num_tokens": 3379018.0,
"repeat_count": 0.0,
- "routers_loss": 0.005521406419575214,
+ "routers_loss": 0.005745889153331518,
"skip_count": 0.0,
"step": 2096,
"text_loss": 0.4238114655017853
@@ -19929,13 +19929,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06103515625,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0009401510113980631,
- "loss": 0.0205,
+ "loss": 0.0207,
"macro_f1": 0.3333333432674408,
"num_tokens": 3382855.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025159218348562717,
+ "routers_loss": 0.0026634482201188803,
"skip_count": 0.0,
"step": 2098,
"text_loss": 0.4967166483402252
@@ -19948,13 +19948,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009400040882010592,
- "loss": 0.0172,
+ "loss": 0.0166,
"macro_f1": 0.3333333432674408,
"num_tokens": 3386386.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025535966269671917,
+ "routers_loss": 0.0020642587915062904,
"skip_count": 0.0,
"step": 2100,
"text_loss": 0.44390562176704407
@@ -19967,13 +19967,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.056640625,
"learning_rate": 0.0009398569963936589,
- "loss": 0.0178,
+ "loss": 0.017,
"macro_f1": 0.3272727429866791,
"num_tokens": 3389958.0,
"repeat_count": 0.0,
- "routers_loss": 0.013569516129791737,
+ "routers_loss": 0.013722737319767475,
"skip_count": 1.0,
"step": 2102,
"text_loss": 0.7207565903663635
@@ -19986,13 +19986,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.08837890625,
"learning_rate": 0.0009397097360322276,
- "loss": 0.0175,
+ "loss": 0.017,
"macro_f1": 0.3333333432674408,
"num_tokens": 3392892.0,
"repeat_count": 0.0,
- "routers_loss": 0.0044935219921171665,
+ "routers_loss": 0.002051608171314001,
"skip_count": 0.0,
"step": 2104,
"text_loss": 0.3196398913860321
@@ -20005,13 +20005,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.000939562307173196,
- "loss": 0.0223,
+ "loss": 0.022,
"macro_f1": 0.3333333432674408,
"num_tokens": 3396636.0,
"repeat_count": 0.0,
- "routers_loss": 0.007407462690025568,
+ "routers_loss": 0.007085663266479969,
"skip_count": 0.0,
"step": 2106,
"text_loss": 0.5663776397705078
@@ -20024,13 +20024,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.13671875,
+ "grad_norm": 0.11328125,
"learning_rate": 0.0009394147098730592,
- "loss": 0.0205,
+ "loss": 0.02,
"macro_f1": 0.5492662787437439,
"num_tokens": 3399475.0,
"repeat_count": 0.0,
- "routers_loss": 0.024386432021856308,
+ "routers_loss": 0.019473131746053696,
"skip_count": 2.0,
"step": 2108,
"text_loss": 0.7708223462104797
@@ -20043,32 +20043,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.0009392669441883767,
- "loss": 0.0135,
+ "loss": 0.0134,
"macro_f1": 0.3333333432674408,
"num_tokens": 3402350.0,
"repeat_count": 0.0,
- "routers_loss": 0.002929724520072341,
+ "routers_loss": 0.0028328890912234783,
"skip_count": 0.0,
"step": 2110,
"text_loss": 0.5888006091117859
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 9.915761667155856,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.1201171875,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009391190101757724,
- "loss": 0.0168,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0166,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3405561.0,
"repeat_count": 0.0,
- "routers_loss": 0.026861928403377533,
+ "routers_loss": 0.023098422214388847,
"skip_count": 2.0,
"step": 2112,
"text_loss": 0.09865197539329529
@@ -20081,13 +20081,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.10107421875,
"learning_rate": 0.000938970907891935,
- "loss": 0.0251,
+ "loss": 0.0247,
"macro_f1": 0.3333333432674408,
"num_tokens": 3408513.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025369988288730383,
+ "routers_loss": 0.002896632067859173,
"skip_count": 0.0,
"step": 2114,
"text_loss": 0.6613234281539917
@@ -20100,51 +20100,51 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09423828125,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009388226373936179,
- "loss": 0.0209,
+ "loss": 0.0211,
"macro_f1": 0.3333333432674408,
"num_tokens": 3411195.0,
"repeat_count": 0.0,
- "routers_loss": 0.014292459934949875,
+ "routers_loss": 0.015814457088708878,
"skip_count": 0.0,
"step": 2116,
"text_loss": 0.17363053560256958
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 9.94393894922219,
- "f1_execute": 0.9629629850387573,
- "f1_repeat": 0.0,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1181640625,
+ "grad_norm": 0.12451171875,
"learning_rate": 0.0009386741987376381,
- "loss": 0.0151,
- "macro_f1": 0.32098767161369324,
+ "loss": 0.015,
+ "macro_f1": 0.6603773832321167,
"num_tokens": 3414875.0,
"repeat_count": 1.0,
- "routers_loss": 0.027571436017751694,
+ "routers_loss": 0.02676783688366413,
"skip_count": 0.0,
"step": 2118,
"text_loss": 0.674056887626648
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 9.953331376577633,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.08349609375,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0009385255919808778,
- "loss": 0.0205,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0203,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3418410.0,
"repeat_count": 0.0,
- "routers_loss": 0.011719600297510624,
+ "routers_loss": 0.01022857241332531,
"skip_count": 1.0,
"step": 2120,
"text_loss": 0.235092431306839
@@ -20157,13 +20157,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09375,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009383768171802836,
- "loss": 0.0249,
+ "loss": 0.0244,
"macro_f1": 0.5492662787437439,
"num_tokens": 3421289.0,
"repeat_count": 0.0,
- "routers_loss": 0.01207603607326746,
+ "routers_loss": 0.013572212308645248,
"skip_count": 2.0,
"step": 2122,
"text_loss": 0.5992844104766846
@@ -20176,13 +20176,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.060791015625,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0009382278743928659,
- "loss": 0.0206,
+ "loss": 0.0201,
"macro_f1": 0.6666666865348816,
"num_tokens": 3424781.0,
"repeat_count": 0.0,
- "routers_loss": 0.008004254661500454,
+ "routers_loss": 0.0051873656921088696,
"skip_count": 2.0,
"step": 2124,
"text_loss": 0.29915499687194824
@@ -20195,13 +20195,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.07421875,
"learning_rate": 0.0009380787636757001,
- "loss": 0.0156,
+ "loss": 0.0155,
"macro_f1": 0.6122449040412903,
"num_tokens": 3427942.0,
"repeat_count": 0.0,
- "routers_loss": 0.030767880380153656,
+ "routers_loss": 0.030079292133450508,
"skip_count": 4.0,
"step": 2126,
"text_loss": 0.24181491136550903
@@ -20214,13 +20214,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06201171875,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0009379294850859256,
"loss": 0.0141,
"macro_f1": 0.3333333432674408,
"num_tokens": 3431314.0,
"repeat_count": 0.0,
- "routers_loss": 0.002620625076815486,
+ "routers_loss": 0.002675612922757864,
"skip_count": 0.0,
"step": 2128,
"text_loss": 0.4669873118400574
@@ -20233,13 +20233,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09033203125,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009377800386807465,
- "loss": 0.0175,
+ "loss": 0.0177,
"macro_f1": 0.3333333432674408,
"num_tokens": 3435020.0,
"repeat_count": 0.0,
- "routers_loss": 0.009095560759305954,
+ "routers_loss": 0.009334275498986244,
"skip_count": 0.0,
"step": 2130,
"text_loss": 0.6478219628334045
@@ -20252,13 +20252,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009376304245174306,
- "loss": 0.0143,
+ "loss": 0.0137,
"macro_f1": 0.6000000238418579,
"num_tokens": 3438276.0,
"repeat_count": 1.0,
- "routers_loss": 0.058448426425457,
+ "routers_loss": 0.038227908313274384,
"skip_count": 2.0,
"step": 2132,
"text_loss": 0.4401201903820038
@@ -20271,13 +20271,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.046875,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0009374806426533104,
- "loss": 0.0116,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 3440938.0,
"repeat_count": 0.0,
- "routers_loss": 0.007323687430471182,
+ "routers_loss": 0.006901399698108435,
"skip_count": 0.0,
"step": 2134,
"text_loss": 0.5948942303657532
@@ -20290,13 +20290,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051513671875,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009373306931457827,
- "loss": 0.0122,
+ "loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 3444028.0,
"repeat_count": 0.0,
- "routers_loss": 0.003302243771031499,
+ "routers_loss": 0.0037061909679323435,
"skip_count": 0.0,
"step": 2136,
"text_loss": 0.5349751114845276
@@ -20309,13 +20309,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047607421875,
+ "grad_norm": 0.056884765625,
"learning_rate": 0.0009371805760523086,
- "loss": 0.0113,
+ "loss": 0.0111,
"macro_f1": 0.3333333432674408,
"num_tokens": 3448331.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027974818367511034,
+ "routers_loss": 0.0025877030566334724,
"skip_count": 0.0,
"step": 2138,
"text_loss": 0.4591051936149597
@@ -20328,13 +20328,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009370302914304129,
- "loss": 0.0145,
+ "loss": 0.0144,
"macro_f1": 0.5934640765190125,
"num_tokens": 3451434.0,
"repeat_count": 0.0,
- "routers_loss": 0.01572767272591591,
+ "routers_loss": 0.018742674961686134,
"skip_count": 3.0,
"step": 2140,
"text_loss": 0.23470863699913025
@@ -20347,13 +20347,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06201171875,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009368798393376851,
- "loss": 0.0119,
+ "loss": 0.0122,
"macro_f1": 0.3272727429866791,
"num_tokens": 3454375.0,
"repeat_count": 0.0,
- "routers_loss": 0.020721890032291412,
+ "routers_loss": 0.02382594160735607,
"skip_count": 1.0,
"step": 2142,
"text_loss": 0.6077954769134521
@@ -20366,13 +20366,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.05517578125,
"learning_rate": 0.0009367292198317787,
- "loss": 0.0161,
+ "loss": 0.0164,
"macro_f1": 0.5492662787437439,
"num_tokens": 3457591.0,
"repeat_count": 0.0,
- "routers_loss": 0.03272393345832825,
+ "routers_loss": 0.03331060707569122,
"skip_count": 2.0,
"step": 2144,
"text_loss": 0.3691073954105377
@@ -20385,13 +20385,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052490234375,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0009365784329704115,
- "loss": 0.0191,
+ "loss": 0.0186,
"macro_f1": 0.3333333432674408,
"num_tokens": 3460895.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017473002662882209,
+ "routers_loss": 0.0016955457394942641,
"skip_count": 0.0,
"step": 2146,
"text_loss": 0.3947436511516571
@@ -20404,13 +20404,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.0009364274788113651,
- "loss": 0.0094,
+ "loss": 0.0096,
"macro_f1": 0.6666666865348816,
"num_tokens": 3464101.0,
"repeat_count": 1.0,
- "routers_loss": 0.008070237934589386,
+ "routers_loss": 0.006169239990413189,
"skip_count": 0.0,
"step": 2148,
"text_loss": 0.3348555266857147
@@ -20423,13 +20423,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0009362763574124858,
- "loss": 0.0191,
+ "loss": 0.019,
"macro_f1": 0.9265305995941162,
"num_tokens": 3467417.0,
"repeat_count": 3.0,
- "routers_loss": 0.021709222346544266,
+ "routers_loss": 0.024033790454268456,
"skip_count": 1.0,
"step": 2150,
"text_loss": 0.496633380651474
@@ -20442,13 +20442,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.046630859375,
+ "grad_norm": 0.042724609375,
"learning_rate": 0.0009361250688316829,
- "loss": 0.014,
+ "loss": 0.0142,
"macro_f1": 0.3333333432674408,
"num_tokens": 3470917.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022237664088606834,
+ "routers_loss": 0.0024986129719763994,
"skip_count": 0.0,
"step": 2152,
"text_loss": 0.6857671737670898
@@ -20461,13 +20461,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.0546875,
"learning_rate": 0.0009359736131269312,
"loss": 0.0153,
"macro_f1": 0.6666666865348816,
"num_tokens": 3473624.0,
"repeat_count": 0.0,
- "routers_loss": 0.00838750321418047,
+ "routers_loss": 0.008183322846889496,
"skip_count": 1.0,
"step": 2154,
"text_loss": 0.13883116841316223
@@ -20480,13 +20480,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0576171875,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009358219903562684,
- "loss": 0.01,
+ "loss": 0.0106,
"macro_f1": 0.6666666865348816,
"num_tokens": 3476472.0,
"repeat_count": 0.0,
- "routers_loss": 0.010190514847636223,
+ "routers_loss": 0.011198793537914753,
"skip_count": 3.0,
"step": 2156,
"text_loss": 0.24243666231632233
@@ -20499,13 +20499,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.04296875,
"learning_rate": 0.0009356702005777969,
- "loss": 0.0124,
+ "loss": 0.0125,
"macro_f1": 0.3333333432674408,
"num_tokens": 3479688.0,
"repeat_count": 0.0,
- "routers_loss": 0.002411153633147478,
+ "routers_loss": 0.002520184963941574,
"skip_count": 0.0,
"step": 2158,
"text_loss": 0.6407818794250488
@@ -20518,13 +20518,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009355182438496825,
- "loss": 0.0141,
+ "loss": 0.0142,
"macro_f1": 0.3333333432674408,
"num_tokens": 3482598.0,
"repeat_count": 0.0,
- "routers_loss": 0.001032356172800064,
+ "routers_loss": 0.0011065017897635698,
"skip_count": 0.0,
"step": 2160,
"text_loss": 0.7214245796203613
@@ -20537,13 +20537,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05908203125,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0009353661202301557,
- "loss": 0.0147,
+ "loss": 0.0144,
"macro_f1": 0.3333333432674408,
"num_tokens": 3486271.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022046815138310194,
+ "routers_loss": 0.0017824085662141442,
"skip_count": 0.0,
"step": 2162,
"text_loss": 0.5140969157218933
@@ -20556,32 +20556,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051513671875,
+ "grad_norm": 0.053466796875,
"learning_rate": 0.0009352138297775101,
"loss": 0.0145,
"macro_f1": 0.3333333432674408,
"num_tokens": 3489206.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014977266546338797,
+ "routers_loss": 0.001542879967018962,
"skip_count": 0.0,
"step": 2164,
"text_loss": 0.7956416606903076
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6666666865348816,
- "avg_layers": 26.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
"epoch": 10.169063692398003,
- "f1_execute": 0.9803921580314636,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.800000011920929,
+ "f1_skip": 1.0,
"grad_norm": 0.0771484375,
"learning_rate": 0.000935061372550104,
- "loss": 0.0132,
- "macro_f1": 0.5934640765190125,
+ "loss": 0.0134,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3492003.0,
"repeat_count": 0.0,
- "routers_loss": 0.016847684979438782,
+ "routers_loss": 0.01420794241130352,
"skip_count": 3.0,
"step": 2166,
"text_loss": 0.27489882707595825
@@ -20594,13 +20594,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0009349087486063594,
- "loss": 0.0168,
+ "loss": 0.0166,
"macro_f1": 0.6666666865348816,
"num_tokens": 3494784.0,
"repeat_count": 0.0,
- "routers_loss": 0.0036806222051382065,
+ "routers_loss": 0.003614309709519148,
"skip_count": 1.0,
"step": 2168,
"text_loss": 0.2962227761745453
@@ -20613,13 +20613,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009347559580047618,
- "loss": 0.0174,
+ "loss": 0.0175,
"macro_f1": 0.8814815282821655,
"num_tokens": 3497886.0,
"repeat_count": 2.0,
- "routers_loss": 0.021412594243884087,
+ "routers_loss": 0.02122853323817253,
"skip_count": 4.0,
"step": 2170,
"text_loss": 0.5919580459594727
@@ -20627,18 +20627,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 10.197240974464338,
- "f1_execute": 1.0,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.06591796875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.000934603000803861,
- "loss": 0.0134,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0135,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 3500939.0,
"repeat_count": 0.0,
- "routers_loss": 0.0201424453407526,
+ "routers_loss": 0.02042219042778015,
"skip_count": 1.0,
"step": 2172,
"text_loss": 0.28722381591796875
@@ -20651,13 +20651,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05419921875,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009344498770622704,
- "loss": 0.0131,
+ "loss": 0.013,
"macro_f1": 0.3333333432674408,
"num_tokens": 3504852.0,
"repeat_count": 0.0,
- "routers_loss": 0.005059401970356703,
+ "routers_loss": 0.004345106892287731,
"skip_count": 0.0,
"step": 2174,
"text_loss": 0.603236734867096
@@ -20670,13 +20670,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.1064453125,
"learning_rate": 0.0009342965868386673,
"loss": 0.0101,
"macro_f1": 0.3333333432674408,
"num_tokens": 3508320.0,
"repeat_count": 0.0,
- "routers_loss": 0.004006600938737392,
+ "routers_loss": 0.00368050136603415,
"skip_count": 0.0,
"step": 2176,
"text_loss": 0.6020491719245911
@@ -20691,11 +20691,11 @@
"f1_skip": 0.0,
"grad_norm": 0.060302734375,
"learning_rate": 0.000934143130191793,
- "loss": 0.0109,
+ "loss": 0.0108,
"macro_f1": 0.3333333432674408,
"num_tokens": 3511278.0,
"repeat_count": 0.0,
- "routers_loss": 0.013246738351881504,
+ "routers_loss": 0.013425769284367561,
"skip_count": 0.0,
"step": 2178,
"text_loss": 0.5954724550247192
@@ -20708,13 +20708,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06005859375,
+ "grad_norm": 0.060546875,
"learning_rate": 0.000933989507180452,
- "loss": 0.0151,
+ "loss": 0.0149,
"macro_f1": 0.3333333432674408,
"num_tokens": 3514361.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031937146559357643,
+ "routers_loss": 0.002896249992772937,
"skip_count": 0.0,
"step": 2180,
"text_loss": 0.39175131916999817
@@ -20727,13 +20727,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0556640625,
+ "grad_norm": 0.052978515625,
"learning_rate": 0.0009338357178635135,
- "loss": 0.0151,
+ "loss": 0.0147,
"macro_f1": 0.6603773832321167,
"num_tokens": 3517962.0,
"repeat_count": 1.0,
- "routers_loss": 0.014782631769776344,
+ "routers_loss": 0.011538350023329258,
"skip_count": 1.0,
"step": 2182,
"text_loss": 0.4482830762863159
@@ -20746,13 +20746,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0009336817622999093,
- "loss": 0.0112,
+ "loss": 0.011,
"macro_f1": 0.3272727429866791,
"num_tokens": 3521299.0,
"repeat_count": 1.0,
- "routers_loss": 0.02318345196545124,
+ "routers_loss": 0.022787930443882942,
"skip_count": 0.0,
"step": 2184,
"text_loss": 0.35177817940711975
@@ -20765,13 +20765,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.055419921875,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009335276405486357,
- "loss": 0.0134,
+ "loss": 0.0139,
"macro_f1": 0.3272727429866791,
"num_tokens": 3524611.0,
"repeat_count": 0.0,
- "routers_loss": 0.011735675856471062,
+ "routers_loss": 0.011597735807299614,
"skip_count": 1.0,
"step": 2186,
"text_loss": 0.24868851900100708
@@ -20784,13 +20784,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009333733526687524,
- "loss": 0.0198,
+ "loss": 0.0196,
"macro_f1": 0.3333333432674408,
"num_tokens": 3528012.0,
"repeat_count": 0.0,
- "routers_loss": 0.01558679062873125,
+ "routers_loss": 0.014253967441618443,
"skip_count": 0.0,
"step": 2188,
"text_loss": 0.3970910310745239
@@ -20803,13 +20803,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056396484375,
+ "grad_norm": 0.054931640625,
"learning_rate": 0.000933218898719383,
- "loss": 0.0163,
+ "loss": 0.0162,
"macro_f1": 0.3333333432674408,
"num_tokens": 3530908.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019149131840094924,
+ "routers_loss": 0.001659149187617004,
"skip_count": 0.0,
"step": 2190,
"text_loss": 0.7618573307991028
@@ -20822,13 +20822,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07958984375,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009330642787597141,
- "loss": 0.0161,
+ "loss": 0.0159,
"macro_f1": 0.3333333432674408,
"num_tokens": 3533993.0,
"repeat_count": 0.0,
- "routers_loss": 0.0056966920383274555,
+ "routers_loss": 0.005574346985667944,
"skip_count": 0.0,
"step": 2192,
"text_loss": 0.16470147669315338
@@ -20841,13 +20841,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07080078125,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009329094928489969,
"loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 3537310.0,
"repeat_count": 0.0,
- "routers_loss": 0.002511024009436369,
+ "routers_loss": 0.0026400673668831587,
"skip_count": 0.0,
"step": 2194,
"text_loss": 0.3400416374206543
@@ -20860,13 +20860,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08935546875,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009327545410465452,
- "loss": 0.0126,
+ "loss": 0.0124,
"macro_f1": 0.6666666865348816,
"num_tokens": 3540045.0,
"repeat_count": 0.0,
- "routers_loss": 0.008584192954003811,
+ "routers_loss": 0.008448398672044277,
"skip_count": 3.0,
"step": 2196,
"text_loss": 0.3110542297363281
@@ -20879,13 +20879,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.0009325994234117372,
- "loss": 0.0129,
+ "loss": 0.0122,
"macro_f1": 0.32098764181137085,
"num_tokens": 3544097.0,
"repeat_count": 0.0,
- "routers_loss": 0.03748156875371933,
+ "routers_loss": 0.037553198635578156,
"skip_count": 2.0,
"step": 2198,
"text_loss": 0.36126700043678284
@@ -20898,13 +20898,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.09716796875,
"learning_rate": 0.000932444140004014,
- "loss": 0.0129,
+ "loss": 0.0124,
"macro_f1": 0.6666666865348816,
"num_tokens": 3547054.0,
"repeat_count": 1.0,
- "routers_loss": 0.006402099970728159,
+ "routers_loss": 0.006464479025453329,
"skip_count": 0.0,
"step": 2200,
"text_loss": 0.4947047233581543
@@ -20917,13 +20917,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.158203125,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009322886908828805,
- "loss": 0.015,
+ "loss": 0.0138,
"macro_f1": 0.6666666865348816,
"num_tokens": 3549903.0,
"repeat_count": 1.0,
- "routers_loss": 0.0055928584188222885,
+ "routers_loss": 0.005384812597185373,
"skip_count": 0.0,
"step": 2202,
"text_loss": 0.5923738479614258
@@ -20936,13 +20936,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009321330761079052,
"loss": 0.0149,
"macro_f1": 0.6666666865348816,
"num_tokens": 3553745.0,
"repeat_count": 0.0,
- "routers_loss": 0.013155708089470863,
+ "routers_loss": 0.015346619300544262,
"skip_count": 2.0,
"step": 2204,
"text_loss": 0.1904175877571106
@@ -20955,13 +20955,13 @@
"f1_execute": 0.9268292784690857,
"f1_repeat": 0.800000011920929,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.06494140625,
"learning_rate": 0.00093197729573872,
- "loss": 0.0206,
+ "loss": 0.0203,
"macro_f1": 0.8422764539718628,
"num_tokens": 3557235.0,
"repeat_count": 3.0,
- "routers_loss": 0.12029488384723663,
+ "routers_loss": 0.1207597479224205,
"skip_count": 6.0,
"step": 2206,
"text_loss": 0.3904837667942047
@@ -20974,13 +20974,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0771484375,
"learning_rate": 0.0009318213498350202,
- "loss": 0.011,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3560795.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037007431965321302,
+ "routers_loss": 0.003334777895361185,
"skip_count": 0.0,
"step": 2208,
"text_loss": 0.4268290102481842
@@ -20993,13 +20993,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.0537109375,
"learning_rate": 0.0009316652384565645,
- "loss": 0.0124,
+ "loss": 0.0123,
"macro_f1": 0.3333333432674408,
"num_tokens": 3563754.0,
"repeat_count": 0.0,
- "routers_loss": 0.004071404226124287,
+ "routers_loss": 0.004230072256177664,
"skip_count": 0.0,
"step": 2210,
"text_loss": 0.40049710869789124
@@ -21012,13 +21012,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.046875,
"learning_rate": 0.0009315089616631751,
- "loss": 0.0103,
+ "loss": 0.0106,
"macro_f1": 0.3333333432674408,
"num_tokens": 3567173.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006955390563234687,
+ "routers_loss": 0.0006645230459980667,
"skip_count": 0.0,
"step": 2212,
"text_loss": 0.42568323016166687
@@ -21031,32 +21031,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0849609375,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009313525195147376,
- "loss": 0.0128,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 3570831.0,
"repeat_count": 0.0,
- "routers_loss": 0.010293997824192047,
+ "routers_loss": 0.0097877848893404,
"skip_count": 0.0,
"step": 2214,
"text_loss": 0.45808279514312744
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.5,
"acc_skip": 0.3333333432674408,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 10.40387437628412,
- "f1_execute": 0.9583333134651184,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 0.5,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.076171875,
"learning_rate": 0.000931195912071201,
- "loss": 0.0185,
- "macro_f1": 0.8194444179534912,
+ "loss": 0.0187,
+ "macro_f1": 0.7018141150474548,
"num_tokens": 3573745.0,
"repeat_count": 2.0,
- "routers_loss": 0.06593773514032364,
+ "routers_loss": 0.07351134717464447,
"skip_count": 3.0,
"step": 2216,
"text_loss": 0.285696804523468
@@ -21069,13 +21069,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009310391393925775,
- "loss": 0.013,
+ "loss": 0.0125,
"macro_f1": 0.3333333432674408,
"num_tokens": 3576785.0,
"repeat_count": 0.0,
- "routers_loss": 0.00347105972468853,
+ "routers_loss": 0.0033160944003611803,
"skip_count": 0.0,
"step": 2218,
"text_loss": 0.17516443133354187
@@ -21088,32 +21088,32 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.04736328125,
+ "grad_norm": 0.047119140625,
"learning_rate": 0.0009308822015389424,
- "loss": 0.0244,
+ "loss": 0.0241,
"macro_f1": 0.5427350401878357,
"num_tokens": 3580695.0,
"repeat_count": 1.0,
- "routers_loss": 0.04871147498488426,
+ "routers_loss": 0.052930232137441635,
"skip_count": 1.0,
"step": 2220,
"text_loss": 0.5918155908584595
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 26.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
"epoch": 10.432051658350455,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.05517578125,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.072265625,
"learning_rate": 0.0009307250985704352,
- "loss": 0.012,
- "macro_f1": 0.542222261428833,
+ "loss": 0.0128,
+ "macro_f1": 0.6122449040412903,
"num_tokens": 3583729.0,
"repeat_count": 0.0,
- "routers_loss": 0.024859672412276268,
+ "routers_loss": 0.025454653427004814,
"skip_count": 4.0,
"step": 2222,
"text_loss": 0.2652169466018677
@@ -21126,13 +21126,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.052001953125,
"learning_rate": 0.0009305678305472575,
- "loss": 0.016,
+ "loss": 0.0158,
"macro_f1": 0.3333333432674408,
"num_tokens": 3586775.0,
"repeat_count": 0.0,
- "routers_loss": 0.010990055277943611,
+ "routers_loss": 0.011279845610260963,
"skip_count": 0.0,
"step": 2224,
"text_loss": 0.3511691987514496
@@ -21145,13 +21145,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.000930410397529675,
- "loss": 0.0171,
+ "loss": 0.017,
"macro_f1": 0.3333333432674408,
"num_tokens": 3589676.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025031559634953737,
+ "routers_loss": 0.002700264798477292,
"skip_count": 0.0,
"step": 2226,
"text_loss": 0.24045433104038239
@@ -21164,13 +21164,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.048095703125,
"learning_rate": 0.000930252799578016,
- "loss": 0.0147,
+ "loss": 0.0146,
"macro_f1": 1.0,
"num_tokens": 3593242.0,
"repeat_count": 1.0,
- "routers_loss": 0.008100497536361217,
+ "routers_loss": 0.00826631672680378,
"skip_count": 2.0,
"step": 2228,
"text_loss": 0.3777645528316498
@@ -21183,13 +21183,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0009300950367526728,
- "loss": 0.0128,
+ "loss": 0.0131,
"macro_f1": 0.8820862174034119,
"num_tokens": 3596807.0,
"repeat_count": 2.0,
- "routers_loss": 0.03150207921862602,
+ "routers_loss": 0.036221496760845184,
"skip_count": 2.0,
"step": 2230,
"text_loss": 0.502962589263916
@@ -21202,13 +21202,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009299371091141001,
- "loss": 0.0132,
+ "loss": 0.0131,
"macro_f1": 0.3333333432674408,
"num_tokens": 3600150.0,
"repeat_count": 0.0,
- "routers_loss": 0.006253884173929691,
+ "routers_loss": 0.006449893582612276,
"skip_count": 0.0,
"step": 2232,
"text_loss": 0.20256924629211426
@@ -21221,13 +21221,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.046142578125,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.0009297790167228161,
- "loss": 0.0119,
+ "loss": 0.012,
"macro_f1": 0.6666666865348816,
"num_tokens": 3602988.0,
"repeat_count": 0.0,
- "routers_loss": 0.007228068076074123,
+ "routers_loss": 0.007872486487030983,
"skip_count": 2.0,
"step": 2234,
"text_loss": 0.42476826906204224
@@ -21240,13 +21240,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0009296207596394022,
- "loss": 0.0103,
+ "loss": 0.0101,
"macro_f1": 0.32098764181137085,
"num_tokens": 3606071.0,
"repeat_count": 0.0,
- "routers_loss": 0.02524643763899803,
+ "routers_loss": 0.027397040277719498,
"skip_count": 2.0,
"step": 2236,
"text_loss": 0.23432791233062744
@@ -21259,13 +21259,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.0009294623379245028,
- "loss": 0.0119,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 3609389.0,
"repeat_count": 0.0,
- "routers_loss": 0.009672109968960285,
+ "routers_loss": 0.01042645052075386,
"skip_count": 0.0,
"step": 2238,
"text_loss": 0.16665785014629364
@@ -21278,13 +21278,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0009293037516388252,
- "loss": 0.0155,
+ "loss": 0.0161,
"macro_f1": 0.3333333432674408,
"num_tokens": 3612105.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010066524846479297,
+ "routers_loss": 0.0012458425480872393,
"skip_count": 0.0,
"step": 2240,
"text_loss": 0.59421306848526
@@ -21297,13 +21297,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0009291450008431404,
- "loss": 0.0184,
+ "loss": 0.0185,
"macro_f1": 1.0,
"num_tokens": 3615439.0,
"repeat_count": 1.0,
- "routers_loss": 0.005509128328412771,
+ "routers_loss": 0.005781981628388166,
"skip_count": 1.0,
"step": 2242,
"text_loss": 0.510798454284668
@@ -21316,13 +21316,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.09423828125,
+ "grad_norm": 0.0966796875,
"learning_rate": 0.0009289860855982814,
- "loss": 0.0172,
+ "loss": 0.0166,
"macro_f1": 0.4871794879436493,
"num_tokens": 3618842.0,
"repeat_count": 0.0,
- "routers_loss": 0.030802007764577866,
+ "routers_loss": 0.031195320188999176,
"skip_count": 3.0,
"step": 2244,
"text_loss": 0.7574363350868225
@@ -21335,13 +21335,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.0009288270059651454,
"loss": 0.0133,
"macro_f1": 0.3333333432674408,
"num_tokens": 3621823.0,
"repeat_count": 0.0,
- "routers_loss": 0.001686889911070466,
+ "routers_loss": 0.001746491645462811,
"skip_count": 0.0,
"step": 2246,
"text_loss": 0.5125683546066284
@@ -21354,13 +21354,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.220703125,
"learning_rate": 0.0009286677620046918,
- "loss": 0.0163,
+ "loss": 0.0159,
"macro_f1": 0.5492662787437439,
"num_tokens": 3624502.0,
"repeat_count": 0.0,
- "routers_loss": 0.03299177065491676,
+ "routers_loss": 0.03792348504066467,
"skip_count": 2.0,
"step": 2248,
"text_loss": 0.7533677220344543
@@ -21375,11 +21375,11 @@
"f1_skip": 0.0,
"grad_norm": 0.07763671875,
"learning_rate": 0.0009285083537779429,
- "loss": 0.0119,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 3627057.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010354233672842383,
+ "routers_loss": 0.0009684451506473124,
"skip_count": 0.0,
"step": 2250,
"text_loss": 0.2219279706478119
@@ -21392,13 +21392,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.11767578125,
"learning_rate": 0.0009283487813459845,
- "loss": 0.0145,
+ "loss": 0.0148,
"macro_f1": 0.5492662787437439,
"num_tokens": 3629720.0,
"repeat_count": 0.0,
- "routers_loss": 0.02196674607694149,
+ "routers_loss": 0.022757573053240776,
"skip_count": 2.0,
"step": 2252,
"text_loss": 0.6903313994407654
@@ -21411,13 +21411,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1376953125,
"learning_rate": 0.0009281890447699652,
"loss": 0.015,
"macro_f1": 0.6666666865348816,
"num_tokens": 3633234.0,
"repeat_count": 1.0,
- "routers_loss": 0.002239946974441409,
+ "routers_loss": 0.003613058477640152,
"skip_count": 0.0,
"step": 2254,
"text_loss": 0.6278893351554871
@@ -21430,13 +21430,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.046142578125,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0009280291441110961,
- "loss": 0.0117,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 3636289.0,
"repeat_count": 0.0,
- "routers_loss": 0.0063575254753232,
+ "routers_loss": 0.006214062683284283,
"skip_count": 0.0,
"step": 2256,
"text_loss": 0.3011114001274109
@@ -21449,13 +21449,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.040283203125,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0009278690794306517,
- "loss": 0.0143,
+ "loss": 0.014,
"macro_f1": 0.5492662787437439,
"num_tokens": 3640251.0,
"repeat_count": 0.0,
- "routers_loss": 0.0524379126727581,
+ "routers_loss": 0.052556321024894714,
"skip_count": 2.0,
"step": 2258,
"text_loss": 0.19894185662269592
@@ -21468,13 +21468,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.8571428656578064,
"f1_skip": 1.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.08251953125,
"learning_rate": 0.0009277088507899689,
- "loss": 0.0156,
+ "loss": 0.0163,
"macro_f1": 0.9452888369560242,
"num_tokens": 3643527.0,
"repeat_count": 4.0,
- "routers_loss": 0.052486274391412735,
+ "routers_loss": 0.0572301521897316,
"skip_count": 1.0,
"step": 2260,
"text_loss": 0.5593410134315491
@@ -21487,13 +21487,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041748046875,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.0009275484582504475,
"loss": 0.0104,
"macro_f1": 0.3333333432674408,
"num_tokens": 3646959.0,
"repeat_count": 0.0,
- "routers_loss": 0.006877690553665161,
+ "routers_loss": 0.008010074496269226,
"skip_count": 0.0,
"step": 2262,
"text_loss": 0.2128177285194397
@@ -21506,13 +21506,13 @@
"f1_execute": 0.95652174949646,
"f1_repeat": 0.800000011920929,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.05322265625,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0009273879018735505,
- "loss": 0.0136,
+ "loss": 0.0138,
"macro_f1": 0.8521739840507507,
"num_tokens": 3651298.0,
"repeat_count": 3.0,
- "routers_loss": 0.03128742054104805,
+ "routers_loss": 0.035729870200157166,
"skip_count": 3.0,
"step": 2264,
"text_loss": 0.2987811267375946
@@ -21525,13 +21525,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009272271817208031,
- "loss": 0.0188,
+ "loss": 0.0182,
"macro_f1": 0.3333333432674408,
"num_tokens": 3655609.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028425443451851606,
+ "routers_loss": 0.002379779238253832,
"skip_count": 0.0,
"step": 2266,
"text_loss": 0.6024088263511658
@@ -21544,13 +21544,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06689453125,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009270662978537939,
- "loss": 0.0101,
+ "loss": 0.0098,
"macro_f1": 0.3333333432674408,
"num_tokens": 3658444.0,
"repeat_count": 0.0,
- "routers_loss": 0.009712206199765205,
+ "routers_loss": 0.008943650871515274,
"skip_count": 0.0,
"step": 2268,
"text_loss": 0.1741207242012024
@@ -21563,13 +21563,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0009269052503341736,
- "loss": 0.0162,
+ "loss": 0.0161,
"macro_f1": 0.6595745086669922,
"num_tokens": 3662282.0,
"repeat_count": 1.0,
- "routers_loss": 0.03980376198887825,
+ "routers_loss": 0.030201267451047897,
"skip_count": 4.0,
"step": 2270,
"text_loss": 0.7300035953521729
@@ -21582,13 +21582,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0009267440392236562,
- "loss": 0.0098,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 3665531.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030603872146457434,
+ "routers_loss": 0.0026635683607310057,
"skip_count": 0.0,
"step": 2272,
"text_loss": 0.31535038352012634
@@ -21601,13 +21601,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0009265826645840178,
"loss": 0.0151,
"macro_f1": 0.3333333432674408,
"num_tokens": 3668407.0,
"repeat_count": 0.0,
- "routers_loss": 0.004795679822564125,
+ "routers_loss": 0.004258926957845688,
"skip_count": 0.0,
"step": 2274,
"text_loss": 0.7272579073905945
@@ -21620,13 +21620,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.1435546875,
+ "grad_norm": 0.125,
"learning_rate": 0.0009264211264770976,
- "loss": 0.0155,
+ "loss": 0.0154,
"macro_f1": 0.6122449040412903,
"num_tokens": 3671503.0,
"repeat_count": 0.0,
- "routers_loss": 0.0340447798371315,
+ "routers_loss": 0.038987524807453156,
"skip_count": 4.0,
"step": 2276,
"text_loss": 0.7488982677459717
@@ -21639,13 +21639,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0009262594249647975,
- "loss": 0.016,
+ "loss": 0.0164,
"macro_f1": 0.6666666865348816,
"num_tokens": 3674107.0,
"repeat_count": 0.0,
- "routers_loss": 0.007436402142047882,
+ "routers_loss": 0.007211760152131319,
"skip_count": 1.0,
"step": 2278,
"text_loss": 0.1992369294166565
@@ -21658,13 +21658,13 @@
"f1_execute": 0.9767441749572754,
"f1_repeat": 0.8571428656578064,
"f1_skip": 1.0,
- "grad_norm": 0.056396484375,
+ "grad_norm": 0.0546875,
"learning_rate": 0.0009260975601090815,
- "loss": 0.0113,
+ "loss": 0.0112,
"macro_f1": 0.9446290731430054,
"num_tokens": 3677184.0,
"repeat_count": 4.0,
- "routers_loss": 0.02465176396071911,
+ "routers_loss": 0.02538592554628849,
"skip_count": 3.0,
"step": 2280,
"text_loss": 0.46402135491371155
@@ -21677,13 +21677,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0009259355319719768,
- "loss": 0.0167,
+ "loss": 0.0162,
"macro_f1": 0.3333333432674408,
"num_tokens": 3680683.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037910486571490765,
+ "routers_loss": 0.0038464947137981653,
"skip_count": 0.0,
"step": 2282,
"text_loss": 0.5804527401924133
@@ -21696,13 +21696,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009257733406155726,
- "loss": 0.0161,
+ "loss": 0.0169,
"macro_f1": 0.3333333432674408,
"num_tokens": 3683928.0,
"repeat_count": 0.0,
- "routers_loss": 0.003716849023476243,
+ "routers_loss": 0.004841136280447245,
"skip_count": 0.0,
"step": 2284,
"text_loss": 0.4834538400173187
@@ -21715,13 +21715,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0009256109861020212,
- "loss": 0.0118,
+ "loss": 0.0115,
"macro_f1": 0.3333333432674408,
"num_tokens": 3687101.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021690395660698414,
+ "routers_loss": 0.002191900508478284,
"skip_count": 0.0,
"step": 2286,
"text_loss": 0.8199604749679565
@@ -21734,13 +21734,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.000925448468493537,
"loss": 0.0162,
"macro_f1": 0.5427350401878357,
"num_tokens": 3690490.0,
"repeat_count": 1.0,
- "routers_loss": 0.034040264785289764,
+ "routers_loss": 0.03488675877451897,
"skip_count": 2.0,
"step": 2288,
"text_loss": 0.33263635635375977
@@ -21753,32 +21753,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009252857878523971,
- "loss": 0.0133,
+ "loss": 0.0134,
"macro_f1": 0.6666666865348816,
"num_tokens": 3694109.0,
"repeat_count": 1.0,
- "routers_loss": 0.0027822356205433607,
+ "routers_loss": 0.002897309372201562,
"skip_count": 0.0,
"step": 2290,
"text_loss": 0.47494807839393616
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 10.760786615791018,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.0634765625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05810546875,
"learning_rate": 0.000925122944240941,
- "loss": 0.0156,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3697233.0,
"repeat_count": 0.0,
- "routers_loss": 0.020813947543501854,
+ "routers_loss": 0.01842675730586052,
"skip_count": 2.0,
"step": 2292,
"text_loss": 0.14693495631217957
@@ -21791,13 +21791,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.042236328125,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0009249599377215707,
- "loss": 0.0145,
+ "loss": 0.0146,
"macro_f1": 0.5866667032241821,
"num_tokens": 3700376.0,
"repeat_count": 1.0,
- "routers_loss": 0.038725610822439194,
+ "routers_loss": 0.04169808700680733,
"skip_count": 3.0,
"step": 2294,
"text_loss": 0.38051268458366394
@@ -21810,13 +21810,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059326171875,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0009247967683567507,
- "loss": 0.0117,
+ "loss": 0.0112,
"macro_f1": 0.3272727429866791,
"num_tokens": 3703212.0,
"repeat_count": 0.0,
- "routers_loss": 0.01360203418880701,
+ "routers_loss": 0.012183113023638725,
"skip_count": 1.0,
"step": 2296,
"text_loss": 0.23789077997207642
@@ -21829,13 +21829,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.05712890625,
"learning_rate": 0.0009246334362090077,
- "loss": 0.0135,
+ "loss": 0.0137,
"macro_f1": 0.8823530077934265,
"num_tokens": 3706490.0,
"repeat_count": 1.0,
- "routers_loss": 0.021909991279244423,
+ "routers_loss": 0.01880069635808468,
"skip_count": 2.0,
"step": 2298,
"text_loss": 0.29067978262901306
@@ -21848,13 +21848,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.08203125,
"learning_rate": 0.000924469941340931,
- "loss": 0.0175,
+ "loss": 0.0173,
"macro_f1": 0.3272727429866791,
"num_tokens": 3709804.0,
"repeat_count": 1.0,
- "routers_loss": 0.03153124824166298,
+ "routers_loss": 0.027359159663319588,
"skip_count": 0.0,
"step": 2300,
"text_loss": 0.67828369140625
@@ -21867,13 +21867,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.000924306283815172,
- "loss": 0.0154,
+ "loss": 0.0153,
"macro_f1": 0.3333333432674408,
"num_tokens": 3712824.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034419491421431303,
+ "routers_loss": 0.003152279881760478,
"skip_count": 0.0,
"step": 2302,
"text_loss": 0.8333184719085693
@@ -21886,13 +21886,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009241424636944445,
- "loss": 0.0163,
+ "loss": 0.0159,
"macro_f1": 0.5492662787437439,
"num_tokens": 3715385.0,
"repeat_count": 0.0,
- "routers_loss": 0.03655214607715607,
+ "routers_loss": 0.0442950464785099,
"skip_count": 2.0,
"step": 2304,
"text_loss": 0.41893699765205383
@@ -21905,13 +21905,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0576171875,
+ "grad_norm": 0.058837890625,
"learning_rate": 0.0009239784810415249,
- "loss": 0.014,
+ "loss": 0.0137,
"macro_f1": 0.8823530077934265,
"num_tokens": 3719080.0,
"repeat_count": 1.0,
- "routers_loss": 0.015360959805548191,
+ "routers_loss": 0.015729321166872978,
"skip_count": 2.0,
"step": 2306,
"text_loss": 0.13360483944416046
@@ -21924,13 +21924,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.0537109375,
+ "grad_norm": 0.06787109375,
"learning_rate": 0.0009238143359192514,
"loss": 0.0136,
"macro_f1": 0.5934640765190125,
"num_tokens": 3722439.0,
"repeat_count": 0.0,
- "routers_loss": 0.027275927364826202,
+ "routers_loss": 0.028816604986786842,
"skip_count": 3.0,
"step": 2308,
"text_loss": 0.39594101905822754
@@ -21943,13 +21943,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0546875,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.000923650028390525,
- "loss": 0.0163,
+ "loss": 0.0166,
"macro_f1": 0.6666666865348816,
"num_tokens": 3725092.0,
"repeat_count": 0.0,
- "routers_loss": 0.003742894157767296,
+ "routers_loss": 0.0036455015651881695,
"skip_count": 2.0,
"step": 2310,
"text_loss": 0.6169708371162415
@@ -21962,13 +21962,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0927734375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009234855585183086,
- "loss": 0.0135,
+ "loss": 0.014,
"macro_f1": 0.6666666865348816,
"num_tokens": 3728412.0,
"repeat_count": 0.0,
- "routers_loss": 0.009356650523841381,
+ "routers_loss": 0.007565604057163,
"skip_count": 1.0,
"step": 2312,
"text_loss": 0.21257059276103973
@@ -21983,11 +21983,11 @@
"f1_skip": 0.800000011920929,
"grad_norm": 0.0517578125,
"learning_rate": 0.0009233209263656273,
- "loss": 0.0189,
+ "loss": 0.0184,
"macro_f1": 0.9262410998344421,
"num_tokens": 3731467.0,
"repeat_count": 2.0,
- "routers_loss": 0.02852487564086914,
+ "routers_loss": 0.02510629966855049,
"skip_count": 3.0,
"step": 2314,
"text_loss": 0.21639840304851532
@@ -22000,13 +22000,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.057861328125,
"learning_rate": 0.0009231561319955684,
- "loss": 0.0151,
+ "loss": 0.0154,
"macro_f1": 0.3333333432674408,
"num_tokens": 3734906.0,
"repeat_count": 0.0,
- "routers_loss": 0.007533316500484943,
+ "routers_loss": 0.00872227642685175,
"skip_count": 0.0,
"step": 2316,
"text_loss": 0.35639774799346924
@@ -22019,13 +22019,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009229911754712815,
"loss": 0.0176,
"macro_f1": 0.3333333432674408,
"num_tokens": 3737943.0,
"repeat_count": 0.0,
- "routers_loss": 0.004666361026465893,
+ "routers_loss": 0.004695790819823742,
"skip_count": 0.0,
"step": 2318,
"text_loss": 0.5269573330879211
@@ -22038,32 +22038,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.0009228260568559781,
- "loss": 0.0117,
+ "loss": 0.0115,
"macro_f1": 0.3272727429866791,
"num_tokens": 3741833.0,
"repeat_count": 1.0,
- "routers_loss": 0.020992714911699295,
+ "routers_loss": 0.0217357836663723,
"skip_count": 0.0,
"step": 2320,
"text_loss": 0.5110208988189697
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 10.901673026122689,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.1416015625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1953125,
"learning_rate": 0.0009226607762129322,
- "loss": 0.0204,
- "macro_f1": 0.6603773832321167,
+ "loss": 0.0201,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 3744642.0,
"repeat_count": 1.0,
- "routers_loss": 0.047016773372888565,
+ "routers_loss": 0.05595960095524788,
"skip_count": 1.0,
"step": 2322,
"text_loss": 0.6291998624801636
@@ -22078,11 +22078,11 @@
"f1_skip": 0.0,
"grad_norm": 0.056884765625,
"learning_rate": 0.0009224953336054796,
- "loss": 0.0156,
+ "loss": 0.0161,
"macro_f1": 0.3333333432674408,
"num_tokens": 3748127.0,
"repeat_count": 0.0,
- "routers_loss": 0.006612313445657492,
+ "routers_loss": 0.0071634589694440365,
"skip_count": 0.0,
"step": 2324,
"text_loss": 0.7404762506484985
@@ -22095,13 +22095,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.000922329729097018,
- "loss": 0.0164,
+ "loss": 0.0169,
"macro_f1": 0.3333333432674408,
"num_tokens": 3751373.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012452995870262384,
+ "routers_loss": 0.0011676300782710314,
"skip_count": 0.0,
"step": 2326,
"text_loss": 0.2915459871292114
@@ -22114,13 +22114,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.055908203125,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0009221639627510075,
- "loss": 0.0128,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 3754518.0,
"repeat_count": 0.0,
- "routers_loss": 0.011379311792552471,
+ "routers_loss": 0.01039792038500309,
"skip_count": 0.0,
"step": 2328,
"text_loss": 0.22066321969032288
@@ -22133,13 +22133,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0009219980346309702,
- "loss": 0.0127,
+ "loss": 0.0128,
"macro_f1": 0.3333333432674408,
"num_tokens": 3757621.0,
"repeat_count": 0.0,
- "routers_loss": 0.002973968628793955,
+ "routers_loss": 0.0032070958986878395,
"skip_count": 0.0,
"step": 2330,
"text_loss": 0.5558560490608215
@@ -22152,13 +22152,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.076171875,
"learning_rate": 0.0009218319448004899,
- "loss": 0.012,
+ "loss": 0.0118,
"macro_f1": 0.3333333432674408,
"num_tokens": 3760885.0,
"repeat_count": 0.0,
- "routers_loss": 0.00768645154312253,
+ "routers_loss": 0.007085457909852266,
"skip_count": 0.0,
"step": 2332,
"text_loss": 0.4348253607749939
@@ -22171,13 +22171,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009216656933232129,
- "loss": 0.0167,
+ "loss": 0.016,
"macro_f1": 0.6666666865348816,
"num_tokens": 3764462.0,
"repeat_count": 0.0,
- "routers_loss": 0.006761785596609116,
+ "routers_loss": 0.005504854489117861,
"skip_count": 1.0,
"step": 2334,
"text_loss": 0.35828644037246704
@@ -22190,13 +22190,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0576171875,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0009214992802628463,
- "loss": 0.0129,
+ "loss": 0.0131,
"macro_f1": 0.3333333432674408,
"num_tokens": 3767159.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013711688807234168,
+ "routers_loss": 0.0013970810687169433,
"skip_count": 0.0,
"step": 2336,
"text_loss": 0.2956557869911194
@@ -22209,13 +22209,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.08203125,
"learning_rate": 0.0009213327056831607,
- "loss": 0.0174,
+ "loss": 0.0181,
"macro_f1": 0.3272727429866791,
"num_tokens": 3770408.0,
"repeat_count": 0.0,
- "routers_loss": 0.04009406641125679,
+ "routers_loss": 0.0427570566534996,
"skip_count": 1.0,
"step": 2338,
"text_loss": 0.14883014559745789
@@ -22228,13 +22228,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0009211659696479875,
- "loss": 0.0095,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 3773474.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013272224459797144,
+ "routers_loss": 0.0011273405980318785,
"skip_count": 0.0,
"step": 2340,
"text_loss": 0.26011669635772705
@@ -22249,11 +22249,11 @@
"f1_skip": 0.0,
"grad_norm": 0.059814453125,
"learning_rate": 0.00092099907222122,
- "loss": 0.0145,
+ "loss": 0.0148,
"macro_f1": 0.3333333432674408,
"num_tokens": 3776909.0,
"repeat_count": 0.0,
- "routers_loss": 0.001724833040498197,
+ "routers_loss": 0.0016178421210497618,
"skip_count": 0.0,
"step": 2342,
"text_loss": 0.49078530073165894
@@ -22266,13 +22266,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05908203125,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.000920832013466814,
- "loss": 0.0132,
+ "loss": 0.0129,
"macro_f1": 0.3333333432674408,
"num_tokens": 3780741.0,
"repeat_count": 0.0,
- "routers_loss": 0.005641496740281582,
+ "routers_loss": 0.005510095041245222,
"skip_count": 0.0,
"step": 2344,
"text_loss": 0.4870249927043915
@@ -22285,13 +22285,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.037109375,
"learning_rate": 0.0009206647934487866,
- "loss": 0.011,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 3784673.0,
"repeat_count": 1.0,
- "routers_loss": 0.003907595761120319,
+ "routers_loss": 0.0047357892617583275,
"skip_count": 0.0,
"step": 2346,
"text_loss": 0.3251725733280182
@@ -22304,13 +22304,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.057861328125,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0009204974122312167,
- "loss": 0.0141,
+ "loss": 0.0142,
"macro_f1": 0.6666666865348816,
"num_tokens": 3787503.0,
"repeat_count": 0.0,
- "routers_loss": 0.007570050656795502,
+ "routers_loss": 0.00795028731226921,
"skip_count": 1.0,
"step": 2348,
"text_loss": 0.18282145261764526
@@ -22323,13 +22323,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.055908203125,
+ "grad_norm": 0.060546875,
"learning_rate": 0.0009203298698782452,
- "loss": 0.0079,
+ "loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 3790528.0,
"repeat_count": 1.0,
- "routers_loss": 0.0009280897793360054,
+ "routers_loss": 0.0009506374481134117,
"skip_count": 0.0,
"step": 2350,
"text_loss": 0.4093080461025238
@@ -22342,13 +22342,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.045166015625,
+ "grad_norm": 0.047607421875,
"learning_rate": 0.0009201621664540747,
"loss": 0.0155,
"macro_f1": 0.6666666865348816,
"num_tokens": 3794134.0,
"repeat_count": 1.0,
- "routers_loss": 0.005288597662001848,
+ "routers_loss": 0.005159572698175907,
"skip_count": 0.0,
"step": 2352,
"text_loss": 0.5451981425285339
@@ -22361,13 +22361,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009199943020229694,
- "loss": 0.0146,
+ "loss": 0.0148,
"macro_f1": 0.3333333432674408,
"num_tokens": 3797414.0,
"repeat_count": 0.0,
- "routers_loss": 0.002237799344584346,
+ "routers_loss": 0.002356168581172824,
"skip_count": 0.0,
"step": 2354,
"text_loss": 0.3070453405380249
@@ -22380,13 +22380,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.0009198262766492554,
- "loss": 0.0144,
+ "loss": 0.0141,
"macro_f1": 0.6666666865348816,
"num_tokens": 3800094.0,
"repeat_count": 0.0,
- "routers_loss": 0.006226782687008381,
+ "routers_loss": 0.0051761893555521965,
"skip_count": 1.0,
"step": 2356,
"text_loss": 0.5880904197692871
@@ -22399,13 +22399,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.049072265625,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.00091965809039732,
- "loss": 0.0136,
+ "loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 3803280.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027645498048514128,
+ "routers_loss": 0.0025952060241252184,
"skip_count": 0.0,
"step": 2358,
"text_loss": 0.5210731625556946
@@ -22418,13 +22418,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.06787109375,
"learning_rate": 0.0009194897433316127,
- "loss": 0.0122,
+ "loss": 0.0125,
"macro_f1": 0.6666666865348816,
"num_tokens": 3805866.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034913592971861362,
+ "routers_loss": 0.0042560105212032795,
"skip_count": 2.0,
"step": 2360,
"text_loss": 0.6472984552383423
@@ -22437,13 +22437,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08056640625,
+ "grad_norm": 0.07568359375,
"learning_rate": 0.0009193212355166446,
- "loss": 0.0112,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3808952.0,
"repeat_count": 0.0,
- "routers_loss": 0.002706601284444332,
+ "routers_loss": 0.0026232977397739887,
"skip_count": 0.0,
"step": 2362,
"text_loss": 0.450063556432724
@@ -22456,13 +22456,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009191525670169881,
- "loss": 0.0108,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3812080.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032696903217583895,
+ "routers_loss": 0.0034355956595391035,
"skip_count": 0.0,
"step": 2364,
"text_loss": 0.49727216362953186
@@ -22475,13 +22475,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.000918983737897277,
- "loss": 0.0115,
+ "loss": 0.0112,
"macro_f1": 0.6666666865348816,
"num_tokens": 3815282.0,
"repeat_count": 0.0,
- "routers_loss": 0.006245410069823265,
+ "routers_loss": 0.0055653867311775684,
"skip_count": 1.0,
"step": 2366,
"text_loss": 0.6336377859115601
@@ -22496,11 +22496,11 @@
"f1_skip": 1.0,
"grad_norm": 0.033447265625,
"learning_rate": 0.0009188147482222071,
- "loss": 0.0079,
+ "loss": 0.008,
"macro_f1": 1.0,
"num_tokens": 3818106.0,
"repeat_count": 2.0,
- "routers_loss": 0.011230813339352608,
+ "routers_loss": 0.011016021482646465,
"skip_count": 2.0,
"step": 2368,
"text_loss": 0.22513329982757568
@@ -22515,11 +22515,11 @@
"f1_skip": 0.0,
"grad_norm": 0.04296875,
"learning_rate": 0.0009186455980565358,
- "loss": 0.0109,
+ "loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 3821228.0,
"repeat_count": 1.0,
- "routers_loss": 0.014897257089614868,
+ "routers_loss": 0.014039464294910431,
"skip_count": 0.0,
"step": 2370,
"text_loss": 0.21331638097763062
@@ -22532,13 +22532,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009184762874650816,
- "loss": 0.0131,
+ "loss": 0.0128,
"macro_f1": 0.3333333432674408,
"num_tokens": 3825048.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015503648901358247,
+ "routers_loss": 0.001088051125407219,
"skip_count": 0.0,
"step": 2372,
"text_loss": 0.6031543612480164
@@ -22551,13 +22551,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009183068165127245,
- "loss": 0.0127,
+ "loss": 0.013,
"macro_f1": 0.6666666865348816,
"num_tokens": 3828781.0,
"repeat_count": 0.0,
- "routers_loss": 0.00723480898886919,
+ "routers_loss": 0.006263940595090389,
"skip_count": 1.0,
"step": 2374,
"text_loss": 0.6249601244926453
@@ -22570,13 +22570,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.076171875,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0009181371852644062,
- "loss": 0.0139,
+ "loss": 0.0133,
"macro_f1": 0.6666666865348816,
"num_tokens": 3832507.0,
"repeat_count": 1.0,
- "routers_loss": 0.002053398173302412,
+ "routers_loss": 0.001987969037145376,
"skip_count": 0.0,
"step": 2376,
"text_loss": 0.37972065806388855
@@ -22589,32 +22589,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06689453125,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009179673937851299,
"loss": 0.0158,
"macro_f1": 0.6666666865348816,
"num_tokens": 3835644.0,
"repeat_count": 0.0,
- "routers_loss": 0.007927518337965012,
+ "routers_loss": 0.007635094691067934,
"skip_count": 1.0,
"step": 2378,
"text_loss": 0.46319663524627686
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 11.173759906075727,
"f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.06298828125,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009177974421399598,
- "loss": 0.0144,
- "macro_f1": 0.5555555820465088,
+ "loss": 0.0137,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3838700.0,
"repeat_count": 0.0,
- "routers_loss": 0.01924682781100273,
+ "routers_loss": 0.01617279462516308,
"skip_count": 2.0,
"step": 2380,
"text_loss": 0.32141056656837463
@@ -22627,13 +22627,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.046875,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0009176273303940217,
- "loss": 0.0106,
+ "loss": 0.011,
"macro_f1": 0.6666666865348816,
"num_tokens": 3841953.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021689811255782843,
+ "routers_loss": 0.0022273799404501915,
"skip_count": 2.0,
"step": 2382,
"text_loss": 0.5908139944076538
@@ -22646,13 +22646,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.062255859375,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0009174570586125026,
- "loss": 0.0119,
+ "loss": 0.0122,
"macro_f1": 0.32098767161369324,
"num_tokens": 3845763.0,
"repeat_count": 1.0,
- "routers_loss": 0.03431013971567154,
+ "routers_loss": 0.030915161594748497,
"skip_count": 0.0,
"step": 2384,
"text_loss": 0.41400137543678284
@@ -22665,13 +22665,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.0009172866268606513,
- "loss": 0.0123,
+ "loss": 0.0122,
"macro_f1": 0.6666666865348816,
"num_tokens": 3848984.0,
"repeat_count": 0.0,
- "routers_loss": 0.008275258354842663,
+ "routers_loss": 0.010480951517820358,
"skip_count": 2.0,
"step": 2386,
"text_loss": 0.2560874819755554
@@ -22684,13 +22684,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04736328125,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0009171160352037775,
- "loss": 0.0121,
+ "loss": 0.0124,
"macro_f1": 0.6666666865348816,
"num_tokens": 3852118.0,
"repeat_count": 0.0,
- "routers_loss": 0.007780806161463261,
+ "routers_loss": 0.00809961836785078,
"skip_count": 1.0,
"step": 2388,
"text_loss": 0.28236693143844604
@@ -22709,7 +22709,7 @@
"macro_f1": 1.0,
"num_tokens": 3855314.0,
"repeat_count": 1.0,
- "routers_loss": 0.00553786288946867,
+ "routers_loss": 0.005569872446358204,
"skip_count": 1.0,
"step": 2390,
"text_loss": 0.4578137695789337
@@ -22722,13 +22722,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08447265625,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009167743724365073,
- "loss": 0.01,
+ "loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 3858301.0,
"repeat_count": 0.0,
- "routers_loss": 0.004066115710884333,
+ "routers_loss": 0.0038610948249697685,
"skip_count": 1.0,
"step": 2392,
"text_loss": 0.14082716405391693
@@ -22741,13 +22741,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0810546875,
+ "grad_norm": 0.1376953125,
"learning_rate": 0.0009166033014570368,
- "loss": 0.0104,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3861296.0,
"repeat_count": 0.0,
- "routers_loss": 0.002403446938842535,
+ "routers_loss": 0.0017607157351449132,
"skip_count": 0.0,
"step": 2394,
"text_loss": 0.384442001581192
@@ -22760,13 +22760,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.054443359375,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009164320708343954,
- "loss": 0.0137,
+ "loss": 0.0131,
"macro_f1": 0.6666666865348816,
"num_tokens": 3863985.0,
"repeat_count": 2.0,
- "routers_loss": 0.010212135501205921,
+ "routers_loss": 0.009627950377762318,
"skip_count": 0.0,
"step": 2396,
"text_loss": 0.6969521045684814
@@ -22779,13 +22779,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07275390625,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009162606806341989,
"loss": 0.0107,
"macro_f1": 0.3333333432674408,
"num_tokens": 3866636.0,
"repeat_count": 0.0,
- "routers_loss": 0.007781816180795431,
+ "routers_loss": 0.006915586534887552,
"skip_count": 0.0,
"step": 2398,
"text_loss": 0.48069697618484497
@@ -22798,32 +22798,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.0009160891309221242,
- "loss": 0.0151,
+ "loss": 0.0149,
"macro_f1": 0.6666666865348816,
"num_tokens": 3870867.0,
"repeat_count": 1.0,
- "routers_loss": 0.0016227158484980464,
+ "routers_loss": 0.0013031222624704242,
"skip_count": 0.0,
"step": 2400,
"text_loss": 0.3882075846195221
},
{
"acc_repeat": 0.5,
- "acc_skip": 1.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
"epoch": 11.277076606985618,
- "f1_execute": 0.9803921580314636,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.6666666865348816,
- "f1_skip": 1.0,
- "grad_norm": 0.06298828125,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009159174217639096,
- "loss": 0.0114,
- "macro_f1": 0.8823530077934265,
+ "loss": 0.0112,
+ "macro_f1": 0.5427350401878357,
"num_tokens": 3873663.0,
"repeat_count": 2.0,
- "routers_loss": 0.06490851938724518,
+ "routers_loss": 0.06621067970991135,
"skip_count": 1.0,
"step": 2402,
"text_loss": 0.5740041136741638
@@ -22836,13 +22836,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0009157455532253547,
- "loss": 0.0075,
+ "loss": 0.0071,
"macro_f1": 0.6666666865348816,
"num_tokens": 3876788.0,
"repeat_count": 1.0,
- "routers_loss": 0.007105287164449692,
+ "routers_loss": 0.005957918707281351,
"skip_count": 0.0,
"step": 2404,
"text_loss": 0.26025933027267456
@@ -22855,13 +22855,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.06787109375,
+ "grad_norm": 0.08642578125,
"learning_rate": 0.0009155735253723191,
- "loss": 0.0125,
+ "loss": 0.0126,
"macro_f1": 0.9452888369560242,
"num_tokens": 3879942.0,
"repeat_count": 1.0,
- "routers_loss": 0.03736003860831261,
+ "routers_loss": 0.039429809898138046,
"skip_count": 4.0,
"step": 2406,
"text_loss": 1.1349908113479614
@@ -22874,13 +22874,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.047607421875,
"learning_rate": 0.0009154013382707251,
- "loss": 0.011,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 3882682.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012925176415592432,
+ "routers_loss": 0.0012570557883009315,
"skip_count": 0.0,
"step": 2408,
"text_loss": 0.5611135363578796
@@ -22895,11 +22895,11 @@
"f1_skip": 0.0,
"grad_norm": 0.034423828125,
"learning_rate": 0.0009152289919865543,
- "loss": 0.0124,
+ "loss": 0.0123,
"macro_f1": 0.3333333432674408,
"num_tokens": 3886425.0,
"repeat_count": 0.0,
- "routers_loss": 0.001746711554005742,
+ "routers_loss": 0.0017455556662753224,
"skip_count": 0.0,
"step": 2410,
"text_loss": 0.7523751854896545
@@ -22912,13 +22912,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0009150564865858506,
- "loss": 0.0112,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 3889273.0,
"repeat_count": 0.0,
- "routers_loss": 0.011005193926393986,
+ "routers_loss": 0.011178011074662209,
"skip_count": 1.0,
"step": 2412,
"text_loss": 0.26942551136016846
@@ -22931,13 +22931,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.800000011920929,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009148838221347182,
- "loss": 0.0102,
+ "loss": 0.0107,
"macro_f1": 0.5934640765190125,
"num_tokens": 3892199.0,
"repeat_count": 3.0,
- "routers_loss": 0.017795369029045105,
+ "routers_loss": 0.019628092646598816,
"skip_count": 0.0,
"step": 2414,
"text_loss": 0.5492315888404846
@@ -22950,13 +22950,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.0009147109986993225,
"loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 3895362.0,
"repeat_count": 1.0,
- "routers_loss": 0.011693861335515976,
+ "routers_loss": 0.012255983427166939,
"skip_count": 0.0,
"step": 2416,
"text_loss": 0.23798216879367828
@@ -22969,13 +22969,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009145380163458899,
- "loss": 0.0177,
+ "loss": 0.0178,
"macro_f1": 0.3333333432674408,
"num_tokens": 3898476.0,
"repeat_count": 0.0,
- "routers_loss": 0.007135285064578056,
+ "routers_loss": 0.007018954027444124,
"skip_count": 0.0,
"step": 2418,
"text_loss": 0.1923145055770874
@@ -22988,13 +22988,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03515625,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0009143648751407074,
- "loss": 0.0082,
+ "loss": 0.0081,
"macro_f1": 0.3333333432674408,
"num_tokens": 3901817.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008607010240666568,
+ "routers_loss": 0.0008574824314564466,
"skip_count": 0.0,
"step": 2420,
"text_loss": 0.4001806974411011
@@ -23007,13 +23007,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.11328125,
"learning_rate": 0.0009141915751501231,
- "loss": 0.0101,
+ "loss": 0.0102,
"macro_f1": 0.5492662787437439,
"num_tokens": 3905461.0,
"repeat_count": 0.0,
- "routers_loss": 0.015359465964138508,
+ "routers_loss": 0.01572350226342678,
"skip_count": 2.0,
"step": 2422,
"text_loss": 0.19519129395484924
@@ -23026,13 +23026,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0380859375,
+ "grad_norm": 0.037353515625,
"learning_rate": 0.0009140181164405458,
- "loss": 0.011,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3908878.0,
"repeat_count": 0.0,
- "routers_loss": 0.00047823251225054264,
+ "routers_loss": 0.0005503420252352953,
"skip_count": 0.0,
"step": 2424,
"text_loss": 0.6937088370323181
@@ -23047,11 +23047,11 @@
"f1_skip": 0.0,
"grad_norm": 0.068359375,
"learning_rate": 0.0009138444990784454,
- "loss": 0.0129,
+ "loss": 0.013,
"macro_f1": 0.3333333432674408,
"num_tokens": 3912053.0,
"repeat_count": 0.0,
- "routers_loss": 0.0070601715706288815,
+ "routers_loss": 0.007556677330285311,
"skip_count": 0.0,
"step": 2426,
"text_loss": 0.35431069135665894
@@ -23064,13 +23064,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.06201171875,
"learning_rate": 0.000913670723130352,
- "loss": 0.0123,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 3915192.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010537977796047926,
+ "routers_loss": 0.0013609991874545813,
"skip_count": 0.0,
"step": 2428,
"text_loss": 0.5171207189559937
@@ -23083,13 +23083,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0517578125,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0009134967886628573,
- "loss": 0.0117,
+ "loss": 0.0115,
"macro_f1": 1.0,
"num_tokens": 3917927.0,
"repeat_count": 2.0,
- "routers_loss": 0.012852456420660019,
+ "routers_loss": 0.010895746760070324,
"skip_count": 2.0,
"step": 2430,
"text_loss": 0.2852934002876282
@@ -23102,13 +23102,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009133226957426133,
- "loss": 0.0134,
+ "loss": 0.0132,
"macro_f1": 0.5492662787437439,
"num_tokens": 3921460.0,
"repeat_count": 2.0,
- "routers_loss": 0.05307198315858841,
+ "routers_loss": 0.04196908697485924,
"skip_count": 0.0,
"step": 2432,
"text_loss": 0.4864770770072937
@@ -23121,13 +23121,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009131484444363324,
- "loss": 0.0154,
+ "loss": 0.0155,
"macro_f1": 0.3333333432674408,
"num_tokens": 3924662.0,
"repeat_count": 0.0,
- "routers_loss": 0.004656757228076458,
+ "routers_loss": 0.004484197124838829,
"skip_count": 0.0,
"step": 2434,
"text_loss": 0.7568684220314026
@@ -23140,13 +23140,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.05078125,
"learning_rate": 0.0009129740348107882,
- "loss": 0.0113,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 3927337.0,
"repeat_count": 0.0,
- "routers_loss": 0.0042406003922224045,
+ "routers_loss": 0.004351360257714987,
"skip_count": 2.0,
"step": 2436,
"text_loss": 0.5953161716461182
@@ -23159,13 +23159,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.0517578125,
+ "grad_norm": 0.04736328125,
"learning_rate": 0.0009127994669328151,
- "loss": 0.0089,
+ "loss": 0.0085,
"macro_f1": 0.6122449040412903,
"num_tokens": 3930407.0,
"repeat_count": 0.0,
- "routers_loss": 0.018079286441206932,
+ "routers_loss": 0.01664198748767376,
"skip_count": 4.0,
"step": 2438,
"text_loss": 0.5320524573326111
@@ -23178,13 +23178,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.0009126247408693071,
- "loss": 0.0072,
+ "loss": 0.0071,
"macro_f1": 0.6666666865348816,
"num_tokens": 3933184.0,
"repeat_count": 0.0,
- "routers_loss": 0.002266801195219159,
+ "routers_loss": 0.0017819046042859554,
"skip_count": 1.0,
"step": 2440,
"text_loss": 0.6051273345947266
@@ -23197,13 +23197,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009124498566872204,
- "loss": 0.01,
+ "loss": 0.0105,
"macro_f1": 0.3333333432674408,
"num_tokens": 3936620.0,
"repeat_count": 0.0,
- "routers_loss": 0.005790423136204481,
+ "routers_loss": 0.005519696045666933,
"skip_count": 0.0,
"step": 2442,
"text_loss": 0.12987950444221497
@@ -23216,13 +23216,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052734375,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0009122748144535704,
- "loss": 0.011,
+ "loss": 0.0111,
"macro_f1": 0.32098764181137085,
"num_tokens": 3940010.0,
"repeat_count": 0.0,
- "routers_loss": 0.04591076448559761,
+ "routers_loss": 0.04543351009488106,
"skip_count": 2.0,
"step": 2444,
"text_loss": 0.4642033576965332
@@ -23235,13 +23235,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.04296875,
"learning_rate": 0.0009120996142354338,
- "loss": 0.0122,
+ "loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 3943135.0,
"repeat_count": 0.0,
- "routers_loss": 0.004969341680407524,
+ "routers_loss": 0.00550565542653203,
"skip_count": 0.0,
"step": 2446,
"text_loss": 0.5697627067565918
@@ -23254,13 +23254,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05615234375,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.0009119242560999477,
"loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 3946650.0,
"repeat_count": 0.0,
- "routers_loss": 0.00830315612256527,
+ "routers_loss": 0.008842485956847668,
"skip_count": 0.0,
"step": 2448,
"text_loss": 0.17046524584293365
@@ -23273,13 +23273,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.08154296875,
"learning_rate": 0.0009117487401143095,
"loss": 0.0154,
"macro_f1": 0.6666666865348816,
"num_tokens": 3949470.0,
"repeat_count": 1.0,
- "routers_loss": 0.0059144929982721806,
+ "routers_loss": 0.005900127813220024,
"skip_count": 0.0,
"step": 2450,
"text_loss": 0.37260866165161133
@@ -23292,13 +23292,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0009115730663457773,
- "loss": 0.0132,
+ "loss": 0.0137,
"macro_f1": 1.0,
"num_tokens": 3952546.0,
"repeat_count": 1.0,
- "routers_loss": 0.0029762545600533485,
+ "routers_loss": 0.003409258322790265,
"skip_count": 1.0,
"step": 2452,
"text_loss": 0.5308008193969727
@@ -23311,13 +23311,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.052001953125,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0009113972348616698,
- "loss": 0.0091,
+ "loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 3955817.0,
"repeat_count": 0.0,
- "routers_loss": 0.011962058953940868,
+ "routers_loss": 0.010098597034811974,
"skip_count": 1.0,
"step": 2454,
"text_loss": 0.39226648211479187
@@ -23330,13 +23330,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009112212457293658,
- "loss": 0.0101,
+ "loss": 0.0102,
"macro_f1": 0.3272727429866791,
"num_tokens": 3958911.0,
"repeat_count": 0.0,
- "routers_loss": 0.07289884239435196,
+ "routers_loss": 0.08184818178415298,
"skip_count": 0.0,
"step": 2456,
"text_loss": 0.45411455631256104
@@ -23349,13 +23349,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.041259765625,
"learning_rate": 0.0009110450990163047,
- "loss": 0.0124,
+ "loss": 0.0127,
"macro_f1": 0.3333333432674408,
"num_tokens": 3962584.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009638209594413638,
+ "routers_loss": 0.0009352223132736981,
"skip_count": 0.0,
"step": 2458,
"text_loss": 0.47292324900627136
@@ -23368,13 +23368,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0400390625,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0009108687947899863,
- "loss": 0.0078,
+ "loss": 0.0077,
"macro_f1": 1.0,
"num_tokens": 3965597.0,
"repeat_count": 1.0,
- "routers_loss": 0.008587516844272614,
+ "routers_loss": 0.008150188252329826,
"skip_count": 2.0,
"step": 2460,
"text_loss": 0.33208340406417847
@@ -23387,13 +23387,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0009106923331179707,
- "loss": 0.0126,
+ "loss": 0.0125,
"macro_f1": 0.5492662787437439,
"num_tokens": 3968664.0,
"repeat_count": 0.0,
- "routers_loss": 0.05080332234501839,
+ "routers_loss": 0.050999004393815994,
"skip_count": 2.0,
"step": 2462,
"text_loss": 0.2459995150566101
@@ -23406,13 +23406,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07080078125,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009105157140678782,
- "loss": 0.0124,
+ "loss": 0.0126,
"macro_f1": 0.6666666865348816,
"num_tokens": 3971772.0,
"repeat_count": 0.0,
- "routers_loss": 0.007348654326051474,
+ "routers_loss": 0.006196586415171623,
"skip_count": 1.0,
"step": 2464,
"text_loss": 0.23956991732120514
@@ -23425,13 +23425,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06787109375,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009103389377073896,
- "loss": 0.0099,
+ "loss": 0.01,
"macro_f1": 0.3333333432674408,
"num_tokens": 3976224.0,
"repeat_count": 0.0,
- "routers_loss": 0.007161752786487341,
+ "routers_loss": 0.008181816898286343,
"skip_count": 0.0,
"step": 2466,
"text_loss": 0.3235875070095062
@@ -23444,13 +23444,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.057373046875,
"learning_rate": 0.0009101620041042462,
- "loss": 0.0119,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 3978876.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015090530505403876,
+ "routers_loss": 0.0015451472718268633,
"skip_count": 0.0,
"step": 2468,
"text_loss": 0.4038759469985962
@@ -23463,13 +23463,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07275390625,
+ "grad_norm": 0.09130859375,
"learning_rate": 0.000909984913326249,
- "loss": 0.0129,
+ "loss": 0.0131,
"macro_f1": 0.3272727429866791,
"num_tokens": 3981992.0,
"repeat_count": 0.0,
- "routers_loss": 0.021420184522867203,
+ "routers_loss": 0.021785033866763115,
"skip_count": 1.0,
"step": 2470,
"text_loss": 0.6346460580825806
@@ -23482,13 +23482,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0009098076654412595,
- "loss": 0.0092,
+ "loss": 0.0094,
"macro_f1": 0.3333333432674408,
"num_tokens": 3984560.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010742908343672752,
+ "routers_loss": 0.0011462471447885036,
"skip_count": 0.0,
"step": 2472,
"text_loss": 0.3449646532535553
@@ -23501,13 +23501,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05078125,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.0009096302605171996,
- "loss": 0.011,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 3987548.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015209210105240345,
+ "routers_loss": 0.0014367027906700969,
"skip_count": 0.0,
"step": 2474,
"text_loss": 0.5918350219726562
@@ -23520,13 +23520,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.0478515625,
"learning_rate": 0.0009094526986220513,
"loss": 0.0124,
"macro_f1": 0.3333333432674408,
"num_tokens": 3990727.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008761848439462483,
+ "routers_loss": 0.0008977655088528991,
"skip_count": 0.0,
"step": 2476,
"text_loss": 0.463350385427475
@@ -23539,13 +23539,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0009092749798238563,
- "loss": 0.0146,
+ "loss": 0.015,
"macro_f1": 0.3272727429866791,
"num_tokens": 3993757.0,
"repeat_count": 1.0,
- "routers_loss": 0.01623794063925743,
+ "routers_loss": 0.016712551936507225,
"skip_count": 0.0,
"step": 2478,
"text_loss": 0.5621229410171509
@@ -23558,13 +23558,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07080078125,
+ "grad_norm": 0.06640625,
"learning_rate": 0.000909097104190717,
- "loss": 0.0174,
+ "loss": 0.0172,
"macro_f1": 0.32098764181137085,
"num_tokens": 3997259.0,
"repeat_count": 0.0,
- "routers_loss": 0.04170118644833565,
+ "routers_loss": 0.04134179651737213,
"skip_count": 2.0,
"step": 2480,
"text_loss": 0.375476598739624
@@ -23577,32 +23577,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.046875,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0009089190717907956,
- "loss": 0.0116,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 4000563.0,
"repeat_count": 0.0,
- "routers_loss": 0.003591755870729685,
+ "routers_loss": 0.003462378401309252,
"skip_count": 0.0,
"step": 2482,
"text_loss": 0.5553798675537109
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 11.66216612855885,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.0693359375,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0009087408826923146,
- "loss": 0.0185,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0182,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 4004065.0,
"repeat_count": 0.0,
- "routers_loss": 0.009214848279953003,
+ "routers_loss": 0.008057428523898125,
"skip_count": 2.0,
"step": 2484,
"text_loss": 0.4329465329647064
@@ -23615,13 +23615,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0009085625369635564,
- "loss": 0.0111,
+ "loss": 0.0114,
"macro_f1": 0.3333333432674408,
"num_tokens": 4007119.0,
"repeat_count": 0.0,
- "routers_loss": 0.0059350160881876945,
+ "routers_loss": 0.005759050603955984,
"skip_count": 0.0,
"step": 2486,
"text_loss": 0.501268744468689
@@ -23634,13 +23634,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.0009083840346728631,
- "loss": 0.0118,
+ "loss": 0.0122,
"macro_f1": 0.3272727429866791,
"num_tokens": 4010547.0,
"repeat_count": 1.0,
- "routers_loss": 0.019803427159786224,
+ "routers_loss": 0.020763102918863297,
"skip_count": 0.0,
"step": 2488,
"text_loss": 0.480196475982666
@@ -23653,13 +23653,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.058349609375,
+ "grad_norm": 0.05078125,
"learning_rate": 0.0009082053758886374,
- "loss": 0.0118,
+ "loss": 0.0117,
"macro_f1": 0.6666666865348816,
"num_tokens": 4014600.0,
"repeat_count": 0.0,
- "routers_loss": 0.006243673153221607,
+ "routers_loss": 0.005801836494356394,
"skip_count": 1.0,
"step": 2490,
"text_loss": 0.18249782919883728
@@ -23672,13 +23672,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009080265606793416,
- "loss": 0.0132,
+ "loss": 0.0128,
"macro_f1": 1.0,
"num_tokens": 4017964.0,
"repeat_count": 1.0,
- "routers_loss": 0.003960726782679558,
+ "routers_loss": 0.004226063843816519,
"skip_count": 1.0,
"step": 2492,
"text_loss": 0.6573076248168945
@@ -23691,13 +23691,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0537109375,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.000907847589113498,
- "loss": 0.0127,
+ "loss": 0.0125,
"macro_f1": 0.6666666865348816,
"num_tokens": 4020694.0,
"repeat_count": 0.0,
- "routers_loss": 0.004959117621183395,
+ "routers_loss": 0.004281101748347282,
"skip_count": 2.0,
"step": 2494,
"text_loss": 0.3944586217403412
@@ -23710,13 +23710,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.000907668461259689,
- "loss": 0.0157,
+ "loss": 0.0152,
"macro_f1": 0.6666666865348816,
"num_tokens": 4023757.0,
"repeat_count": 0.0,
- "routers_loss": 0.009721433743834496,
+ "routers_loss": 0.008786370046436787,
"skip_count": 1.0,
"step": 2496,
"text_loss": 0.6452898979187012
@@ -23729,13 +23729,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009074891771865566,
- "loss": 0.0124,
+ "loss": 0.0125,
"macro_f1": 0.3333333432674408,
"num_tokens": 4026601.0,
"repeat_count": 0.0,
- "routers_loss": 0.00491701066493988,
+ "routers_loss": 0.005209595896303654,
"skip_count": 0.0,
"step": 2498,
"text_loss": 0.9633619785308838
@@ -23748,13 +23748,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.03759765625,
"learning_rate": 0.0009073097369628028,
- "loss": 0.0131,
+ "loss": 0.013,
"macro_f1": 1.0,
"num_tokens": 4030321.0,
"repeat_count": 3.0,
- "routers_loss": 0.009832080453634262,
+ "routers_loss": 0.00860709697008133,
"skip_count": 1.0,
"step": 2500,
"text_loss": 0.48566827178001404
@@ -23767,13 +23767,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047607421875,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0009071301406571893,
- "loss": 0.0137,
+ "loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 4033234.0,
"repeat_count": 0.0,
- "routers_loss": 0.003301833290606737,
+ "routers_loss": 0.0035277456045150757,
"skip_count": 0.0,
"step": 2502,
"text_loss": 0.3771554231643677
@@ -23786,13 +23786,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.000906950388338538,
- "loss": 0.0134,
+ "loss": 0.0136,
"macro_f1": 0.3333333432674408,
"num_tokens": 4036417.0,
"repeat_count": 0.0,
- "routers_loss": 0.001580960932187736,
+ "routers_loss": 0.0013424850767478347,
"skip_count": 0.0,
"step": 2504,
"text_loss": 0.8962806463241577
@@ -23805,13 +23805,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009067704800757301,
- "loss": 0.0091,
+ "loss": 0.0095,
"macro_f1": 0.3333333432674408,
"num_tokens": 4039564.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011505817528814077,
+ "routers_loss": 0.0010423909407109022,
"skip_count": 0.0,
"step": 2506,
"text_loss": 0.43170279264450073
@@ -23824,13 +23824,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.000906590415937707,
- "loss": 0.0095,
+ "loss": 0.0094,
"macro_f1": 0.3272727429866791,
"num_tokens": 4043212.0,
"repeat_count": 0.0,
- "routers_loss": 0.023224346339702606,
+ "routers_loss": 0.021780289709568024,
"skip_count": 1.0,
"step": 2508,
"text_loss": 0.41495826840400696
@@ -23843,13 +23843,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0009064101959934696,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 4046687.0,
"repeat_count": 0.0,
- "routers_loss": 0.007955167442560196,
+ "routers_loss": 0.007261929102241993,
"skip_count": 1.0,
"step": 2510,
"text_loss": 0.21821187436580658
@@ -23862,13 +23862,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.057861328125,
"learning_rate": 0.0009062298203120783,
- "loss": 0.0101,
+ "loss": 0.0102,
"macro_f1": 0.6666666865348816,
"num_tokens": 4050735.0,
"repeat_count": 0.0,
- "routers_loss": 0.006164440419524908,
+ "routers_loss": 0.007447180338203907,
"skip_count": 2.0,
"step": 2512,
"text_loss": 0.1818767935037613
@@ -23881,13 +23881,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.058837890625,
+ "grad_norm": 0.06494140625,
"learning_rate": 0.0009060492889626535,
- "loss": 0.014,
+ "loss": 0.0142,
"macro_f1": 0.3272727429866791,
"num_tokens": 4054426.0,
"repeat_count": 1.0,
- "routers_loss": 0.0713663101196289,
+ "routers_loss": 0.0718490406870842,
"skip_count": 0.0,
"step": 2514,
"text_loss": 0.22798970341682434
@@ -23900,13 +23900,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0009058686020143753,
- "loss": 0.0182,
+ "loss": 0.0183,
"macro_f1": 0.3333333432674408,
"num_tokens": 4057615.0,
"repeat_count": 0.0,
- "routers_loss": 0.0052308146841824055,
+ "routers_loss": 0.0052676633931696415,
"skip_count": 0.0,
"step": 2516,
"text_loss": 0.1712338626384735
@@ -23919,13 +23919,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04052734375,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0009056877595364832,
- "loss": 0.0143,
+ "loss": 0.0137,
"macro_f1": 0.3333333432674408,
"num_tokens": 4060338.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020465939305722713,
+ "routers_loss": 0.0018052728846669197,
"skip_count": 0.0,
"step": 2518,
"text_loss": 0.6811438798904419
@@ -23938,13 +23938,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.083984375,
"learning_rate": 0.0009055067615982761,
- "loss": 0.0114,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 4062887.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008663221378810704,
+ "routers_loss": 0.0009029926732182503,
"skip_count": 0.0,
"step": 2520,
"text_loss": 0.5480356812477112
@@ -23957,13 +23957,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009053256082691133,
- "loss": 0.0104,
+ "loss": 0.0106,
"macro_f1": 0.3333333432674408,
"num_tokens": 4065357.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026889131404459476,
+ "routers_loss": 0.0027515271212905645,
"skip_count": 0.0,
"step": 2522,
"text_loss": 0.5234101414680481
@@ -23978,11 +23978,11 @@
"f1_skip": 0.0,
"grad_norm": 0.08203125,
"learning_rate": 0.0009051442996184127,
- "loss": 0.0181,
+ "loss": 0.0174,
"macro_f1": 0.3333333432674408,
"num_tokens": 4068111.0,
"repeat_count": 0.0,
- "routers_loss": 0.002255887258797884,
+ "routers_loss": 0.002199822571128607,
"skip_count": 0.0,
"step": 2524,
"text_loss": 0.2418575882911682
@@ -23995,13 +23995,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.060546875,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009049628357156521,
- "loss": 0.0144,
+ "loss": 0.0143,
"macro_f1": 0.6666666865348816,
"num_tokens": 4071284.0,
"repeat_count": 0.0,
- "routers_loss": 0.005672316066920757,
+ "routers_loss": 0.006303096655756235,
"skip_count": 2.0,
"step": 2526,
"text_loss": 0.7948065996170044
@@ -24014,13 +24014,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0380859375,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.000904781216630369,
- "loss": 0.007,
+ "loss": 0.0068,
"macro_f1": 0.6601307392120361,
"num_tokens": 4074750.0,
"repeat_count": 1.0,
- "routers_loss": 0.017167411744594574,
+ "routers_loss": 0.01791904680430889,
"skip_count": 2.0,
"step": 2528,
"text_loss": 0.809726357460022
@@ -24033,13 +24033,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.053955078125,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0009045994424321602,
- "loss": 0.0101,
+ "loss": 0.0102,
"macro_f1": 1.0,
"num_tokens": 4078617.0,
"repeat_count": 2.0,
- "routers_loss": 0.019105618819594383,
+ "routers_loss": 0.016553178429603577,
"skip_count": 2.0,
"step": 2530,
"text_loss": 0.8755000829696655
@@ -24052,13 +24052,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.060791015625,
+ "grad_norm": 0.061767578125,
"learning_rate": 0.0009044175131906817,
"loss": 0.0145,
"macro_f1": 0.3333333432674408,
"num_tokens": 4080936.0,
"repeat_count": 0.0,
- "routers_loss": 0.007993129082024097,
+ "routers_loss": 0.00884837657213211,
"skip_count": 0.0,
"step": 2532,
"text_loss": 0.795871913433075
@@ -24071,13 +24071,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.0009042354289756491,
- "loss": 0.0124,
+ "loss": 0.0122,
"macro_f1": 0.3333333432674408,
"num_tokens": 4084459.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024954001419246197,
+ "routers_loss": 0.0024387789890170097,
"skip_count": 0.0,
"step": 2534,
"text_loss": 0.18875400722026825
@@ -24090,13 +24090,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009040531898568379,
- "loss": 0.0169,
+ "loss": 0.0171,
"macro_f1": 0.3333333432674408,
"num_tokens": 4088464.0,
"repeat_count": 0.0,
- "routers_loss": 0.004360117018222809,
+ "routers_loss": 0.00491489190608263,
"skip_count": 0.0,
"step": 2536,
"text_loss": 0.334369033575058
@@ -24109,13 +24109,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0927734375,
+ "grad_norm": 0.091796875,
"learning_rate": 0.000903870795904082,
- "loss": 0.0142,
+ "loss": 0.0145,
"macro_f1": 0.6666666865348816,
"num_tokens": 4091659.0,
"repeat_count": 0.0,
- "routers_loss": 0.00429064966738224,
+ "routers_loss": 0.004592662677168846,
"skip_count": 2.0,
"step": 2538,
"text_loss": 0.21298295259475708
@@ -24130,11 +24130,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.0458984375,
"learning_rate": 0.000903688247187275,
- "loss": 0.0136,
+ "loss": 0.0137,
"macro_f1": 0.5492662787437439,
"num_tokens": 4095496.0,
"repeat_count": 0.0,
- "routers_loss": 0.0132954316213727,
+ "routers_loss": 0.011647242121398449,
"skip_count": 2.0,
"step": 2540,
"text_loss": 0.2985081672668457
@@ -24147,13 +24147,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0009035055437763704,
- "loss": 0.0129,
+ "loss": 0.0124,
"macro_f1": 0.3333333432674408,
"num_tokens": 4098663.0,
"repeat_count": 0.0,
- "routers_loss": 0.002104961546137929,
+ "routers_loss": 0.0021238960325717926,
"skip_count": 0.0,
"step": 2542,
"text_loss": 0.35359489917755127
@@ -24166,13 +24166,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.060791015625,
+ "grad_norm": 0.05859375,
"learning_rate": 0.0009033226857413803,
- "loss": 0.0167,
+ "loss": 0.0163,
"macro_f1": 0.6666666865348816,
"num_tokens": 4101588.0,
"repeat_count": 1.0,
- "routers_loss": 0.002973714144900441,
+ "routers_loss": 0.0024701557122170925,
"skip_count": 0.0,
"step": 2544,
"text_loss": 1.1577601432800293
@@ -24185,13 +24185,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.080078125,
"learning_rate": 0.000903139673152376,
- "loss": 0.0119,
+ "loss": 0.012,
"macro_f1": 0.3333333432674408,
"num_tokens": 4104643.0,
"repeat_count": 0.0,
- "routers_loss": 0.002359170001000166,
+ "routers_loss": 0.002499542199075222,
"skip_count": 0.0,
"step": 2546,
"text_loss": 1.0173401832580566
@@ -24204,13 +24204,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0615234375,
+ "grad_norm": 0.059814453125,
"learning_rate": 0.0009029565060794885,
- "loss": 0.0168,
+ "loss": 0.0165,
"macro_f1": 0.3333333432674408,
"num_tokens": 4109247.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033595687709748745,
+ "routers_loss": 0.0034200598020106554,
"skip_count": 0.0,
"step": 2548,
"text_loss": 0.5690504312515259
@@ -24223,13 +24223,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.07421875,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009027731845929079,
"loss": 0.0155,
"macro_f1": 0.8823530077934265,
"num_tokens": 4112597.0,
"repeat_count": 1.0,
- "routers_loss": 0.015323673374950886,
+ "routers_loss": 0.015981333330273628,
"skip_count": 1.0,
"step": 2550,
"text_loss": 0.294549822807312
@@ -24242,13 +24242,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.06103515625,
"learning_rate": 0.0009025897087628829,
- "loss": 0.0063,
+ "loss": 0.0064,
"macro_f1": 0.5492662787437439,
"num_tokens": 4115844.0,
"repeat_count": 0.0,
- "routers_loss": 0.02122018299996853,
+ "routers_loss": 0.02606951631605625,
"skip_count": 2.0,
"step": 2552,
"text_loss": 0.22692419588565826
@@ -24261,13 +24261,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07763671875,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009024060786597222,
"loss": 0.0202,
"macro_f1": 0.3333333432674408,
"num_tokens": 4118634.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010765352053567767,
+ "routers_loss": 0.001026194542646408,
"skip_count": 0.0,
"step": 2554,
"text_loss": 0.6807059645652771
@@ -24280,13 +24280,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.000902222294353793,
- "loss": 0.0128,
+ "loss": 0.0124,
"macro_f1": 0.3333333432674408,
"num_tokens": 4122024.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017301233019679785,
+ "routers_loss": 0.001974924933165312,
"skip_count": 0.0,
"step": 2556,
"text_loss": 0.7373668551445007
@@ -24299,13 +24299,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.055908203125,
+ "grad_norm": 0.04833984375,
"learning_rate": 0.0009020383559155219,
- "loss": 0.0056,
+ "loss": 0.0054,
"macro_f1": 1.0,
"num_tokens": 4124803.0,
"repeat_count": 1.0,
- "routers_loss": 0.004307204391807318,
+ "routers_loss": 0.004662613850086927,
"skip_count": 2.0,
"step": 2558,
"text_loss": 0.21808166801929474
@@ -24318,13 +24318,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.029541015625,
+ "grad_norm": 0.0263671875,
"learning_rate": 0.0009018542634153943,
- "loss": 0.0064,
+ "loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 4127680.0,
"repeat_count": 0.0,
- "routers_loss": 0.0073805381543934345,
+ "routers_loss": 0.006881687790155411,
"skip_count": 0.0,
"step": 2560,
"text_loss": 0.25192978978157043
@@ -24339,11 +24339,11 @@
"f1_skip": 1.0,
"grad_norm": 0.049560546875,
"learning_rate": 0.0009016700169239551,
- "loss": 0.0108,
+ "loss": 0.0105,
"macro_f1": 1.0,
"num_tokens": 4130431.0,
"repeat_count": 1.0,
- "routers_loss": 0.005493874195963144,
+ "routers_loss": 0.005977808032184839,
"skip_count": 1.0,
"step": 2562,
"text_loss": 0.4700816869735718
@@ -24356,13 +24356,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0009014856165118075,
- "loss": 0.0154,
+ "loss": 0.0153,
"macro_f1": 0.6666666865348816,
"num_tokens": 4133535.0,
"repeat_count": 0.0,
- "routers_loss": 0.006889877840876579,
+ "routers_loss": 0.007005698047578335,
"skip_count": 1.0,
"step": 2564,
"text_loss": 0.6558199524879456
@@ -24375,13 +24375,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03125,
+ "grad_norm": 0.030517578125,
"learning_rate": 0.0009013010622496144,
- "loss": 0.009,
+ "loss": 0.0088,
"macro_f1": 0.3333333432674408,
"num_tokens": 4136534.0,
"repeat_count": 0.0,
- "routers_loss": 0.008495541289448738,
+ "routers_loss": 0.007262171246111393,
"skip_count": 0.0,
"step": 2566,
"text_loss": 0.2565421462059021
@@ -24394,13 +24394,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0009011163542080971,
- "loss": 0.0089,
+ "loss": 0.0088,
"macro_f1": 0.5934640765190125,
"num_tokens": 4139762.0,
"repeat_count": 0.0,
- "routers_loss": 0.05929862707853317,
+ "routers_loss": 0.05431923270225525,
"skip_count": 3.0,
"step": 2568,
"text_loss": 0.19896510243415833
@@ -24413,13 +24413,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.026611328125,
"learning_rate": 0.0009009314924580363,
- "loss": 0.0086,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 4143398.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033934004604816437,
+ "routers_loss": 0.003667369019240141,
"skip_count": 0.0,
"step": 2570,
"text_loss": 0.6581419110298157
@@ -24432,13 +24432,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.054931640625,
+ "grad_norm": 0.052978515625,
"learning_rate": 0.0009007464770702712,
"loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 4146248.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012826769379898906,
+ "routers_loss": 0.00132099783513695,
"skip_count": 0.0,
"step": 2572,
"text_loss": 0.5316711068153381
@@ -24451,13 +24451,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.0009005613081157002,
"loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 4149455.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019460092298686504,
+ "routers_loss": 0.0020061524119228125,
"skip_count": 0.0,
"step": 2574,
"text_loss": 0.5400773882865906
@@ -24470,13 +24470,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.05517578125,
"learning_rate": 0.0009003759856652802,
- "loss": 0.0112,
+ "loss": 0.0111,
"macro_f1": 0.6666666865348816,
"num_tokens": 4152774.0,
"repeat_count": 0.0,
- "routers_loss": 0.004493138287216425,
+ "routers_loss": 0.002621434163302183,
"skip_count": 1.0,
"step": 2576,
"text_loss": 0.3672606945037842
@@ -24489,13 +24489,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.055908203125,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0009001905097900273,
"loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 4155835.0,
"repeat_count": 0.0,
- "routers_loss": 0.005607665050774813,
+ "routers_loss": 0.005290219560265541,
"skip_count": 0.0,
"step": 2578,
"text_loss": 0.8159038424491882
@@ -24508,13 +24508,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.040771484375,
"learning_rate": 0.0009000048805610161,
- "loss": 0.0123,
+ "loss": 0.0119,
"macro_f1": 0.3333333432674408,
"num_tokens": 4158874.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015080278972163796,
+ "routers_loss": 0.0013576085912063718,
"skip_count": 0.0,
"step": 2580,
"text_loss": 0.5518951416015625
@@ -24529,11 +24529,11 @@
"f1_skip": 0.0,
"grad_norm": 0.138671875,
"learning_rate": 0.00089981909804938,
- "loss": 0.0142,
+ "loss": 0.0143,
"macro_f1": 0.3333333432674408,
"num_tokens": 4162076.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022276053205132484,
+ "routers_loss": 0.0021483441814780235,
"skip_count": 0.0,
"step": 2582,
"text_loss": 0.43552228808403015
@@ -24546,13 +24546,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 1.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.07421875,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0008996331623263114,
- "loss": 0.0116,
+ "loss": 0.0117,
"macro_f1": 0.7795917987823486,
"num_tokens": 4165041.0,
"repeat_count": 1.0,
- "routers_loss": 0.0499282106757164,
+ "routers_loss": 0.0544300302863121,
"skip_count": 4.0,
"step": 2584,
"text_loss": 0.24812501668930054
@@ -24565,13 +24565,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.047607421875,
"learning_rate": 0.0008994470734630611,
- "loss": 0.01,
+ "loss": 0.0101,
"macro_f1": 0.3333333432674408,
"num_tokens": 4168290.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016360745066776872,
+ "routers_loss": 0.0017150711501017213,
"skip_count": 0.0,
"step": 2586,
"text_loss": 0.6392097473144531
@@ -24584,32 +24584,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0008992608315309388,
- "loss": 0.0149,
+ "loss": 0.015,
"macro_f1": 0.6666666865348816,
"num_tokens": 4171310.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037772543728351593,
+ "routers_loss": 0.0046473173424601555,
"skip_count": 2.0,
"step": 2588,
"text_loss": 0.6534156799316406
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 12.15967126504256,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.060791015625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06591796875,
"learning_rate": 0.0008990744366013125,
- "loss": 0.0104,
- "macro_f1": 0.6538461446762085,
+ "loss": 0.0105,
+ "macro_f1": 0.3144654333591461,
"num_tokens": 4174042.0,
"repeat_count": 2.0,
- "routers_loss": 0.05992122367024422,
+ "routers_loss": 0.060913100838661194,
"skip_count": 1.0,
"step": 2590,
"text_loss": 0.5365690588951111
@@ -24622,13 +24622,13 @@
"f1_execute": 0.9583333134651184,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.055419921875,
"learning_rate": 0.0008988878887456093,
"loss": 0.0118,
"macro_f1": 0.6051587462425232,
"num_tokens": 4177666.0,
"repeat_count": 1.0,
- "routers_loss": 0.0679154023528099,
+ "routers_loss": 0.06268956512212753,
"skip_count": 4.0,
"step": 2592,
"text_loss": 0.226226806640625
@@ -24643,11 +24643,11 @@
"f1_skip": 0.0,
"grad_norm": 0.03662109375,
"learning_rate": 0.0008987011880353149,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.32098764181137085,
"num_tokens": 4180490.0,
"repeat_count": 0.0,
- "routers_loss": 0.03284052759408951,
+ "routers_loss": 0.030141465365886688,
"skip_count": 2.0,
"step": 2594,
"text_loss": 0.2581401765346527
@@ -24660,13 +24660,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.051513671875,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0008985143345419729,
- "loss": 0.0087,
+ "loss": 0.0082,
"macro_f1": 0.5492662787437439,
"num_tokens": 4183300.0,
"repeat_count": 0.0,
- "routers_loss": 0.01971421390771866,
+ "routers_loss": 0.018745863810181618,
"skip_count": 2.0,
"step": 2596,
"text_loss": 0.7778542637825012
@@ -24679,13 +24679,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0703125,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0008983273283371862,
- "loss": 0.0099,
+ "loss": 0.0096,
"macro_f1": 0.5492662787437439,
"num_tokens": 4186535.0,
"repeat_count": 0.0,
- "routers_loss": 0.028065117076039314,
+ "routers_loss": 0.026792079210281372,
"skip_count": 2.0,
"step": 2598,
"text_loss": 0.34700271487236023
@@ -24698,13 +24698,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0008981401694926159,
- "loss": 0.0077,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 4189082.0,
"repeat_count": 0.0,
- "routers_loss": 0.00166845612693578,
+ "routers_loss": 0.001914160675369203,
"skip_count": 0.0,
"step": 2600,
"text_loss": 0.6879339218139648
@@ -24717,13 +24717,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0008979528580799815,
- "loss": 0.0138,
+ "loss": 0.0136,
"macro_f1": 0.6666666865348816,
"num_tokens": 4192330.0,
"repeat_count": 0.0,
- "routers_loss": 0.007527270819991827,
+ "routers_loss": 0.007978348061442375,
"skip_count": 2.0,
"step": 2602,
"text_loss": 0.3524550497531891
@@ -24736,13 +24736,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.0008977653941710613,
- "loss": 0.0137,
+ "loss": 0.0134,
"macro_f1": 0.6666666865348816,
"num_tokens": 4196117.0,
"repeat_count": 2.0,
- "routers_loss": 0.00412185862660408,
+ "routers_loss": 0.0035376469604671,
"skip_count": 0.0,
"step": 2604,
"text_loss": 0.42356348037719727
@@ -24755,13 +24755,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06005859375,
+ "grad_norm": 0.05810546875,
"learning_rate": 0.0008975777778376916,
- "loss": 0.0157,
+ "loss": 0.0156,
"macro_f1": 0.6666666865348816,
"num_tokens": 4200423.0,
"repeat_count": 0.0,
- "routers_loss": 0.007787751499563456,
+ "routers_loss": 0.008262477815151215,
"skip_count": 1.0,
"step": 2606,
"text_loss": 0.5272893905639648
@@ -24774,13 +24774,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0008973900091517675,
"loss": 0.0114,
"macro_f1": 0.3272727429866791,
"num_tokens": 4203257.0,
"repeat_count": 0.0,
- "routers_loss": 0.024111779406666756,
+ "routers_loss": 0.022957922890782356,
"skip_count": 1.0,
"step": 2608,
"text_loss": 0.2713734805583954
@@ -24793,13 +24793,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.045166015625,
+ "grad_norm": 0.043701171875,
"learning_rate": 0.000897202088185242,
- "loss": 0.0091,
+ "loss": 0.0085,
"macro_f1": 0.6666666865348816,
"num_tokens": 4206243.0,
"repeat_count": 0.0,
- "routers_loss": 0.0057326615788042545,
+ "routers_loss": 0.006623407825827599,
"skip_count": 2.0,
"step": 2610,
"text_loss": 0.5920525789260864
@@ -24812,13 +24812,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.0517578125,
"learning_rate": 0.0008970140150101274,
- "loss": 0.0118,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 4209264.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008877563523128629,
+ "routers_loss": 0.0008602747693657875,
"skip_count": 0.0,
"step": 2612,
"text_loss": 0.33421996235847473
@@ -24831,13 +24831,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.030517578125,
"learning_rate": 0.0008968257896984932,
- "loss": 0.0067,
+ "loss": 0.0062,
"macro_f1": 0.6666666865348816,
"num_tokens": 4212058.0,
"repeat_count": 0.0,
- "routers_loss": 0.0039034869987517595,
+ "routers_loss": 0.0024653903674334288,
"skip_count": 1.0,
"step": 2614,
"text_loss": 0.37923356890678406
@@ -24850,13 +24850,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0008966374123224677,
- "loss": 0.0085,
+ "loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4214929.0,
"repeat_count": 0.0,
- "routers_loss": 0.01140254084020853,
+ "routers_loss": 0.010878405533730984,
"skip_count": 0.0,
"step": 2616,
"text_loss": 0.4350503981113434
@@ -24869,13 +24869,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03125,
+ "grad_norm": 0.0303955078125,
"learning_rate": 0.0008964488829542376,
"loss": 0.0083,
"macro_f1": 0.3272727429866791,
"num_tokens": 4219170.0,
"repeat_count": 0.0,
- "routers_loss": 0.028559349477291107,
+ "routers_loss": 0.02864212542772293,
"skip_count": 1.0,
"step": 2618,
"text_loss": 0.26250728964805603
@@ -24888,13 +24888,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.061279296875,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0008962602016660478,
- "loss": 0.0097,
+ "loss": 0.0096,
"macro_f1": 0.6666666865348816,
"num_tokens": 4222077.0,
"repeat_count": 0.0,
- "routers_loss": 0.010525460354983807,
+ "routers_loss": 0.010444172658026218,
"skip_count": 2.0,
"step": 2620,
"text_loss": 0.4718937575817108
@@ -24907,13 +24907,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048583984375,
+ "grad_norm": 0.0478515625,
"learning_rate": 0.0008960713685302011,
- "loss": 0.0104,
+ "loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 4225383.0,
"repeat_count": 0.0,
- "routers_loss": 0.005284689832478762,
+ "routers_loss": 0.006409442983567715,
"skip_count": 1.0,
"step": 2622,
"text_loss": 0.30420538783073425
@@ -24926,13 +24926,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0284423828125,
+ "grad_norm": 0.02978515625,
"learning_rate": 0.0008958823836190588,
- "loss": 0.0051,
+ "loss": 0.005,
"macro_f1": 0.3272727429866791,
"num_tokens": 4228349.0,
"repeat_count": 0.0,
- "routers_loss": 0.011040215380489826,
+ "routers_loss": 0.009996986016631126,
"skip_count": 1.0,
"step": 2624,
"text_loss": 0.5392362475395203
@@ -24945,13 +24945,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.031494140625,
"learning_rate": 0.0008956932470050404,
"loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 4232007.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014406041009351611,
+ "routers_loss": 0.0014383369125425816,
"skip_count": 0.0,
"step": 2626,
"text_loss": 0.7112401127815247
@@ -24964,13 +24964,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0008955039587606233,
- "loss": 0.0111,
+ "loss": 0.0109,
"macro_f1": 0.6666666865348816,
"num_tokens": 4235122.0,
"repeat_count": 0.0,
- "routers_loss": 0.007106760982424021,
+ "routers_loss": 0.00781513936817646,
"skip_count": 3.0,
"step": 2628,
"text_loss": 0.17802883684635162
@@ -24983,13 +24983,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0400390625,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0008953145189583429,
- "loss": 0.0125,
+ "loss": 0.0126,
"macro_f1": 0.542222261428833,
"num_tokens": 4238248.0,
"repeat_count": 0.0,
- "routers_loss": 0.06423533707857132,
+ "routers_loss": 0.062252625823020935,
"skip_count": 4.0,
"step": 2630,
"text_loss": 0.5551572442054749
@@ -25002,13 +25002,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0008951249276707933,
- "loss": 0.012,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 4241042.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010294591775164008,
+ "routers_loss": 0.0011421777307987213,
"skip_count": 0.0,
"step": 2632,
"text_loss": 0.7092233896255493
@@ -25021,13 +25021,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0008949351849706261,
- "loss": 0.0122,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 4243939.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032732547260820866,
+ "routers_loss": 0.0032689040526747704,
"skip_count": 0.0,
"step": 2634,
"text_loss": 0.19925718009471893
@@ -25040,13 +25040,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0390625,
+ "grad_norm": 0.033935546875,
"learning_rate": 0.0008947452909305509,
- "loss": 0.0112,
+ "loss": 0.0109,
"macro_f1": 0.6666666865348816,
"num_tokens": 4247535.0,
"repeat_count": 1.0,
- "routers_loss": 0.0021109411027282476,
+ "routers_loss": 0.002066014800220728,
"skip_count": 0.0,
"step": 2636,
"text_loss": 0.5249715447425842
@@ -25059,13 +25059,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.11279296875,
+ "grad_norm": 0.09326171875,
"learning_rate": 0.0008945552456233356,
"loss": 0.0169,
"macro_f1": 0.8820862174034119,
"num_tokens": 4251441.0,
"repeat_count": 2.0,
- "routers_loss": 0.029545020312070847,
+ "routers_loss": 0.029332537204027176,
"skip_count": 2.0,
"step": 2638,
"text_loss": 0.19229578971862793
@@ -25078,13 +25078,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.078125,
"learning_rate": 0.0008943650491218058,
- "loss": 0.0083,
+ "loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4254314.0,
"repeat_count": 0.0,
- "routers_loss": 0.0075805820524692535,
+ "routers_loss": 0.0075911120511591434,
"skip_count": 0.0,
"step": 2640,
"text_loss": 0.27059751749038696
@@ -25097,13 +25097,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.0008941747014988453,
- "loss": 0.0155,
+ "loss": 0.0156,
"macro_f1": 0.3333333432674408,
"num_tokens": 4257442.0,
"repeat_count": 0.0,
- "routers_loss": 0.008832095190882683,
+ "routers_loss": 0.009030844084918499,
"skip_count": 0.0,
"step": 2642,
"text_loss": 0.36747801303863525
@@ -25116,13 +25116,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.123046875,
"learning_rate": 0.0008939842028273956,
- "loss": 0.011,
+ "loss": 0.0112,
"macro_f1": 0.6666666865348816,
"num_tokens": 4260386.0,
"repeat_count": 0.0,
- "routers_loss": 0.008952614851295948,
+ "routers_loss": 0.007844001986086369,
"skip_count": 1.0,
"step": 2644,
"text_loss": 0.6397647857666016
@@ -25135,13 +25135,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0250244140625,
+ "grad_norm": 0.0283203125,
"learning_rate": 0.0008937935531804562,
- "loss": 0.0075,
+ "loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 4263516.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017659157747402787,
+ "routers_loss": 0.0018789108144119382,
"skip_count": 0.0,
"step": 2646,
"text_loss": 0.4795534908771515
@@ -25154,13 +25154,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05419921875,
+ "grad_norm": 0.06494140625,
"learning_rate": 0.0008936027526310844,
- "loss": 0.0101,
+ "loss": 0.0098,
"macro_f1": 0.3272727429866791,
"num_tokens": 4266744.0,
"repeat_count": 0.0,
- "routers_loss": 0.03944230079650879,
+ "routers_loss": 0.0348590686917305,
"skip_count": 1.0,
"step": 2648,
"text_loss": 0.27691999077796936
@@ -25173,13 +25173,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.000893411801252395,
"loss": 0.015,
"macro_f1": 0.6666666865348816,
"num_tokens": 4269766.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037144431844353676,
+ "routers_loss": 0.004543309565633535,
"skip_count": 1.0,
"step": 2650,
"text_loss": 0.18867231905460358
@@ -25192,13 +25192,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0008932206991175615,
- "loss": 0.0143,
+ "loss": 0.0141,
"macro_f1": 0.6666666865348816,
"num_tokens": 4273513.0,
"repeat_count": 0.0,
- "routers_loss": 0.003659905167296529,
+ "routers_loss": 0.0035277456045150757,
"skip_count": 1.0,
"step": 2652,
"text_loss": 0.45613357424736023
@@ -25211,13 +25211,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.0008930294462998143,
"loss": 0.015,
"macro_f1": 0.6666666865348816,
"num_tokens": 4276878.0,
"repeat_count": 1.0,
- "routers_loss": 0.011676746420562267,
+ "routers_loss": 0.011337592266499996,
"skip_count": 0.0,
"step": 2654,
"text_loss": 0.24733254313468933
@@ -25230,13 +25230,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0008928380428724419,
- "loss": 0.0061,
+ "loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 4279915.0,
"repeat_count": 0.0,
- "routers_loss": 0.000998969655483961,
+ "routers_loss": 0.0010295971296727657,
"skip_count": 1.0,
"step": 2656,
"text_loss": 0.41722849011421204
@@ -25249,13 +25249,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0008926464889087903,
- "loss": 0.0109,
+ "loss": 0.0116,
"macro_f1": 0.6666666865348816,
"num_tokens": 4282888.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016260759439319372,
+ "routers_loss": 0.0017198545392602682,
"skip_count": 2.0,
"step": 2658,
"text_loss": 0.738322377204895
@@ -25268,13 +25268,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0008924547844822634,
- "loss": 0.0101,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 4285805.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010900370543822646,
+ "routers_loss": 0.001339946174994111,
"skip_count": 0.0,
"step": 2660,
"text_loss": 0.4802379906177521
@@ -25287,13 +25287,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.05322265625,
"learning_rate": 0.000892262929666323,
- "loss": 0.0101,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 4290282.0,
"repeat_count": 0.0,
- "routers_loss": 0.002275131642818451,
+ "routers_loss": 0.0022340165451169014,
"skip_count": 0.0,
"step": 2662,
"text_loss": 0.6503544449806213
@@ -25306,13 +25306,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0008920709245344878,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 4294106.0,
"repeat_count": 0.0,
- "routers_loss": 0.00575100164860487,
+ "routers_loss": 0.005288850050419569,
"skip_count": 1.0,
"step": 2664,
"text_loss": 0.12312037497758865
@@ -25325,13 +25325,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.038330078125,
+ "grad_norm": 0.041259765625,
"learning_rate": 0.0008918787691603347,
- "loss": 0.0122,
+ "loss": 0.0121,
"macro_f1": 0.6666666865348816,
"num_tokens": 4298013.0,
"repeat_count": 0.0,
- "routers_loss": 0.004139711149036884,
+ "routers_loss": 0.004259659443050623,
"skip_count": 1.0,
"step": 2666,
"text_loss": 0.3070000112056732
@@ -25344,13 +25344,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.000891686463617498,
- "loss": 0.0072,
+ "loss": 0.0069,
"macro_f1": 0.6666666865348816,
"num_tokens": 4300799.0,
"repeat_count": 0.0,
- "routers_loss": 0.008856390602886677,
+ "routers_loss": 0.009489355608820915,
"skip_count": 1.0,
"step": 2668,
"text_loss": 0.18535588681697845
@@ -25363,13 +25363,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0576171875,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.0008914940079796696,
- "loss": 0.0116,
+ "loss": 0.0114,
"macro_f1": 0.3333333432674408,
"num_tokens": 4304641.0,
"repeat_count": 0.0,
- "routers_loss": 0.002438562922179699,
+ "routers_loss": 0.0025417013093829155,
"skip_count": 0.0,
"step": 2670,
"text_loss": 0.482585072517395
@@ -25382,13 +25382,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0008913014023205988,
"loss": 0.0108,
"macro_f1": 0.3333333432674408,
"num_tokens": 4307462.0,
"repeat_count": 0.0,
- "routers_loss": 0.006435772404074669,
+ "routers_loss": 0.006371749565005302,
"skip_count": 0.0,
"step": 2672,
"text_loss": 0.7064456939697266
@@ -25401,13 +25401,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0008911086467140925,
- "loss": 0.0069,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 4310396.0,
"repeat_count": 0.0,
- "routers_loss": 0.002773779444396496,
+ "routers_loss": 0.0027512952219694853,
"skip_count": 0.0,
"step": 2674,
"text_loss": 0.23532851040363312
@@ -25420,13 +25420,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.05712890625,
"learning_rate": 0.000890915741234015,
- "loss": 0.0135,
+ "loss": 0.0133,
"macro_f1": 0.6666666865348816,
"num_tokens": 4314781.0,
"repeat_count": 0.0,
- "routers_loss": 0.00862761028110981,
+ "routers_loss": 0.008253013715147972,
"skip_count": 1.0,
"step": 2676,
"text_loss": 0.30950358510017395
@@ -25439,13 +25439,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033203125,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0008907226859542879,
- "loss": 0.0104,
+ "loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 4317988.0,
"repeat_count": 0.0,
- "routers_loss": 0.005587176885455847,
+ "routers_loss": 0.005409995559602976,
"skip_count": 2.0,
"step": 2678,
"text_loss": 0.4930732846260071
@@ -25458,13 +25458,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.042236328125,
+ "grad_norm": 0.060546875,
"learning_rate": 0.0008905294809488907,
- "loss": 0.0082,
+ "loss": 0.0084,
"macro_f1": 1.0,
"num_tokens": 4321014.0,
"repeat_count": 1.0,
- "routers_loss": 0.0033104203175753355,
+ "routers_loss": 0.0029942214023321867,
"skip_count": 1.0,
"step": 2680,
"text_loss": 0.6224040389060974
@@ -25477,13 +25477,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0008903361262918595,
- "loss": 0.0117,
+ "loss": 0.0115,
"macro_f1": 0.6666666865348816,
"num_tokens": 4324268.0,
"repeat_count": 0.0,
- "routers_loss": 0.008205405436456203,
+ "routers_loss": 0.008411120623350143,
"skip_count": 1.0,
"step": 2682,
"text_loss": 0.16296671330928802
@@ -25496,13 +25496,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.052734375,
+ "grad_norm": 0.05126953125,
"learning_rate": 0.0008901426220572884,
- "loss": 0.0142,
+ "loss": 0.0138,
"macro_f1": 1.0,
"num_tokens": 4327494.0,
"repeat_count": 2.0,
- "routers_loss": 0.007884894497692585,
+ "routers_loss": 0.01039006095379591,
"skip_count": 4.0,
"step": 2684,
"text_loss": 0.43866512179374695
@@ -25515,13 +25515,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.060791015625,
"learning_rate": 0.0008899489683193286,
- "loss": 0.011,
+ "loss": 0.0107,
"macro_f1": 0.3333333432674408,
"num_tokens": 4330936.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009336905204690993,
+ "routers_loss": 0.0009329111780971289,
"skip_count": 0.0,
"step": 2686,
"text_loss": 0.44250962138175964
@@ -25534,13 +25534,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0810546875,
+ "grad_norm": 0.07421875,
"learning_rate": 0.0008897551651521885,
"loss": 0.0111,
"macro_f1": 0.3333333432674408,
"num_tokens": 4334123.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033622782211750746,
+ "routers_loss": 0.003197216661646962,
"skip_count": 0.0,
"step": 2688,
"text_loss": 0.48313501477241516
@@ -25553,13 +25553,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07763671875,
+ "grad_norm": 0.09716796875,
"learning_rate": 0.0008895612126301339,
"loss": 0.0157,
"macro_f1": 0.3333333432674408,
"num_tokens": 4337610.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034563415683805943,
+ "routers_loss": 0.0033548236824572086,
"skip_count": 0.0,
"step": 2690,
"text_loss": 0.4715327322483063
@@ -25572,13 +25572,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0008893671108274877,
- "loss": 0.0115,
+ "loss": 0.0118,
"macro_f1": 0.3333333432674408,
"num_tokens": 4341026.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022277699317783117,
+ "routers_loss": 0.0024757643695920706,
"skip_count": 0.0,
"step": 2692,
"text_loss": 0.43402785062789917
@@ -25591,13 +25591,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0008891728598186302,
- "loss": 0.011,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 4344422.0,
"repeat_count": 0.0,
- "routers_loss": 0.003892304375767708,
+ "routers_loss": 0.003317243419587612,
"skip_count": 0.0,
"step": 2694,
"text_loss": 0.8498559594154358
@@ -25610,13 +25610,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.0380859375,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0008889784596779986,
- "loss": 0.0092,
+ "loss": 0.009,
"macro_f1": 0.5934640765190125,
"num_tokens": 4347507.0,
"repeat_count": 0.0,
- "routers_loss": 0.015058296732604504,
+ "routers_loss": 0.01577926240861416,
"skip_count": 3.0,
"step": 2696,
"text_loss": 0.5646669864654541
@@ -25629,13 +25629,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.11328125,
"learning_rate": 0.0008887839104800876,
- "loss": 0.0118,
+ "loss": 0.0124,
"macro_f1": 0.3333333432674408,
"num_tokens": 4350414.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033561652526259422,
+ "routers_loss": 0.002953822258859873,
"skip_count": 0.0,
"step": 2698,
"text_loss": 0.5145012140274048
@@ -25648,13 +25648,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.0008885892122994486,
- "loss": 0.0116,
+ "loss": 0.0112,
"macro_f1": 0.3333333432674408,
"num_tokens": 4354110.0,
"repeat_count": 0.0,
- "routers_loss": 0.0062471418641507626,
+ "routers_loss": 0.005849295295774937,
"skip_count": 0.0,
"step": 2700,
"text_loss": 0.580982506275177
@@ -25667,13 +25667,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008883943652106903,
"loss": 0.0086,
"macro_f1": 1.0,
"num_tokens": 4357323.0,
"repeat_count": 1.0,
- "routers_loss": 0.011802209541201591,
+ "routers_loss": 0.012347398325800896,
"skip_count": 2.0,
"step": 2702,
"text_loss": 0.2234988808631897
@@ -25686,13 +25686,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.0008881993692884787,
- "loss": 0.0132,
+ "loss": 0.0128,
"macro_f1": 0.6666666865348816,
"num_tokens": 4360228.0,
"repeat_count": 0.0,
- "routers_loss": 0.0041528744623064995,
+ "routers_loss": 0.003574999049305916,
"skip_count": 1.0,
"step": 2704,
"text_loss": 0.4261806607246399
@@ -25705,13 +25705,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0008880042246075365,
- "loss": 0.0094,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 4363905.0,
"repeat_count": 0.0,
- "routers_loss": 0.003151095937937498,
+ "routers_loss": 0.0031574300955981016,
"skip_count": 0.0,
"step": 2706,
"text_loss": 0.691118061542511
@@ -25724,13 +25724,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008878089312426433,
"loss": 0.0091,
"macro_f1": 0.3333333432674408,
"num_tokens": 4366736.0,
"repeat_count": 0.0,
- "routers_loss": 0.003142676781862974,
+ "routers_loss": 0.003195564029738307,
"skip_count": 0.0,
"step": 2708,
"text_loss": 0.613926112651825
@@ -25743,13 +25743,13 @@
"f1_execute": 0.9583333134651184,
"f1_repeat": 0.0,
"f1_skip": 0.75,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.054443359375,
"learning_rate": 0.0008876134892686363,
"loss": 0.011,
"macro_f1": 0.5694444179534912,
"num_tokens": 4370146.0,
"repeat_count": 0.0,
- "routers_loss": 0.032964516431093216,
+ "routers_loss": 0.038784291595220566,
"skip_count": 5.0,
"step": 2710,
"text_loss": 0.2723451852798462
@@ -25762,13 +25762,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.000887417898760409,
- "loss": 0.0123,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 4373653.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006848900229670107,
+ "routers_loss": 0.0006457131239585578,
"skip_count": 0.0,
"step": 2712,
"text_loss": 0.31667640805244446
@@ -25781,13 +25781,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.10498046875,
"learning_rate": 0.000887222159792912,
- "loss": 0.0156,
+ "loss": 0.0155,
"macro_f1": 0.6603773832321167,
"num_tokens": 4376993.0,
"repeat_count": 1.0,
- "routers_loss": 0.04388813674449921,
+ "routers_loss": 0.045078590512275696,
"skip_count": 1.0,
"step": 2714,
"text_loss": 0.5872798562049866
@@ -25800,13 +25800,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.034912109375,
"learning_rate": 0.0008870262724411528,
- "loss": 0.0122,
+ "loss": 0.012,
"macro_f1": 0.3333333432674408,
"num_tokens": 4380160.0,
"repeat_count": 0.0,
- "routers_loss": 0.003538437420502305,
+ "routers_loss": 0.003628545207902789,
"skip_count": 0.0,
"step": 2716,
"text_loss": 0.7468157410621643
@@ -25819,13 +25819,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1328125,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0008868302367801962,
- "loss": 0.0123,
+ "loss": 0.0118,
"macro_f1": 0.6598639488220215,
"num_tokens": 4383100.0,
"repeat_count": 1.0,
- "routers_loss": 0.05479869619011879,
+ "routers_loss": 0.05404464527964592,
"skip_count": 3.0,
"step": 2718,
"text_loss": 0.2970244884490967
@@ -25838,13 +25838,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0008866340528851629,
"loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 4386700.0,
"repeat_count": 0.0,
- "routers_loss": 0.0070296903140842915,
+ "routers_loss": 0.007000274024903774,
"skip_count": 0.0,
"step": 2720,
"text_loss": 0.34521186351776123
@@ -25857,13 +25857,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05810546875,
+ "grad_norm": 0.052978515625,
"learning_rate": 0.0008864377208312313,
- "loss": 0.0085,
+ "loss": 0.0082,
"macro_f1": 0.8823530077934265,
"num_tokens": 4390299.0,
"repeat_count": 1.0,
- "routers_loss": 0.02051853947341442,
+ "routers_loss": 0.02025366574525833,
"skip_count": 2.0,
"step": 2722,
"text_loss": 1.0536936521530151
@@ -25876,13 +25876,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.000886241240693636,
- "loss": 0.0096,
+ "loss": 0.0098,
"macro_f1": 0.3333333432674408,
"num_tokens": 4393353.0,
"repeat_count": 0.0,
- "routers_loss": 0.002662461483851075,
+ "routers_loss": 0.00251673418097198,
"skip_count": 0.0,
"step": 2724,
"text_loss": 0.5678093433380127
@@ -25895,13 +25895,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.05615234375,
+ "grad_norm": 0.052001953125,
"learning_rate": 0.0008860446125476686,
"loss": 0.0135,
"macro_f1": 0.6666666865348816,
"num_tokens": 4396446.0,
"repeat_count": 1.0,
- "routers_loss": 0.009321866557002068,
+ "routers_loss": 0.009532532654702663,
"skip_count": 0.0,
"step": 2726,
"text_loss": 0.23775041103363037
@@ -25914,13 +25914,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.109375,
+ "grad_norm": 0.091796875,
"learning_rate": 0.0008858478364686776,
- "loss": 0.0102,
+ "loss": 0.0099,
"macro_f1": 0.6666666865348816,
"num_tokens": 4399977.0,
"repeat_count": 1.0,
- "routers_loss": 0.01029124017804861,
+ "routers_loss": 0.008062181062996387,
"skip_count": 0.0,
"step": 2728,
"text_loss": 0.18888695538043976
@@ -25933,13 +25933,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.0008856509125320678,
- "loss": 0.0082,
+ "loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 4404406.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008023424888961017,
+ "routers_loss": 0.0007731119985692203,
"skip_count": 0.0,
"step": 2730,
"text_loss": 0.47331541776657104
@@ -25952,13 +25952,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0517578125,
+ "grad_norm": 0.0498046875,
"learning_rate": 0.0008854538408133006,
- "loss": 0.0115,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 4407165.0,
"repeat_count": 0.0,
- "routers_loss": 0.003058656118810177,
+ "routers_loss": 0.003115242812782526,
"skip_count": 1.0,
"step": 2732,
"text_loss": 0.491370290517807
@@ -25971,13 +25971,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0008852566213878947,
- "loss": 0.0082,
+ "loss": 0.0081,
"macro_f1": 0.3333333432674408,
"num_tokens": 4410101.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010282890871167183,
+ "routers_loss": 0.0008958528051152825,
"skip_count": 0.0,
"step": 2734,
"text_loss": 0.42188262939453125
@@ -25990,13 +25990,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.07421875,
+ "grad_norm": 0.07763671875,
"learning_rate": 0.0008850592543314246,
- "loss": 0.0123,
+ "loss": 0.0118,
"macro_f1": 1.0,
"num_tokens": 4413015.0,
"repeat_count": 1.0,
- "routers_loss": 0.014785367995500565,
+ "routers_loss": 0.01139112375676632,
"skip_count": 1.0,
"step": 2736,
"text_loss": 0.4716498553752899
@@ -26009,13 +26009,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0654296875,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0008848617397195218,
- "loss": 0.0089,
+ "loss": 0.0084,
"macro_f1": 0.6603773832321167,
"num_tokens": 4416404.0,
"repeat_count": 1.0,
- "routers_loss": 0.017717093229293823,
+ "routers_loss": 0.01609630137681961,
"skip_count": 1.0,
"step": 2738,
"text_loss": 0.19490821659564972
@@ -26028,13 +26028,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0008846640776278745,
- "loss": 0.0067,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 4419408.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011861984385177493,
+ "routers_loss": 0.001489170710556209,
"skip_count": 0.0,
"step": 2740,
"text_loss": 0.6443108320236206
@@ -26047,13 +26047,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0008844662681322269,
"loss": 0.0144,
"macro_f1": 0.6666666865348816,
"num_tokens": 4422067.0,
"repeat_count": 1.0,
- "routers_loss": 0.0013843412743881345,
+ "routers_loss": 0.0014755792217329144,
"skip_count": 0.0,
"step": 2742,
"text_loss": 0.9150356650352478
@@ -26068,11 +26068,11 @@
"f1_skip": 1.0,
"grad_norm": 0.05078125,
"learning_rate": 0.0008842683113083801,
- "loss": 0.0154,
+ "loss": 0.0149,
"macro_f1": 0.6666666865348816,
"num_tokens": 4425647.0,
"repeat_count": 0.0,
- "routers_loss": 0.010318896733224392,
+ "routers_loss": 0.008962674997746944,
"skip_count": 1.0,
"step": 2744,
"text_loss": 0.7103227972984314
@@ -26085,13 +26085,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0008840702072321915,
- "loss": 0.0108,
+ "loss": 0.0104,
"macro_f1": 0.6598639488220215,
"num_tokens": 4428855.0,
"repeat_count": 1.0,
- "routers_loss": 0.029359478503465652,
+ "routers_loss": 0.02554207295179367,
"skip_count": 3.0,
"step": 2746,
"text_loss": 0.27141591906547546
@@ -26104,13 +26104,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0234375,
+ "grad_norm": 0.0230712890625,
"learning_rate": 0.0008838719559795751,
"loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 4432838.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014995118835940957,
+ "routers_loss": 0.0011747616808861494,
"skip_count": 0.0,
"step": 2748,
"text_loss": 0.4007738530635834
@@ -26123,13 +26123,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.03515625,
+ "grad_norm": 0.03466796875,
"learning_rate": 0.0008836735576265009,
- "loss": 0.0074,
+ "loss": 0.0073,
"macro_f1": 0.5492662787437439,
"num_tokens": 4435793.0,
"repeat_count": 0.0,
- "routers_loss": 0.017950648441910744,
+ "routers_loss": 0.017564335837960243,
"skip_count": 2.0,
"step": 2750,
"text_loss": 0.5972410440444946
@@ -26142,13 +26142,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.044921875,
"learning_rate": 0.0008834750122489956,
- "loss": 0.0083,
+ "loss": 0.0086,
"macro_f1": 0.6666666865348816,
"num_tokens": 4438871.0,
"repeat_count": 1.0,
- "routers_loss": 0.0069067892618477345,
+ "routers_loss": 0.007004009559750557,
"skip_count": 0.0,
"step": 2752,
"text_loss": 0.2294853925704956
@@ -26161,13 +26161,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051513671875,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0008832763199231423,
- "loss": 0.0101,
+ "loss": 0.0107,
"macro_f1": 0.3333333432674408,
"num_tokens": 4441846.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013944554375484586,
+ "routers_loss": 0.0014562139986082911,
"skip_count": 0.0,
"step": 2754,
"text_loss": 0.722432017326355
@@ -26180,13 +26180,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0008830774807250802,
"loss": 0.013,
"macro_f1": 0.3272727429866791,
"num_tokens": 4444786.0,
"repeat_count": 1.0,
- "routers_loss": 0.025158623233437538,
+ "routers_loss": 0.024773593991994858,
"skip_count": 0.0,
"step": 2756,
"text_loss": 0.507905125617981
@@ -26199,13 +26199,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05419921875,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0008828784947310049,
- "loss": 0.0131,
+ "loss": 0.0129,
"macro_f1": 0.8823530077934265,
"num_tokens": 4448442.0,
"repeat_count": 1.0,
- "routers_loss": 0.05205477401614189,
+ "routers_loss": 0.04959975928068161,
"skip_count": 2.0,
"step": 2758,
"text_loss": 0.3617522418498993
@@ -26218,13 +26218,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.000882679362017168,
"loss": 0.0149,
"macro_f1": 1.0,
"num_tokens": 4451401.0,
"repeat_count": 1.0,
- "routers_loss": 0.005898742936551571,
+ "routers_loss": 0.005783245898783207,
"skip_count": 2.0,
"step": 2760,
"text_loss": 0.49187400937080383
@@ -26237,13 +26237,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0008824800826598778,
- "loss": 0.0129,
+ "loss": 0.0127,
"macro_f1": 0.3333333432674408,
"num_tokens": 4454537.0,
"repeat_count": 0.0,
- "routers_loss": 0.006758298724889755,
+ "routers_loss": 0.00656260596588254,
"skip_count": 0.0,
"step": 2762,
"text_loss": 0.6823583245277405
@@ -26256,13 +26256,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.0546875,
"learning_rate": 0.0008822806567354983,
- "loss": 0.0109,
+ "loss": 0.0111,
"macro_f1": 0.6666666865348816,
"num_tokens": 4457706.0,
"repeat_count": 1.0,
- "routers_loss": 0.005730919074267149,
+ "routers_loss": 0.005298966076225042,
"skip_count": 0.0,
"step": 2764,
"text_loss": 0.554322361946106
@@ -26275,13 +26275,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051025390625,
+ "grad_norm": 0.046630859375,
"learning_rate": 0.0008820810843204501,
- "loss": 0.0098,
+ "loss": 0.0096,
"macro_f1": 0.3272727429866791,
"num_tokens": 4460710.0,
"repeat_count": 0.0,
- "routers_loss": 0.03390989825129509,
+ "routers_loss": 0.03164982795715332,
"skip_count": 1.0,
"step": 2766,
"text_loss": 0.1656961441040039
@@ -26294,13 +26294,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0849609375,
+ "grad_norm": 0.072265625,
"learning_rate": 0.0008818813654912095,
- "loss": 0.0165,
+ "loss": 0.0162,
"macro_f1": 0.3333333432674408,
"num_tokens": 4464001.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007058497285470366,
+ "routers_loss": 0.000715116853825748,
"skip_count": 0.0,
"step": 2768,
"text_loss": 0.5818144083023071
@@ -26313,13 +26313,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.058837890625,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0008816815003243093,
- "loss": 0.0136,
+ "loss": 0.0133,
"macro_f1": 0.3333333432674408,
"num_tokens": 4467364.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027468691114336252,
+ "routers_loss": 0.002851625671610236,
"skip_count": 0.0,
"step": 2770,
"text_loss": 0.6068631410598755
@@ -26332,13 +26332,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.033203125,
"learning_rate": 0.0008814814888963383,
"loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 4470681.0,
"repeat_count": 0.0,
- "routers_loss": 0.00443003186956048,
+ "routers_loss": 0.004729873035103083,
"skip_count": 1.0,
"step": 2772,
"text_loss": 0.5386646389961243
@@ -26351,13 +26351,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.04296875,
"learning_rate": 0.000881281331283941,
"loss": 0.0091,
"macro_f1": 0.6666666865348816,
"num_tokens": 4473734.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031219064258038998,
+ "routers_loss": 0.0031853127293288708,
"skip_count": 1.0,
"step": 2774,
"text_loss": 0.5695263147354126
@@ -26370,13 +26370,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03369140625,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0008810810275638182,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 4478404.0,
"repeat_count": 0.0,
- "routers_loss": 0.000846695271320641,
+ "routers_loss": 0.0008977465913631022,
"skip_count": 0.0,
"step": 2776,
"text_loss": 0.4750773310661316
@@ -26389,13 +26389,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0008808805778127269,
- "loss": 0.0075,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 4481287.0,
"repeat_count": 0.0,
- "routers_loss": 0.0074167875573039055,
+ "routers_loss": 0.00469845999032259,
"skip_count": 0.0,
"step": 2778,
"text_loss": 0.14078612625598907
@@ -26408,13 +26408,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.04296875,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.0008806799821074796,
- "loss": 0.0078,
+ "loss": 0.0079,
"macro_f1": 0.5492662787437439,
"num_tokens": 4483929.0,
"repeat_count": 0.0,
- "routers_loss": 0.018358726054430008,
+ "routers_loss": 0.01789761893451214,
"skip_count": 2.0,
"step": 2780,
"text_loss": 0.2167191207408905
@@ -26427,13 +26427,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0008804792405249451,
- "loss": 0.0124,
+ "loss": 0.0123,
"macro_f1": 0.3333333432674408,
"num_tokens": 4487468.0,
"repeat_count": 0.0,
- "routers_loss": 0.001094152103178203,
+ "routers_loss": 0.001018838956952095,
"skip_count": 0.0,
"step": 2782,
"text_loss": 0.5424665212631226
@@ -26446,13 +26446,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.000880278353142048,
- "loss": 0.0075,
+ "loss": 0.0077,
"macro_f1": 0.8200000524520874,
"num_tokens": 4490942.0,
"repeat_count": 1.0,
- "routers_loss": 0.03035641834139824,
+ "routers_loss": 0.03260354697704315,
"skip_count": 3.0,
"step": 2784,
"text_loss": 0.20994654297828674
@@ -26465,13 +26465,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05517578125,
+ "grad_norm": 0.05322265625,
"learning_rate": 0.0008800773200357683,
- "loss": 0.0123,
+ "loss": 0.0122,
"macro_f1": 0.3333333432674408,
"num_tokens": 4493986.0,
"repeat_count": 0.0,
- "routers_loss": 0.002394269686192274,
+ "routers_loss": 0.003019835101440549,
"skip_count": 0.0,
"step": 2786,
"text_loss": 0.5709528923034668
@@ -26484,13 +26484,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.034423828125,
"learning_rate": 0.0008798761412831429,
"loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 4498232.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028274122159928083,
+ "routers_loss": 0.00285192858427763,
"skip_count": 0.0,
"step": 2788,
"text_loss": 0.5103896260261536
@@ -26503,13 +26503,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.044921875,
"learning_rate": 0.0008796748169612634,
- "loss": 0.0088,
+ "loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 4501231.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012642849469557405,
+ "routers_loss": 0.0012469831854104996,
"skip_count": 0.0,
"step": 2790,
"text_loss": 0.43669697642326355
@@ -26522,13 +26522,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03662109375,
+ "grad_norm": 0.039794921875,
"learning_rate": 0.0008794733471472778,
"loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 4504208.0,
"repeat_count": 0.0,
- "routers_loss": 0.010966303758323193,
+ "routers_loss": 0.011512776836752892,
"skip_count": 1.0,
"step": 2792,
"text_loss": 0.2299770563840866
@@ -26541,13 +26541,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.03564453125,
"learning_rate": 0.0008792717319183899,
- "loss": 0.0064,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 4507013.0,
"repeat_count": 0.0,
- "routers_loss": 0.008194026537239552,
+ "routers_loss": 0.00834917277097702,
"skip_count": 0.0,
"step": 2794,
"text_loss": 0.2130603939294815
@@ -26560,13 +26560,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0283203125,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0008790699713518587,
- "loss": 0.008,
+ "loss": 0.0078,
"macro_f1": 0.6666666865348816,
"num_tokens": 4510286.0,
"repeat_count": 0.0,
- "routers_loss": 0.008828429505228996,
+ "routers_loss": 0.008616939187049866,
"skip_count": 2.0,
"step": 2796,
"text_loss": 0.4377101957798004
@@ -26579,13 +26579,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.02783203125,
"learning_rate": 0.0008788680655249994,
- "loss": 0.007,
+ "loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 4513762.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038230866193771362,
+ "routers_loss": 0.003408568911254406,
"skip_count": 0.0,
"step": 2798,
"text_loss": 0.435138463973999
@@ -26598,13 +26598,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0311279296875,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0008786660145151826,
- "loss": 0.009,
+ "loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 4516696.0,
"repeat_count": 1.0,
- "routers_loss": 0.0031088131945580244,
+ "routers_loss": 0.0029398901388049126,
"skip_count": 0.0,
"step": 2800,
"text_loss": 0.3195655047893524
@@ -26617,13 +26617,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.033203125,
"learning_rate": 0.0008784638183998348,
- "loss": 0.0083,
+ "loss": 0.0081,
"macro_f1": 0.3333333432674408,
"num_tokens": 4519760.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014194221002981067,
+ "routers_loss": 0.0013777425047010183,
"skip_count": 0.0,
"step": 2802,
"text_loss": 0.8129430413246155
@@ -26636,13 +26636,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.0008782614772564379,
- "loss": 0.0099,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 4522106.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031931858975440264,
+ "routers_loss": 0.0031694830395281315,
"skip_count": 0.0,
"step": 2804,
"text_loss": 0.18083660304546356
@@ -26655,13 +26655,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0008780589911625293,
- "loss": 0.0117,
+ "loss": 0.0114,
"macro_f1": 0.3333333432674408,
"num_tokens": 4525743.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021834284998476505,
+ "routers_loss": 0.002161208540201187,
"skip_count": 0.0,
"step": 2806,
"text_loss": 0.8228182792663574
@@ -26674,13 +26674,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0703125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0008778563601957021,
- "loss": 0.0098,
+ "loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 4529573.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035390176344662905,
+ "routers_loss": 0.0028444856870919466,
"skip_count": 1.0,
"step": 2808,
"text_loss": 0.3715563118457794
@@ -26693,13 +26693,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04296875,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0008776535844336049,
- "loss": 0.0095,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 4532452.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038604713045060635,
+ "routers_loss": 0.003807213855907321,
"skip_count": 0.0,
"step": 2810,
"text_loss": 0.6012523174285889
@@ -26712,13 +26712,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0008774506639539417,
- "loss": 0.0072,
+ "loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 4536077.0,
"repeat_count": 0.0,
- "routers_loss": 0.00669970503076911,
+ "routers_loss": 0.006698979996144772,
"skip_count": 0.0,
"step": 2812,
"text_loss": 0.27097949385643005
@@ -26731,13 +26731,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0008772475988344722,
- "loss": 0.0132,
+ "loss": 0.013,
"macro_f1": 0.6666666865348816,
"num_tokens": 4539057.0,
"repeat_count": 0.0,
- "routers_loss": 0.004594485275447369,
+ "routers_loss": 0.004849409218877554,
"skip_count": 1.0,
"step": 2814,
"text_loss": 1.026973843574524
@@ -26750,13 +26750,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0008770443891530109,
- "loss": 0.0116,
+ "loss": 0.0115,
"macro_f1": 0.5934640765190125,
"num_tokens": 4542253.0,
"repeat_count": 0.0,
- "routers_loss": 0.01891930215060711,
+ "routers_loss": 0.019148651510477066,
"skip_count": 3.0,
"step": 2816,
"text_loss": 0.2717585563659668
@@ -26769,13 +26769,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.054931640625,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0008768410349874286,
"loss": 0.0098,
"macro_f1": 0.6601307392120361,
"num_tokens": 4545047.0,
"repeat_count": 1.0,
- "routers_loss": 0.0247862096875906,
+ "routers_loss": 0.02231316640973091,
"skip_count": 2.0,
"step": 2818,
"text_loss": 0.274346262216568
@@ -26788,13 +26788,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0008766375364156508,
"loss": 0.0091,
"macro_f1": 0.6666666865348816,
"num_tokens": 4548371.0,
"repeat_count": 0.0,
- "routers_loss": 0.008566800504922867,
+ "routers_loss": 0.008014129474759102,
"skip_count": 2.0,
"step": 2820,
"text_loss": 0.22850871086120605
@@ -26807,13 +26807,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041748046875,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.0008764338935156586,
"loss": 0.0095,
"macro_f1": 0.3333333432674408,
"num_tokens": 4551276.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013546474510803819,
+ "routers_loss": 0.0014544493751600385,
"skip_count": 0.0,
"step": 2822,
"text_loss": 0.6308462023735046
@@ -26826,13 +26826,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.0390625,
"learning_rate": 0.000876230106365488,
- "loss": 0.0122,
+ "loss": 0.0123,
"macro_f1": 0.6666666865348816,
"num_tokens": 4554143.0,
"repeat_count": 0.0,
- "routers_loss": 0.009204468689858913,
+ "routers_loss": 0.00818584579974413,
"skip_count": 3.0,
"step": 2824,
"text_loss": 0.3484207093715668
@@ -26845,13 +26845,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03271484375,
+ "grad_norm": 0.0264892578125,
"learning_rate": 0.0008760261750432312,
- "loss": 0.0067,
+ "loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 4557256.0,
"repeat_count": 0.0,
- "routers_loss": 0.00787584763020277,
+ "routers_loss": 0.006275608204305172,
"skip_count": 3.0,
"step": 2826,
"text_loss": 0.1927330046892166
@@ -26864,13 +26864,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0008758220996270348,
- "loss": 0.0102,
+ "loss": 0.0103,
"macro_f1": 1.0,
"num_tokens": 4560202.0,
"repeat_count": 2.0,
- "routers_loss": 0.0057869357988238335,
+ "routers_loss": 0.0055974251590669155,
"skip_count": 2.0,
"step": 2828,
"text_loss": 0.7796496748924255
@@ -26883,13 +26883,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.046142578125,
"learning_rate": 0.0008756178801951007,
- "loss": 0.0128,
+ "loss": 0.0129,
"macro_f1": 0.3333333432674408,
"num_tokens": 4563508.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018274546600878239,
+ "routers_loss": 0.0019799957517534494,
"skip_count": 0.0,
"step": 2830,
"text_loss": 0.49633297324180603
@@ -26902,13 +26902,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.0458984375,
"learning_rate": 0.0008754135168256865,
- "loss": 0.0094,
+ "loss": 0.0095,
"macro_f1": 0.3333333432674408,
"num_tokens": 4566776.0,
"repeat_count": 0.0,
- "routers_loss": 0.004527154844254255,
+ "routers_loss": 0.004538947716355324,
"skip_count": 0.0,
"step": 2832,
"text_loss": 0.5346745252609253
@@ -26921,13 +26921,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0008752090095971044,
"loss": 0.0091,
"macro_f1": 0.3333333432674408,
"num_tokens": 4569787.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018263199599459767,
+ "routers_loss": 0.001663343166001141,
"skip_count": 0.0,
"step": 2834,
"text_loss": 0.5524004697799683
@@ -26940,13 +26940,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.000875004358587722,
- "loss": 0.0088,
+ "loss": 0.0087,
"macro_f1": 0.3333333432674408,
"num_tokens": 4572813.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022649941965937614,
+ "routers_loss": 0.0022988212294876575,
"skip_count": 0.0,
"step": 2836,
"text_loss": 0.4232870042324066
@@ -26959,13 +26959,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.038330078125,
"learning_rate": 0.000874799563875962,
"loss": 0.0083,
"macro_f1": 0.6666666865348816,
"num_tokens": 4575563.0,
"repeat_count": 0.0,
- "routers_loss": 0.00791149027645588,
+ "routers_loss": 0.007781553082168102,
"skip_count": 1.0,
"step": 2838,
"text_loss": 0.19239822030067444
@@ -26978,13 +26978,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.03515625,
"learning_rate": 0.0008745946255403021,
"loss": 0.0072,
"macro_f1": 0.5492662787437439,
"num_tokens": 4578117.0,
"repeat_count": 0.0,
- "routers_loss": 0.016813624650239944,
+ "routers_loss": 0.01872488670051098,
"skip_count": 2.0,
"step": 2840,
"text_loss": 0.2148810178041458
@@ -26999,11 +26999,11 @@
"f1_skip": 1.0,
"grad_norm": 0.04296875,
"learning_rate": 0.0008743895436592749,
- "loss": 0.0079,
+ "loss": 0.0078,
"macro_f1": 1.0,
"num_tokens": 4582330.0,
"repeat_count": 1.0,
- "routers_loss": 0.004429332446306944,
+ "routers_loss": 0.005634195636957884,
"skip_count": 1.0,
"step": 2842,
"text_loss": 0.4929640591144562
@@ -27016,13 +27016,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.048583984375,
"learning_rate": 0.0008741843183114685,
- "loss": 0.0084,
+ "loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4585765.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007147722644731402,
+ "routers_loss": 0.0008928569150157273,
"skip_count": 0.0,
"step": 2844,
"text_loss": 0.32702967524528503
@@ -27035,13 +27035,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.044189453125,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0008739789495755253,
- "loss": 0.0092,
+ "loss": 0.0094,
"macro_f1": 0.6666666865348816,
"num_tokens": 4589000.0,
"repeat_count": 0.0,
- "routers_loss": 0.015438012778759003,
+ "routers_loss": 0.014715569093823433,
"skip_count": 4.0,
"step": 2846,
"text_loss": 0.25125816464424133
@@ -27054,13 +27054,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.0008737734375301433,
- "loss": 0.0138,
+ "loss": 0.0135,
"macro_f1": 0.3333333432674408,
"num_tokens": 4592391.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015892626252025366,
+ "routers_loss": 0.0017551190685480833,
"skip_count": 0.0,
"step": 2848,
"text_loss": 0.6595172882080078
@@ -27073,13 +27073,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.027099609375,
"learning_rate": 0.0008735677822540749,
- "loss": 0.0086,
+ "loss": 0.0085,
"macro_f1": 0.3333333432674408,
"num_tokens": 4596662.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006934175617061555,
+ "routers_loss": 0.0006456313421949744,
"skip_count": 0.0,
"step": 2850,
"text_loss": 0.6290773153305054
@@ -27092,13 +27092,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.0008733619838261276,
"loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 4599682.0,
"repeat_count": 0.0,
- "routers_loss": 0.006811433006078005,
+ "routers_loss": 0.00765060493722558,
"skip_count": 2.0,
"step": 2852,
"text_loss": 0.3268161416053772
@@ -27111,13 +27111,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0008731560423251637,
- "loss": 0.0104,
+ "loss": 0.01,
"macro_f1": 1.0,
"num_tokens": 4603324.0,
"repeat_count": 1.0,
- "routers_loss": 0.012574959546327591,
+ "routers_loss": 0.01161442045122385,
"skip_count": 2.0,
"step": 2854,
"text_loss": 0.3029932975769043
@@ -27130,13 +27130,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 1.0,
"f1_skip": 0.888888955116272,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008729499578301005,
"loss": 0.0098,
"macro_f1": 0.9555556178092957,
"num_tokens": 4606975.0,
"repeat_count": 1.0,
- "routers_loss": 0.01913273334503174,
+ "routers_loss": 0.02055389992892742,
"skip_count": 5.0,
"step": 2856,
"text_loss": 0.6268532872200012
@@ -27149,13 +27149,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.05078125,
"learning_rate": 0.00087274373041991,
- "loss": 0.0082,
+ "loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 4609629.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012737065553665161,
+ "routers_loss": 0.0013911726418882608,
"skip_count": 0.0,
"step": 2858,
"text_loss": 0.534355640411377
@@ -27168,13 +27168,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0008725373601736188,
- "loss": 0.0079,
+ "loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 4612913.0,
"repeat_count": 2.0,
- "routers_loss": 0.009088932536542416,
+ "routers_loss": 0.01010701060295105,
"skip_count": 0.0,
"step": 2860,
"text_loss": 0.3391380310058594
@@ -27187,13 +27187,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0260009765625,
+ "grad_norm": 0.0255126953125,
"learning_rate": 0.0008723308471703085,
- "loss": 0.0078,
+ "loss": 0.008,
"macro_f1": 0.6666666865348816,
"num_tokens": 4616718.0,
"repeat_count": 0.0,
- "routers_loss": 0.006364458240568638,
+ "routers_loss": 0.005969462916254997,
"skip_count": 1.0,
"step": 2862,
"text_loss": 0.47250816226005554
@@ -27206,13 +27206,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047607421875,
+ "grad_norm": 0.046630859375,
"learning_rate": 0.0008721241914891152,
- "loss": 0.0084,
+ "loss": 0.0083,
"macro_f1": 0.3333333432674408,
"num_tokens": 4619680.0,
"repeat_count": 0.0,
- "routers_loss": 0.002686808817088604,
+ "routers_loss": 0.0027780034579336643,
"skip_count": 0.0,
"step": 2864,
"text_loss": 0.3249278664588928
@@ -27225,13 +27225,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0008719173932092295,
- "loss": 0.0047,
+ "loss": 0.0044,
"macro_f1": 0.3333333432674408,
"num_tokens": 4622700.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018892486114054918,
+ "routers_loss": 0.0015912104863673449,
"skip_count": 0.0,
"step": 2866,
"text_loss": 0.7789985537528992
@@ -27244,13 +27244,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.049072265625,
+ "grad_norm": 0.05126953125,
"learning_rate": 0.0008717104524098973,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 4626637.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035258810967206955,
+ "routers_loss": 0.0036539011634886265,
"skip_count": 0.0,
"step": 2868,
"text_loss": 0.619088351726532
@@ -27263,13 +27263,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.10400390625,
"learning_rate": 0.0008715033691704187,
- "loss": 0.0121,
+ "loss": 0.0118,
"macro_f1": 0.6666666865348816,
"num_tokens": 4629863.0,
"repeat_count": 0.0,
- "routers_loss": 0.007305602077394724,
+ "routers_loss": 0.008402476087212563,
"skip_count": 1.0,
"step": 2870,
"text_loss": 0.5550018548965454
@@ -27282,13 +27282,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0008712961435701479,
- "loss": 0.0162,
+ "loss": 0.0161,
"macro_f1": 0.6666666865348816,
"num_tokens": 4632657.0,
"repeat_count": 0.0,
- "routers_loss": 0.012898211367428303,
+ "routers_loss": 0.01400839351117611,
"skip_count": 1.0,
"step": 2872,
"text_loss": 0.17368625104427338
@@ -27301,13 +27301,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008710887756884947,
- "loss": 0.0088,
+ "loss": 0.0086,
"macro_f1": 0.3333333432674408,
"num_tokens": 4635885.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013437134912237525,
+ "routers_loss": 0.0014573842054232955,
"skip_count": 0.0,
"step": 2874,
"text_loss": 0.5138643383979797
@@ -27320,13 +27320,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0008708812656049225,
- "loss": 0.0091,
+ "loss": 0.009,
"macro_f1": 0.6666666865348816,
"num_tokens": 4639341.0,
"repeat_count": 0.0,
- "routers_loss": 0.002090727211907506,
+ "routers_loss": 0.002810224425047636,
"skip_count": 1.0,
"step": 2876,
"text_loss": 0.70310378074646
@@ -27341,11 +27341,11 @@
"f1_skip": 0.8571428656578064,
"grad_norm": 0.03564453125,
"learning_rate": 0.0008706736133989497,
- "loss": 0.0107,
+ "loss": 0.0105,
"macro_f1": 0.9449735879898071,
"num_tokens": 4642163.0,
"repeat_count": 2.0,
- "routers_loss": 0.030176319181919098,
+ "routers_loss": 0.029783209785819054,
"skip_count": 4.0,
"step": 2878,
"text_loss": 0.26898008584976196
@@ -27358,13 +27358,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.04150390625,
"learning_rate": 0.0008704658191501491,
- "loss": 0.0091,
+ "loss": 0.0095,
"macro_f1": 0.3333333432674408,
"num_tokens": 4645858.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009633690933696926,
+ "routers_loss": 0.0009193966398015618,
"skip_count": 0.0,
"step": 2880,
"text_loss": 0.6047570705413818
@@ -27377,13 +27377,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.060302734375,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0008702578829381475,
"loss": 0.0131,
"macro_f1": 0.8814815282821655,
"num_tokens": 4649237.0,
"repeat_count": 2.0,
- "routers_loss": 0.0568491593003273,
+ "routers_loss": 0.05698608607053757,
"skip_count": 4.0,
"step": 2882,
"text_loss": 0.10695219784975052
@@ -27396,13 +27396,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0306396484375,
+ "grad_norm": 0.0311279296875,
"learning_rate": 0.0008700498048426269,
- "loss": 0.0082,
+ "loss": 0.0083,
"macro_f1": 0.3333333432674408,
"num_tokens": 4652362.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012279651127755642,
+ "routers_loss": 0.0011786938412114978,
"skip_count": 0.0,
"step": 2884,
"text_loss": 0.4442957937717438
@@ -27415,13 +27415,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.046142578125,
"learning_rate": 0.0008698415849433229,
- "loss": 0.0097,
+ "loss": 0.0092,
"macro_f1": 0.5492662787437439,
"num_tokens": 4655616.0,
"repeat_count": 2.0,
- "routers_loss": 0.02166076935827732,
+ "routers_loss": 0.02142646163702011,
"skip_count": 0.0,
"step": 2886,
"text_loss": 0.5820964574813843
@@ -27434,13 +27434,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0008696332233200262,
- "loss": 0.012,
+ "loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 4659294.0,
"repeat_count": 0.0,
- "routers_loss": 0.003944257274270058,
+ "routers_loss": 0.004038636106997728,
"skip_count": 0.0,
"step": 2888,
"text_loss": 0.11847645789384842
@@ -27453,13 +27453,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.0478515625,
"learning_rate": 0.0008694247200525806,
- "loss": 0.0092,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 4662512.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013393335975706577,
+ "routers_loss": 0.0013256469974294305,
"skip_count": 0.0,
"step": 2890,
"text_loss": 0.4873582720756531
@@ -27472,13 +27472,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0008692160752208856,
- "loss": 0.0128,
+ "loss": 0.0129,
"macro_f1": 0.3272727429866791,
"num_tokens": 4666190.0,
"repeat_count": 0.0,
- "routers_loss": 0.0443510003387928,
+ "routers_loss": 0.04477972164750099,
"skip_count": 1.0,
"step": 2892,
"text_loss": 0.44243401288986206
@@ -27491,13 +27491,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.083984375,
+ "grad_norm": 0.09521484375,
"learning_rate": 0.0008690072889048941,
- "loss": 0.0125,
+ "loss": 0.0127,
"macro_f1": 1.0,
"num_tokens": 4668884.0,
"repeat_count": 1.0,
- "routers_loss": 0.0047337980940938,
+ "routers_loss": 0.004407547414302826,
"skip_count": 2.0,
"step": 2894,
"text_loss": 0.6847127079963684
@@ -27510,13 +27510,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.041015625,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0008687983611846133,
- "loss": 0.0082,
+ "loss": 0.008,
"macro_f1": 0.6666666865348816,
"num_tokens": 4672093.0,
"repeat_count": 0.0,
- "routers_loss": 0.0055244253017008305,
+ "routers_loss": 0.005245382897555828,
"skip_count": 1.0,
"step": 2896,
"text_loss": 0.25583332777023315
@@ -27529,13 +27529,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.0458984375,
"learning_rate": 0.0008685892921401049,
- "loss": 0.011,
+ "loss": 0.0108,
"macro_f1": 0.3333333432674408,
"num_tokens": 4674917.0,
"repeat_count": 0.0,
- "routers_loss": 0.001250729663297534,
+ "routers_loss": 0.0010470855049788952,
"skip_count": 0.0,
"step": 2898,
"text_loss": 0.41998377442359924
@@ -27548,13 +27548,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0008683800818514844,
- "loss": 0.0061,
+ "loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 4677739.0,
"repeat_count": 0.0,
- "routers_loss": 0.00974183902144432,
+ "routers_loss": 0.009026622399687767,
"skip_count": 2.0,
"step": 2900,
"text_loss": 0.303053081035614
@@ -27567,13 +27567,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.09619140625,
"learning_rate": 0.0008681707303989215,
- "loss": 0.0111,
+ "loss": 0.0108,
"macro_f1": 0.3333333432674408,
"num_tokens": 4680721.0,
"repeat_count": 0.0,
- "routers_loss": 0.004882345907390118,
+ "routers_loss": 0.004500916693359613,
"skip_count": 0.0,
"step": 2902,
"text_loss": 0.5573288798332214
@@ -27586,13 +27586,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0008679612378626404,
"loss": 0.0098,
"macro_f1": 0.6666666865348816,
"num_tokens": 4683339.0,
"repeat_count": 0.0,
- "routers_loss": 0.00568242697045207,
+ "routers_loss": 0.005047840531915426,
"skip_count": 1.0,
"step": 2904,
"text_loss": 0.321353554725647
@@ -27605,13 +27605,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0306396484375,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0008677516043229187,
- "loss": 0.0082,
+ "loss": 0.0083,
"macro_f1": 0.3272727429866791,
"num_tokens": 4686453.0,
"repeat_count": 0.0,
- "routers_loss": 0.010831202380359173,
+ "routers_loss": 0.010256914421916008,
"skip_count": 1.0,
"step": 2906,
"text_loss": 0.4300784468650818
@@ -27624,13 +27624,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.05615234375,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.0008675418298600883,
- "loss": 0.0087,
+ "loss": 0.0083,
"macro_f1": 0.6666666865348816,
"num_tokens": 4689645.0,
"repeat_count": 1.0,
- "routers_loss": 0.00235295994207263,
+ "routers_loss": 0.0022669637110084295,
"skip_count": 0.0,
"step": 2908,
"text_loss": 0.5064885020256042
@@ -27643,13 +27643,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0008673319145545358,
"loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 4692320.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011642680037766695,
+ "routers_loss": 0.0011188550852239132,
"skip_count": 0.0,
"step": 2910,
"text_loss": 0.7114819884300232
@@ -27662,13 +27662,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0008671218584867003,
- "loss": 0.0104,
+ "loss": 0.0102,
"macro_f1": 0.6666666865348816,
"num_tokens": 4695116.0,
"repeat_count": 0.0,
- "routers_loss": 0.00278888875618577,
+ "routers_loss": 0.002966561820358038,
"skip_count": 2.0,
"step": 2912,
"text_loss": 0.5662392973899841
@@ -27681,13 +27681,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.049560546875,
+ "grad_norm": 0.047607421875,
"learning_rate": 0.0008669116617370762,
- "loss": 0.008,
+ "loss": 0.0081,
"macro_f1": 0.3333333432674408,
"num_tokens": 4698040.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014630162622779608,
+ "routers_loss": 0.0012894890969619155,
"skip_count": 0.0,
"step": 2914,
"text_loss": 0.718977689743042
@@ -27700,13 +27700,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0008667013243862111,
- "loss": 0.0159,
+ "loss": 0.0162,
"macro_f1": 0.3333333432674408,
"num_tokens": 4700963.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011393720051273704,
+ "routers_loss": 0.0007232456118799746,
"skip_count": 0.0,
"step": 2916,
"text_loss": 0.3447718024253845
@@ -27719,13 +27719,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02978515625,
+ "grad_norm": 0.0289306640625,
"learning_rate": 0.000866490846514707,
- "loss": 0.0072,
+ "loss": 0.0075,
"macro_f1": 0.3272727429866791,
"num_tokens": 4704471.0,
"repeat_count": 1.0,
- "routers_loss": 0.014218449592590332,
+ "routers_loss": 0.015166680328547955,
"skip_count": 0.0,
"step": 2918,
"text_loss": 0.454946368932724
@@ -27738,13 +27738,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.052978515625,
+ "grad_norm": 0.04736328125,
"learning_rate": 0.000866280228203219,
"loss": 0.0073,
"macro_f1": 1.0,
"num_tokens": 4707238.0,
"repeat_count": 1.0,
- "routers_loss": 0.005367610137909651,
+ "routers_loss": 0.0061312485486269,
"skip_count": 1.0,
"step": 2920,
"text_loss": 0.721788227558136
@@ -27757,13 +27757,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048828125,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.0008660694695324564,
- "loss": 0.0124,
+ "loss": 0.0125,
"macro_f1": 0.3333333432674408,
"num_tokens": 4711323.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020303199999034405,
+ "routers_loss": 0.00169933564029634,
"skip_count": 0.0,
"step": 2922,
"text_loss": 0.7562121748924255
@@ -27776,13 +27776,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06201171875,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0008658585705831829,
- "loss": 0.0123,
+ "loss": 0.0128,
"macro_f1": 0.3333333432674408,
"num_tokens": 4714417.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022230520844459534,
+ "routers_loss": 0.0022731393110007048,
"skip_count": 0.0,
"step": 2924,
"text_loss": 0.5726147890090942
@@ -27795,13 +27795,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.06787109375,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0008656475314362148,
- "loss": 0.0133,
+ "loss": 0.0131,
"macro_f1": 0.8817967176437378,
"num_tokens": 4717445.0,
"repeat_count": 2.0,
- "routers_loss": 0.06414645165205002,
+ "routers_loss": 0.06477782875299454,
"skip_count": 3.0,
"step": 2926,
"text_loss": 0.4505867660045624
@@ -27814,13 +27814,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 1.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.0625,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0008654363521724229,
- "loss": 0.0128,
+ "loss": 0.0129,
"macro_f1": 0.9449735879898071,
"num_tokens": 4722253.0,
"repeat_count": 2.0,
- "routers_loss": 0.022727061063051224,
+ "routers_loss": 0.027405790984630585,
"skip_count": 4.0,
"step": 2928,
"text_loss": 0.24767601490020752
@@ -27833,13 +27833,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.0537109375,
"learning_rate": 0.0008652250328727315,
- "loss": 0.0114,
+ "loss": 0.0112,
"macro_f1": 0.6666666865348816,
"num_tokens": 4725465.0,
"repeat_count": 0.0,
- "routers_loss": 0.006181784905493259,
+ "routers_loss": 0.006544729229062796,
"skip_count": 2.0,
"step": 2930,
"text_loss": 0.4478724002838135
@@ -27852,13 +27852,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.0517578125,
"learning_rate": 0.0008650135736181184,
- "loss": 0.0133,
+ "loss": 0.0134,
"macro_f1": 0.6666666865348816,
"num_tokens": 4729213.0,
"repeat_count": 1.0,
- "routers_loss": 0.005527070257812738,
+ "routers_loss": 0.0055119614116847515,
"skip_count": 0.0,
"step": 2932,
"text_loss": 0.6749323010444641
@@ -27871,13 +27871,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05517578125,
+ "grad_norm": 0.045166015625,
"learning_rate": 0.0008648019744896154,
- "loss": 0.0102,
+ "loss": 0.0101,
"macro_f1": 0.3333333432674408,
"num_tokens": 4732280.0,
"repeat_count": 0.0,
- "routers_loss": 0.008868738077580929,
+ "routers_loss": 0.008374541997909546,
"skip_count": 0.0,
"step": 2934,
"text_loss": 0.4647359251976013
@@ -27890,13 +27890,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.057373046875,
+ "grad_norm": 0.06201171875,
"learning_rate": 0.0008645902355683077,
- "loss": 0.0089,
+ "loss": 0.0091,
"macro_f1": 0.6595745086669922,
"num_tokens": 4736244.0,
"repeat_count": 1.0,
- "routers_loss": 0.07285884022712708,
+ "routers_loss": 0.068686343729496,
"skip_count": 4.0,
"step": 2936,
"text_loss": 0.5356017351150513
@@ -27909,13 +27909,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.042236328125,
"learning_rate": 0.0008643783569353339,
- "loss": 0.0072,
+ "loss": 0.007,
"macro_f1": 0.6666666865348816,
"num_tokens": 4739810.0,
"repeat_count": 2.0,
- "routers_loss": 0.019306030124425888,
+ "routers_loss": 0.017954571172595024,
"skip_count": 0.0,
"step": 2938,
"text_loss": 0.3145926296710968
@@ -27928,13 +27928,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.054443359375,
"learning_rate": 0.0008641663386718863,
- "loss": 0.0084,
+ "loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 4742720.0,
"repeat_count": 0.0,
- "routers_loss": 0.00626454409211874,
+ "routers_loss": 0.006261351052671671,
"skip_count": 1.0,
"step": 2940,
"text_loss": 0.3200613856315613
@@ -27949,11 +27949,11 @@
"f1_skip": 1.0,
"grad_norm": 0.04150390625,
"learning_rate": 0.0008639541808592109,
- "loss": 0.0091,
+ "loss": 0.0093,
"macro_f1": 1.0,
"num_tokens": 4745870.0,
"repeat_count": 1.0,
- "routers_loss": 0.0019172134343534708,
+ "routers_loss": 0.0025341357104480267,
"skip_count": 1.0,
"step": 2942,
"text_loss": 0.5020416378974915
@@ -27968,11 +27968,11 @@
"f1_skip": 1.0,
"grad_norm": 0.025634765625,
"learning_rate": 0.0008637418835786067,
- "loss": 0.0095,
+ "loss": 0.0094,
"macro_f1": 0.6666666865348816,
"num_tokens": 4748943.0,
"repeat_count": 0.0,
- "routers_loss": 0.009745351038873196,
+ "routers_loss": 0.008970048278570175,
"skip_count": 2.0,
"step": 2944,
"text_loss": 0.14517110586166382
@@ -27985,13 +27985,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.043701171875,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.0008635294469114265,
- "loss": 0.011,
+ "loss": 0.0112,
"macro_f1": 0.3333333432674408,
"num_tokens": 4751360.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020624736789613962,
+ "routers_loss": 0.002133632078766823,
"skip_count": 0.0,
"step": 2946,
"text_loss": 0.5367856025695801
@@ -28004,13 +28004,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.08837890625,
"learning_rate": 0.0008633168709390766,
- "loss": 0.0118,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 4754403.0,
"repeat_count": 0.0,
- "routers_loss": 0.001082106726244092,
+ "routers_loss": 0.0011866620043292642,
"skip_count": 0.0,
"step": 2948,
"text_loss": 0.38302522897720337
@@ -28023,13 +28023,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.037109375,
"learning_rate": 0.0008631041557430163,
- "loss": 0.0061,
+ "loss": 0.0058,
"macro_f1": 0.6666666865348816,
"num_tokens": 4757867.0,
"repeat_count": 2.0,
- "routers_loss": 0.0026527612935751677,
+ "routers_loss": 0.0026854004245251417,
"skip_count": 0.0,
"step": 2950,
"text_loss": 0.43433454632759094
@@ -28042,13 +28042,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.05859375,
"learning_rate": 0.0008628913014047585,
"loss": 0.0102,
"macro_f1": 0.3333333432674408,
"num_tokens": 4761171.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027245471719652414,
+ "routers_loss": 0.002433479530736804,
"skip_count": 0.0,
"step": 2952,
"text_loss": 0.4725971519947052
@@ -28061,13 +28061,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0286865234375,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0008626783080058696,
- "loss": 0.0065,
+ "loss": 0.0066,
"macro_f1": 0.3272727429866791,
"num_tokens": 4764752.0,
"repeat_count": 1.0,
- "routers_loss": 0.01764744706451893,
+ "routers_loss": 0.017182493582367897,
"skip_count": 0.0,
"step": 2954,
"text_loss": 0.460641473531723
@@ -28080,13 +28080,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0859375,
+ "grad_norm": 0.12353515625,
"learning_rate": 0.0008624651756279687,
- "loss": 0.0196,
+ "loss": 0.0198,
"macro_f1": 0.3333333432674408,
"num_tokens": 4767453.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019560824148356915,
+ "routers_loss": 0.0018134774873033166,
"skip_count": 0.0,
"step": 2956,
"text_loss": 0.4091459810733795
@@ -28099,13 +28099,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
- "grad_norm": 0.051025390625,
+ "grad_norm": 0.053466796875,
"learning_rate": 0.000862251904352729,
"loss": 0.0108,
"macro_f1": 0.9259259104728699,
"num_tokens": 4771110.0,
"repeat_count": 3.0,
- "routers_loss": 0.03031078353524208,
+ "routers_loss": 0.0365753099322319,
"skip_count": 3.0,
"step": 2958,
"text_loss": 0.22408585250377655
@@ -28118,13 +28118,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.000862038494261876,
"loss": 0.0109,
"macro_f1": 0.3272727429866791,
"num_tokens": 4774464.0,
"repeat_count": 0.0,
- "routers_loss": 0.024790454655885696,
+ "routers_loss": 0.024343067780137062,
"skip_count": 1.0,
"step": 2960,
"text_loss": 0.16483014822006226
@@ -28137,13 +28137,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052490234375,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0008618249454371891,
- "loss": 0.0099,
+ "loss": 0.01,
"macro_f1": 0.3333333432674408,
"num_tokens": 4777894.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008704765350557864,
+ "routers_loss": 0.0008310087723657489,
"skip_count": 0.0,
"step": 2962,
"text_loss": 0.5573428869247437
@@ -28156,13 +28156,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0008616112579605006,
- "loss": 0.0116,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 4781116.0,
"repeat_count": 0.0,
- "routers_loss": 0.0066874073818326,
+ "routers_loss": 0.0065494864247739315,
"skip_count": 0.0,
"step": 2964,
"text_loss": 0.18816794455051422
@@ -28175,13 +28175,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.0008613974319136957,
- "loss": 0.0091,
+ "loss": 0.009,
"macro_f1": 0.3333333432674408,
"num_tokens": 4784886.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021798228845000267,
+ "routers_loss": 0.0019726944155991077,
"skip_count": 0.0,
"step": 2966,
"text_loss": 0.5097305774688721
@@ -28194,13 +28194,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.076171875,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0008611834673787134,
"loss": 0.0118,
"macro_f1": 0.3333333432674408,
"num_tokens": 4787563.0,
"repeat_count": 0.0,
- "routers_loss": 0.0063707553781569,
+ "routers_loss": 0.006327496841549873,
"skip_count": 0.0,
"step": 2968,
"text_loss": 0.6953814029693604
@@ -28213,13 +28213,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.5,
"f1_skip": 1.0,
- "grad_norm": 0.0595703125,
+ "grad_norm": 0.056884765625,
"learning_rate": 0.0008609693644375449,
- "loss": 0.0088,
+ "loss": 0.0086,
"macro_f1": 0.8200000524520874,
"num_tokens": 4790421.0,
"repeat_count": 3.0,
- "routers_loss": 0.044509731233119965,
+ "routers_loss": 0.042896661907434464,
"skip_count": 1.0,
"step": 2970,
"text_loss": 0.2573051154613495
@@ -28227,18 +28227,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 1.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 13.953331376577633,
- "f1_execute": 0.9795917868614197,
+ "f1_execute": 1.0,
"f1_repeat": 1.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.1640625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.14453125,
"learning_rate": 0.000860755123172235,
- "loss": 0.01,
- "macro_f1": 0.8820862174034119,
+ "loss": 0.0096,
+ "macro_f1": 1.0,
"num_tokens": 4793786.0,
"repeat_count": 2.0,
- "routers_loss": 0.01667599380016327,
+ "routers_loss": 0.013228793628513813,
"skip_count": 1.0,
"step": 2972,
"text_loss": 0.46614497900009155
@@ -28251,13 +28251,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.0296630859375,
"learning_rate": 0.0008605407436648815,
- "loss": 0.0069,
+ "loss": 0.007,
"macro_f1": 0.6666666865348816,
"num_tokens": 4796864.0,
"repeat_count": 0.0,
- "routers_loss": 0.008433761075139046,
+ "routers_loss": 0.007294759154319763,
"skip_count": 2.0,
"step": 2974,
"text_loss": 0.21555091440677643
@@ -28270,13 +28270,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.057861328125,
"learning_rate": 0.0008603262259976348,
- "loss": 0.0131,
+ "loss": 0.0129,
"macro_f1": 1.0,
"num_tokens": 4800080.0,
"repeat_count": 1.0,
- "routers_loss": 0.002439796691760421,
+ "routers_loss": 0.0024024227168411016,
"skip_count": 5.0,
"step": 2976,
"text_loss": 0.7855485081672668
@@ -28289,13 +28289,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05126953125,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0008601115702526987,
- "loss": 0.0112,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 4802899.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015027766348794103,
+ "routers_loss": 0.001433031284250319,
"skip_count": 0.0,
"step": 2978,
"text_loss": 0.6777765154838562
@@ -28308,13 +28308,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06103515625,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.0008598967765123293,
- "loss": 0.0091,
+ "loss": 0.0088,
"macro_f1": 0.3333333432674408,
"num_tokens": 4805835.0,
"repeat_count": 0.0,
- "routers_loss": 0.003235677955672145,
+ "routers_loss": 0.003073975909501314,
"skip_count": 0.0,
"step": 2980,
"text_loss": 0.5926910638809204
@@ -28322,18 +28322,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.5,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 14.0,
- "f1_execute": 0.9090908765792847,
- "f1_repeat": 0.6666666865348816,
+ "f1_execute": 0.9333333373069763,
+ "f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.052734375,
+ "grad_norm": 0.05322265625,
"learning_rate": 0.0008596818448588364,
- "loss": 0.0141,
- "macro_f1": 0.7474747896194458,
+ "loss": 0.0139,
+ "macro_f1": 0.8666667342185974,
"num_tokens": 4809028.0,
"repeat_count": 1.0,
- "routers_loss": 0.063179150223732,
+ "routers_loss": 0.06438573449850082,
"skip_count": 6.0,
"step": 2982,
"text_loss": 0.23975612223148346
@@ -28346,13 +28346,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0299072265625,
+ "grad_norm": 0.0302734375,
"learning_rate": 0.0008594667753745821,
- "loss": 0.0055,
+ "loss": 0.0054,
"macro_f1": 0.3272727429866791,
"num_tokens": 4812831.0,
"repeat_count": 0.0,
- "routers_loss": 0.015444152988493443,
+ "routers_loss": 0.014817612245678902,
"skip_count": 1.0,
"step": 2984,
"text_loss": 0.17292268574237823
@@ -28365,13 +28365,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.060546875,
+ "grad_norm": 0.07421875,
"learning_rate": 0.0008592515681419813,
- "loss": 0.0079,
+ "loss": 0.0078,
"macro_f1": 0.5492662787437439,
"num_tokens": 4816005.0,
"repeat_count": 2.0,
- "routers_loss": 0.02485196851193905,
+ "routers_loss": 0.025407327339053154,
"skip_count": 0.0,
"step": 2986,
"text_loss": 0.6403061151504517
@@ -28384,13 +28384,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0008590362232435018,
- "loss": 0.0102,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 4818901.0,
"repeat_count": 0.0,
- "routers_loss": 0.006175600457936525,
+ "routers_loss": 0.006826757453382015,
"skip_count": 0.0,
"step": 2988,
"text_loss": 0.2572069466114044
@@ -28403,13 +28403,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041748046875,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0008588207407616644,
- "loss": 0.0085,
+ "loss": 0.0086,
"macro_f1": 0.3333333432674408,
"num_tokens": 4823120.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008576468680985272,
+ "routers_loss": 0.0009054148104041815,
"skip_count": 0.0,
"step": 2990,
"text_loss": 0.4827076196670532
@@ -28422,13 +28422,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02392578125,
+ "grad_norm": 0.0247802734375,
"learning_rate": 0.0008586051207790422,
- "loss": 0.0059,
+ "loss": 0.0055,
"macro_f1": 0.3333333432674408,
"num_tokens": 4825774.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011548360344022512,
+ "routers_loss": 0.0012294676853343844,
"skip_count": 0.0,
"step": 2992,
"text_loss": 0.40157821774482727
@@ -28441,13 +28441,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.056396484375,
+ "grad_norm": 0.052734375,
"learning_rate": 0.0008583893633782612,
- "loss": 0.0085,
+ "loss": 0.0084,
"macro_f1": 0.5492662787437439,
"num_tokens": 4828841.0,
"repeat_count": 0.0,
- "routers_loss": 0.01307896338403225,
+ "routers_loss": 0.011474622413516045,
"skip_count": 2.0,
"step": 2994,
"text_loss": 0.14842072129249573
@@ -28460,13 +28460,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0615234375,
+ "grad_norm": 0.058837890625,
"learning_rate": 0.0008581734686419999,
"loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 4831458.0,
"repeat_count": 0.0,
- "routers_loss": 0.009716883301734924,
+ "routers_loss": 0.009154081344604492,
"skip_count": 2.0,
"step": 2996,
"text_loss": 0.365400105714798
@@ -28479,13 +28479,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.00085795743665299,
"loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4834609.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026114562060683966,
+ "routers_loss": 0.002899336162954569,
"skip_count": 0.0,
"step": 2998,
"text_loss": 0.5574684143066406
@@ -28498,13 +28498,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052001953125,
+ "grad_norm": 0.0517578125,
"learning_rate": 0.0008577412674940152,
"loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4838324.0,
"repeat_count": 0.0,
- "routers_loss": 0.003787368768826127,
+ "routers_loss": 0.0034664268605411053,
"skip_count": 0.0,
"step": 3000,
"text_loss": 0.6752855777740479
diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin
index deeea733277b4031781a5b299881dd8e675e7606..a3d3ae372faf14539639f54454aa52b6ee730c4a 100644
--- a/checkpoint-3000/training_args.bin
+++ b/checkpoint-3000/training_args.bin
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:0b3f5975f57762b552c7ee29776bf32a4dbb125781a0658488d3884fb25c5296
+oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8
size 5880
diff --git a/checkpoint-4000/model-00002-of-00002.safetensors b/checkpoint-4000/model-00002-of-00002.safetensors
index 538ae52a52776454580f9817a68b94d1d18e395f..3d224309e0a868d86d141bce673aeb8ef8112f3d 100644
--- a/checkpoint-4000/model-00002-of-00002.safetensors
+++ b/checkpoint-4000/model-00002-of-00002.safetensors
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:3f665e64ef533a14606501a6daab770caac72a570b75afcb29c6765710e6f735
+oid sha256:1cc14c8bc6b81f0324d9ed1ed6b83526c997381eecbc191424a4bcf38ef3bbc2
size 1481790520
diff --git a/checkpoint-4000/optimizer.pt b/checkpoint-4000/optimizer.pt
index 4a0a5e373e2f7ccaf38f047fc6ae528ff7d3d9a8..40f996bb17bb179c7ce09890432de5718894f71f 100644
--- a/checkpoint-4000/optimizer.pt
+++ b/checkpoint-4000/optimizer.pt
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:32422466c15a2f6655c13d290e30cb4d1f0c57cb2c65fc6c873e0977bd171463
+oid sha256:938be4a6076a4306441bcc5f97aef5747a2cae3994dc47c31ec908bf5cfe80fc
size 44191162
diff --git a/checkpoint-4000/trainer_state.json b/checkpoint-4000/trainer_state.json
index 556d13e8db3354de1db66eb08234bff04bd4ce19..6780c28acf81cf1fb2f2ab9f6c72f85ed4db76c4 100644
--- a/checkpoint-4000/trainer_state.json
+++ b/checkpoint-4000/trainer_state.json
@@ -12,18 +12,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 31.0,
+ "avg_layers": 25.0,
"epoch": 0.009392427355444672,
- "f1_execute": 0.4864864945411682,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.40625,
+ "grad_norm": 2.25,
"learning_rate": 2e-06,
- "loss": 0.5484,
- "macro_f1": 0.1621621698141098,
+ "loss": 0.4974,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 3175.0,
"repeat_count": 0.0,
- "routers_loss": 0.503563642501831,
+ "routers_loss": 0.4339469373226166,
"skip_count": 0.0,
"step": 2,
"text_loss": 0.3330848515033722
@@ -31,18 +31,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 23.0,
"epoch": 0.018784854710889344,
- "f1_execute": 0.4864864945411682,
+ "f1_execute": 0.7272726893424988,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.9140625,
+ "grad_norm": 1.8359375,
"learning_rate": 6e-06,
- "loss": 0.536,
- "macro_f1": 0.1621621698141098,
+ "loss": 0.4988,
+ "macro_f1": 0.24242423474788666,
"num_tokens": 5816.0,
"repeat_count": 0.0,
- "routers_loss": 0.4589468538761139,
+ "routers_loss": 0.4511934816837311,
"skip_count": 1.0,
"step": 4,
"text_loss": 0.4571273922920227
@@ -50,37 +50,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 32.0,
+ "avg_layers": 28.0,
"epoch": 0.02817728206633402,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.6666666865348816,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.375,
+ "grad_norm": 2.234375,
"learning_rate": 1e-05,
- "loss": 0.5469,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.5113,
+ "macro_f1": 0.222222238779068,
"num_tokens": 9739.0,
"repeat_count": 0.0,
- "routers_loss": 0.5736724138259888,
+ "routers_loss": 0.49306994676589966,
"skip_count": 0.0,
"step": 6,
"text_loss": 0.41060560941696167
},
{
- "acc_repeat": 1.0,
- "acc_skip": 0.5,
- "avg_layers": 33.0,
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 0.03756970942177869,
- "f1_execute": 0.47058823704719543,
- "f1_repeat": 0.1538461595773697,
- "f1_skip": 0.222222238779068,
- "grad_norm": 1.8515625,
+ "f1_execute": 0.5641025900840759,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.7265625,
"learning_rate": 1.4e-05,
- "loss": 0.5291,
- "macro_f1": 0.28221890330314636,
+ "loss": 0.4766,
+ "macro_f1": 0.18803420662879944,
"num_tokens": 12869.0,
"repeat_count": 1.0,
- "routers_loss": 0.49970296025276184,
+ "routers_loss": 0.48872503638267517,
"skip_count": 2.0,
"step": 8,
"text_loss": 0.36678561568260193
@@ -88,37 +88,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 27.0,
"epoch": 0.046962136777223364,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.953125,
+ "grad_norm": 1.78125,
"learning_rate": 1.8e-05,
- "loss": 0.5316,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4806,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 15845.0,
"repeat_count": 0.0,
- "routers_loss": 0.5153562426567078,
+ "routers_loss": 0.45077216625213623,
"skip_count": 0.0,
"step": 10,
"text_loss": 0.5597779154777527
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 0.5,
"acc_skip": 0.3333333432674408,
- "avg_layers": 34.0,
+ "avg_layers": 26.0,
"epoch": 0.05635456413266804,
- "f1_execute": 0.5714285373687744,
- "f1_repeat": 0.0,
- "f1_skip": 0.25,
- "grad_norm": 1.6328125,
+ "f1_execute": 0.7179487347602844,
+ "f1_repeat": 0.2857142984867096,
+ "f1_skip": 0.20000000298023224,
+ "grad_norm": 1.5390625,
"learning_rate": 2.2e-05,
- "loss": 0.5051,
- "macro_f1": 0.2738095223903656,
+ "loss": 0.4557,
+ "macro_f1": 0.40122103691101074,
"num_tokens": 19353.0,
"repeat_count": 2.0,
- "routers_loss": 0.46214747428894043,
+ "routers_loss": 0.4130440056324005,
"skip_count": 3.0,
"step": 12,
"text_loss": 0.2056603729724884
@@ -126,37 +126,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 27.0,
"epoch": 0.06574699148811271,
- "f1_execute": 0.5263157486915588,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.671875,
+ "grad_norm": 2.4375,
"learning_rate": 2.6e-05,
- "loss": 0.5653,
- "macro_f1": 0.17543858289718628,
+ "loss": 0.5129,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 22675.0,
"repeat_count": 0.0,
- "routers_loss": 0.5300976634025574,
+ "routers_loss": 0.4582902193069458,
"skip_count": 0.0,
"step": 14,
"text_loss": 0.32989829778671265
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 34.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 0.07513941884355738,
- "f1_execute": 0.6153846383094788,
+ "f1_execute": 0.6829268336296082,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 1.8828125,
+ "f1_skip": 0.2222222238779068,
+ "grad_norm": 1.7421875,
"learning_rate": 3e-05,
- "loss": 0.5225,
- "macro_f1": 0.20512822270393372,
+ "loss": 0.4729,
+ "macro_f1": 0.3017163574695587,
"num_tokens": 26022.0,
"repeat_count": 0.0,
- "routers_loss": 0.473240464925766,
+ "routers_loss": 0.42910993099212646,
"skip_count": 1.0,
"step": 16,
"text_loss": 0.1353905349969864
@@ -164,18 +164,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 38.0,
+ "avg_layers": 27.0,
"epoch": 0.08453184619900206,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.7555555105209351,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.6015625,
+ "grad_norm": 1.4765625,
"learning_rate": 3.4000000000000007e-05,
- "loss": 0.4867,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4274,
+ "macro_f1": 0.2518518567085266,
"num_tokens": 29251.0,
"repeat_count": 0.0,
- "routers_loss": 0.4795944094657898,
+ "routers_loss": 0.3990713059902191,
"skip_count": 0.0,
"step": 18,
"text_loss": 0.3806765377521515
@@ -183,18 +183,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 36.0,
+ "avg_layers": 26.0,
"epoch": 0.09392427355444673,
- "f1_execute": 0.6153846383094788,
- "f1_repeat": 0.1538461595773697,
+ "f1_execute": 0.6829268336296082,
+ "f1_repeat": 0.2857142984867096,
"f1_skip": 0.0,
- "grad_norm": 1.3984375,
+ "grad_norm": 1.3125,
"learning_rate": 3.8e-05,
- "loss": 0.4718,
- "macro_f1": 0.25641027092933655,
+ "loss": 0.4261,
+ "macro_f1": 0.3228803873062134,
"num_tokens": 32545.0,
"repeat_count": 1.0,
- "routers_loss": 0.41872408986091614,
+ "routers_loss": 0.40146592259407043,
"skip_count": 0.0,
"step": 20,
"text_loss": 0.25648367404937744
@@ -202,18 +202,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 26.0,
"epoch": 0.1033167009098914,
- "f1_execute": 0.6341463327407837,
+ "f1_execute": 0.7272727489471436,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.7734375,
+ "grad_norm": 1.625,
"learning_rate": 4.2000000000000004e-05,
- "loss": 0.4472,
- "macro_f1": 0.21138212084770203,
+ "loss": 0.404,
+ "macro_f1": 0.24242424964904785,
"num_tokens": 36560.0,
"repeat_count": 0.0,
- "routers_loss": 0.4152105450630188,
+ "routers_loss": 0.372715026140213,
"skip_count": 0.0,
"step": 22,
"text_loss": 0.2799522578716278
@@ -221,18 +221,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 32.0,
+ "avg_layers": 27.0,
"epoch": 0.11270912826533608,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.7555555105209351,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.8046875,
+ "grad_norm": 1.6328125,
"learning_rate": 4.6e-05,
- "loss": 0.4554,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4218,
+ "macro_f1": 0.2518518567085266,
"num_tokens": 39597.0,
"repeat_count": 0.0,
- "routers_loss": 0.47541096806526184,
+ "routers_loss": 0.4504941403865814,
"skip_count": 0.0,
"step": 24,
"text_loss": 0.6635695695877075
@@ -240,18 +240,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 34.0,
+ "avg_layers": 27.0,
"epoch": 0.12210155562078075,
- "f1_execute": 0.7826087474822998,
+ "f1_execute": 0.8085106015205383,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.875,
+ "grad_norm": 1.7109375,
"learning_rate": 5e-05,
- "loss": 0.4182,
- "macro_f1": 0.2608695924282074,
+ "loss": 0.3886,
+ "macro_f1": 0.26950353384017944,
"num_tokens": 43080.0,
"repeat_count": 0.0,
- "routers_loss": 0.37319275736808777,
+ "routers_loss": 0.3498791456222534,
"skip_count": 0.0,
"step": 26,
"text_loss": 0.7035041451454163
@@ -259,18 +259,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 0.13149398297622542,
- "f1_execute": 0.7826087474822998,
+ "f1_execute": 0.8085106015205383,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.4375,
+ "grad_norm": 1.34375,
"learning_rate": 5.4e-05,
- "loss": 0.3991,
- "macro_f1": 0.2608695924282074,
+ "loss": 0.3724,
+ "macro_f1": 0.26950353384017944,
"num_tokens": 46406.0,
"repeat_count": 0.0,
- "routers_loss": 0.3604123294353485,
+ "routers_loss": 0.31265875697135925,
"skip_count": 0.0,
"step": 28,
"text_loss": 0.6388277411460876
@@ -280,16 +280,16 @@
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.1408864103316701,
- "f1_execute": 0.8979591727256775,
+ "f1_execute": 0.8571428060531616,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.421875,
+ "grad_norm": 1.2578125,
"learning_rate": 5.800000000000001e-05,
- "loss": 0.3827,
- "macro_f1": 0.2993197441101074,
+ "loss": 0.341,
+ "macro_f1": 0.2857142686843872,
"num_tokens": 49966.0,
"repeat_count": 0.0,
- "routers_loss": 0.35880225896835327,
+ "routers_loss": 0.3200918138027191,
"skip_count": 2.0,
"step": 30,
"text_loss": 0.17372547090053558
@@ -297,18 +297,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 24.0,
+ "avg_layers": 25.0,
"epoch": 0.15027883768711475,
- "f1_execute": 0.9200000166893005,
+ "f1_execute": 0.8571428060531616,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.4609375,
+ "grad_norm": 1.4140625,
"learning_rate": 6.2e-05,
- "loss": 0.3452,
- "macro_f1": 0.30666667222976685,
+ "loss": 0.3207,
+ "macro_f1": 0.2857142686843872,
"num_tokens": 53378.0,
"repeat_count": 1.0,
- "routers_loss": 0.31086465716362,
+ "routers_loss": 0.32304447889328003,
"skip_count": 1.0,
"step": 32,
"text_loss": 0.18196581304073334
@@ -316,18 +316,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 25.0,
"epoch": 0.15967126504255943,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.3671875,
+ "grad_norm": 1.46875,
"learning_rate": 6.6e-05,
- "loss": 0.3283,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.3304,
+ "macro_f1": 0.3006536364555359,
"num_tokens": 56933.0,
"repeat_count": 0.0,
- "routers_loss": 0.2674171030521393,
+ "routers_loss": 0.24814388155937195,
"skip_count": 0.0,
"step": 34,
"text_loss": 0.28823015093803406
@@ -335,18 +335,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.16906369239800412,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.1015625,
+ "grad_norm": 1.1171875,
"learning_rate": 7.000000000000001e-05,
- "loss": 0.2849,
- "macro_f1": 0.3205128312110901,
+ "loss": 0.2778,
+ "macro_f1": 0.3006536066532135,
"num_tokens": 60744.0,
"repeat_count": 1.0,
- "routers_loss": 0.24587315320968628,
+ "routers_loss": 0.22411039471626282,
"skip_count": 0.0,
"step": 36,
"text_loss": 0.5260357856750488
@@ -354,18 +354,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 31.0,
+ "avg_layers": 27.0,
"epoch": 0.17845611975344877,
- "f1_execute": 0.8085106015205383,
+ "f1_execute": 0.8571428656578064,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.3046875,
+ "grad_norm": 1.484375,
"learning_rate": 7.4e-05,
- "loss": 0.2616,
- "macro_f1": 0.26950353384017944,
+ "loss": 0.2738,
+ "macro_f1": 0.2857142984867096,
"num_tokens": 64900.0,
"repeat_count": 0.0,
- "routers_loss": 0.32050269842147827,
+ "routers_loss": 0.44355395436286926,
"skip_count": 0.0,
"step": 38,
"text_loss": 0.5382097363471985
@@ -373,18 +373,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.18784854710889345,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.1796875,
+ "grad_norm": 1.3828125,
"learning_rate": 7.8e-05,
- "loss": 0.2084,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.2137,
+ "macro_f1": 0.3076923191547394,
"num_tokens": 68000.0,
"repeat_count": 0.0,
- "routers_loss": 0.15196125209331512,
+ "routers_loss": 0.202330082654953,
"skip_count": 0.0,
"step": 40,
"text_loss": 0.5946118831634521
@@ -392,18 +392,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 25.0,
"epoch": 0.19724097446433814,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.61328125,
+ "grad_norm": 0.78125,
"learning_rate": 8.2e-05,
- "loss": 0.1947,
+ "loss": 0.21,
"macro_f1": 0.3144654333591461,
"num_tokens": 70529.0,
"repeat_count": 0.0,
- "routers_loss": 0.14121046662330627,
+ "routers_loss": 0.18023855984210968,
"skip_count": 0.0,
"step": 42,
"text_loss": 0.5550904273986816
@@ -416,13 +416,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.50390625,
+ "grad_norm": 0.609375,
"learning_rate": 8.599999999999999e-05,
- "loss": 0.1884,
+ "loss": 0.1918,
"macro_f1": 0.32098764181137085,
"num_tokens": 73427.0,
"repeat_count": 2.0,
- "routers_loss": 0.21312278509140015,
+ "routers_loss": 0.2101590931415558,
"skip_count": 0.0,
"step": 44,
"text_loss": 0.4636923372745514
@@ -435,13 +435,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.45703125,
+ "grad_norm": 0.53125,
"learning_rate": 8.999999999999999e-05,
- "loss": 0.166,
+ "loss": 0.1881,
"macro_f1": 0.3333333432674408,
"num_tokens": 76472.0,
"repeat_count": 0.0,
- "routers_loss": 0.1184137836098671,
+ "routers_loss": 0.11800424009561539,
"skip_count": 0.0,
"step": 46,
"text_loss": 0.4187001883983612
@@ -454,13 +454,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.62890625,
+ "grad_norm": 0.953125,
"learning_rate": 9.400000000000001e-05,
- "loss": 0.1313,
+ "loss": 0.1446,
"macro_f1": 0.3272727429866791,
"num_tokens": 79124.0,
"repeat_count": 1.0,
- "routers_loss": 0.10897563397884369,
+ "routers_loss": 0.11632519960403442,
"skip_count": 0.0,
"step": 48,
"text_loss": 0.2253919243812561
@@ -468,18 +468,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 0.2348106838861168,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.4375,
+ "grad_norm": 0.58984375,
"learning_rate": 9.800000000000001e-05,
- "loss": 0.1531,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.1543,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 81980.0,
"repeat_count": 1.0,
- "routers_loss": 0.09979952871799469,
+ "routers_loss": 0.09669367223978043,
"skip_count": 0.0,
"step": 50,
"text_loss": 0.6053179502487183
@@ -487,18 +487,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.2442031112415615,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.515625,
+ "grad_norm": 0.8515625,
"learning_rate": 0.000102,
- "loss": 0.1265,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.1393,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 85236.0,
"repeat_count": 0.0,
- "routers_loss": 0.05543195456266403,
+ "routers_loss": 0.12471720576286316,
"skip_count": 0.0,
"step": 52,
"text_loss": 0.6027331948280334
@@ -511,13 +511,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.328125,
+ "grad_norm": 0.421875,
"learning_rate": 0.000106,
- "loss": 0.1436,
+ "loss": 0.1473,
"macro_f1": 0.32098764181137085,
"num_tokens": 88238.0,
"repeat_count": 0.0,
- "routers_loss": 0.15049344301223755,
+ "routers_loss": 0.1376056969165802,
"skip_count": 2.0,
"step": 54,
"text_loss": 0.2861751616001129
@@ -530,13 +530,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.263671875,
+ "grad_norm": 0.35546875,
"learning_rate": 0.00011,
- "loss": 0.1021,
+ "loss": 0.1082,
"macro_f1": 0.3333333432674408,
"num_tokens": 91056.0,
"repeat_count": 0.0,
- "routers_loss": 0.07367338240146637,
+ "routers_loss": 0.07449393719434738,
"skip_count": 0.0,
"step": 56,
"text_loss": 0.48106974363327026
@@ -544,18 +544,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 26.0,
"epoch": 0.2723803933078955,
- "f1_execute": 1.0,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000114,
- "loss": 0.114,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.1123,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 94987.0,
"repeat_count": 0.0,
- "routers_loss": 0.03782692551612854,
+ "routers_loss": 0.07064720243215561,
"skip_count": 0.0,
"step": 58,
"text_loss": 0.3554874658584595
@@ -568,13 +568,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.333984375,
+ "grad_norm": 0.5390625,
"learning_rate": 0.000118,
- "loss": 0.1197,
+ "loss": 0.1234,
"macro_f1": 0.32098764181137085,
"num_tokens": 97909.0,
"repeat_count": 0.0,
- "routers_loss": 0.14074955880641937,
+ "routers_loss": 0.16835889220237732,
"skip_count": 2.0,
"step": 60,
"text_loss": 0.5475804805755615
@@ -587,13 +587,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000122,
- "loss": 0.1174,
+ "loss": 0.1224,
"macro_f1": 0.3333333432674408,
"num_tokens": 101043.0,
"repeat_count": 0.0,
- "routers_loss": 0.058013737201690674,
+ "routers_loss": 0.06127442046999931,
"skip_count": 0.0,
"step": 62,
"text_loss": 0.5966938734054565
@@ -606,13 +606,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000126,
- "loss": 0.0911,
+ "loss": 0.0931,
"macro_f1": 0.3333333432674408,
"num_tokens": 104103.0,
"repeat_count": 0.0,
- "routers_loss": 0.04936821386218071,
+ "routers_loss": 0.047825805842876434,
"skip_count": 0.0,
"step": 64,
"text_loss": 0.5480486750602722
@@ -625,13 +625,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.220703125,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.00013000000000000002,
- "loss": 0.1107,
+ "loss": 0.1088,
"macro_f1": 0.3006536364555359,
"num_tokens": 107009.0,
"repeat_count": 1.0,
- "routers_loss": 0.2628525495529175,
+ "routers_loss": 0.275174081325531,
"skip_count": 4.0,
"step": 66,
"text_loss": 0.41714492440223694
@@ -644,13 +644,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.000134,
- "loss": 0.1109,
+ "loss": 0.1123,
"macro_f1": 0.3333333432674408,
"num_tokens": 110486.0,
"repeat_count": 0.0,
- "routers_loss": 0.02859785594046116,
+ "routers_loss": 0.029025178402662277,
"skip_count": 0.0,
"step": 68,
"text_loss": 0.6775627732276917
@@ -663,13 +663,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.298828125,
+ "grad_norm": 0.314453125,
"learning_rate": 0.00013800000000000002,
- "loss": 0.1067,
+ "loss": 0.1049,
"macro_f1": 0.3272727429866791,
"num_tokens": 113878.0,
"repeat_count": 0.0,
- "routers_loss": 0.10459086298942566,
+ "routers_loss": 0.10141710191965103,
"skip_count": 1.0,
"step": 70,
"text_loss": 0.6678873896598816
@@ -682,13 +682,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2109375,
+ "grad_norm": 0.248046875,
"learning_rate": 0.00014199999999999998,
- "loss": 0.1166,
+ "loss": 0.1119,
"macro_f1": 0.3272727429866791,
"num_tokens": 116989.0,
"repeat_count": 0.0,
- "routers_loss": 0.0718551054596901,
+ "routers_loss": 0.08002066612243652,
"skip_count": 1.0,
"step": 72,
"text_loss": 0.405692994594574
@@ -701,13 +701,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1787109375,
"learning_rate": 0.000146,
- "loss": 0.1007,
+ "loss": 0.0944,
"macro_f1": 0.3144654333591461,
"num_tokens": 119883.0,
"repeat_count": 0.0,
- "routers_loss": 0.1850946843624115,
+ "routers_loss": 0.1867009848356247,
"skip_count": 3.0,
"step": 74,
"text_loss": 0.44616150856018066
@@ -720,13 +720,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.34375,
+ "grad_norm": 0.333984375,
"learning_rate": 0.00015,
- "loss": 0.1019,
+ "loss": 0.1003,
"macro_f1": 0.32098764181137085,
"num_tokens": 123325.0,
"repeat_count": 0.0,
- "routers_loss": 0.09809529036283493,
+ "routers_loss": 0.07042168825864792,
"skip_count": 2.0,
"step": 76,
"text_loss": 0.11340200901031494
@@ -739,13 +739,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.259765625,
+ "grad_norm": 0.26171875,
"learning_rate": 0.000154,
- "loss": 0.1088,
+ "loss": 0.1066,
"macro_f1": 0.32098764181137085,
"num_tokens": 126131.0,
"repeat_count": 0.0,
- "routers_loss": 0.11277207732200623,
+ "routers_loss": 0.11535373330116272,
"skip_count": 2.0,
"step": 78,
"text_loss": 0.3269135355949402
@@ -758,13 +758,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "grad_norm": 0.255859375,
"learning_rate": 0.000158,
- "loss": 0.0866,
+ "loss": 0.0891,
"macro_f1": 0.3272727429866791,
"num_tokens": 130349.0,
"repeat_count": 0.0,
- "routers_loss": 0.09079254418611526,
+ "routers_loss": 0.09497501701116562,
"skip_count": 1.0,
"step": 80,
"text_loss": 0.15273472666740417
@@ -777,13 +777,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000162,
- "loss": 0.0928,
+ "loss": 0.0929,
"macro_f1": 0.3333333432674408,
"num_tokens": 133607.0,
"repeat_count": 0.0,
- "routers_loss": 0.02900076098740101,
+ "routers_loss": 0.030639523640275,
"skip_count": 0.0,
"step": 82,
"text_loss": 0.282884806394577
@@ -796,13 +796,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.00016600000000000002,
- "loss": 0.1251,
+ "loss": 0.1254,
"macro_f1": 0.3272727429866791,
"num_tokens": 136694.0,
"repeat_count": 0.0,
- "routers_loss": 0.0763339251279831,
+ "routers_loss": 0.07906441390514374,
"skip_count": 1.0,
"step": 84,
"text_loss": 0.459094375371933
@@ -817,11 +817,11 @@
"f1_skip": 0.0,
"grad_norm": 0.212890625,
"learning_rate": 0.00017,
- "loss": 0.1064,
+ "loss": 0.1071,
"macro_f1": 0.3144654333591461,
"num_tokens": 139966.0,
"repeat_count": 1.0,
- "routers_loss": 0.13191410899162292,
+ "routers_loss": 0.1124570444226265,
"skip_count": 2.0,
"step": 86,
"text_loss": 0.29985448718070984
@@ -834,13 +834,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.25390625,
"learning_rate": 0.000174,
- "loss": 0.1055,
+ "loss": 0.1031,
"macro_f1": 0.32098764181137085,
"num_tokens": 142788.0,
"repeat_count": 2.0,
- "routers_loss": 0.21200031042099,
+ "routers_loss": 0.1966402679681778,
"skip_count": 0.0,
"step": 88,
"text_loss": 0.6435291767120361
@@ -853,13 +853,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.318359375,
+ "grad_norm": 0.349609375,
"learning_rate": 0.000178,
- "loss": 0.0971,
+ "loss": 0.0963,
"macro_f1": 0.3333333432674408,
"num_tokens": 146192.0,
"repeat_count": 0.0,
- "routers_loss": 0.031911369413137436,
+ "routers_loss": 0.0325632207095623,
"skip_count": 0.0,
"step": 90,
"text_loss": 0.35170626640319824
@@ -872,13 +872,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.2265625,
"learning_rate": 0.000182,
- "loss": 0.1056,
+ "loss": 0.1073,
"macro_f1": 0.32098764181137085,
"num_tokens": 149792.0,
"repeat_count": 1.0,
- "routers_loss": 0.14131835103034973,
+ "routers_loss": 0.15115146338939667,
"skip_count": 1.0,
"step": 92,
"text_loss": 0.83159339427948
@@ -891,13 +891,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.205078125,
"learning_rate": 0.000186,
- "loss": 0.1059,
+ "loss": 0.1073,
"macro_f1": 0.3333333432674408,
"num_tokens": 152766.0,
"repeat_count": 0.0,
- "routers_loss": 0.04137955233454704,
+ "routers_loss": 0.043313540518283844,
"skip_count": 0.0,
"step": 94,
"text_loss": 0.49707934260368347
@@ -910,13 +910,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.00019,
- "loss": 0.0934,
+ "loss": 0.0947,
"macro_f1": 0.3333333432674408,
"num_tokens": 156112.0,
"repeat_count": 0.0,
- "routers_loss": 0.03163003921508789,
+ "routers_loss": 0.032021280378103256,
"skip_count": 0.0,
"step": 96,
"text_loss": 0.27608928084373474
@@ -929,13 +929,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.2099609375,
"learning_rate": 0.000194,
- "loss": 0.0847,
+ "loss": 0.0846,
"macro_f1": 0.3076923191547394,
"num_tokens": 159454.0,
"repeat_count": 2.0,
- "routers_loss": 0.2567490339279175,
+ "routers_loss": 0.24473154544830322,
"skip_count": 2.0,
"step": 98,
"text_loss": 0.6026689410209656
@@ -948,13 +948,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.30859375,
+ "grad_norm": 0.271484375,
"learning_rate": 0.00019800000000000002,
- "loss": 0.1077,
+ "loss": 0.1028,
"macro_f1": 0.32098764181137085,
"num_tokens": 163661.0,
"repeat_count": 0.0,
- "routers_loss": 0.11468870937824249,
+ "routers_loss": 0.11468276381492615,
"skip_count": 2.0,
"step": 100,
"text_loss": 0.46733155846595764
@@ -967,13 +967,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.17578125,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.000202,
- "loss": 0.1131,
+ "loss": 0.1089,
"macro_f1": 0.3333333432674408,
"num_tokens": 167134.0,
"repeat_count": 0.0,
- "routers_loss": 0.02124219387769699,
+ "routers_loss": 0.021144939586520195,
"skip_count": 0.0,
"step": 102,
"text_loss": 0.6362994909286499
@@ -986,13 +986,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000206,
- "loss": 0.0624,
+ "loss": 0.0621,
"macro_f1": 0.3272727429866791,
"num_tokens": 170433.0,
"repeat_count": 0.0,
- "routers_loss": 0.06983796507120132,
+ "routers_loss": 0.06594710797071457,
"skip_count": 1.0,
"step": 104,
"text_loss": 0.4515477120876312
@@ -1005,13 +1005,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.00021,
- "loss": 0.0951,
+ "loss": 0.0929,
"macro_f1": 0.3333333432674408,
"num_tokens": 173387.0,
"repeat_count": 0.0,
- "routers_loss": 0.03467355668544769,
+ "routers_loss": 0.032923027873039246,
"skip_count": 0.0,
"step": 106,
"text_loss": 0.6638453006744385
@@ -1024,13 +1024,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.240234375,
"learning_rate": 0.000214,
- "loss": 0.0881,
+ "loss": 0.0883,
"macro_f1": 0.3272727429866791,
"num_tokens": 176170.0,
"repeat_count": 1.0,
- "routers_loss": 0.08142061531543732,
+ "routers_loss": 0.08034781366586685,
"skip_count": 0.0,
"step": 108,
"text_loss": 1.186936855316162
@@ -1043,13 +1043,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.267578125,
"learning_rate": 0.000218,
- "loss": 0.0795,
+ "loss": 0.0794,
"macro_f1": 0.3272727429866791,
"num_tokens": 179877.0,
"repeat_count": 0.0,
- "routers_loss": 0.08327355235815048,
+ "routers_loss": 0.07814185321331024,
"skip_count": 1.0,
"step": 110,
"text_loss": 0.5488709211349487
@@ -1062,13 +1062,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000222,
- "loss": 0.0943,
+ "loss": 0.0946,
"macro_f1": 0.3333333432674408,
"num_tokens": 182726.0,
"repeat_count": 0.0,
- "routers_loss": 0.019890006631612778,
+ "routers_loss": 0.01884695515036583,
"skip_count": 0.0,
"step": 112,
"text_loss": 0.5195863842964172
@@ -1081,13 +1081,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2001953125,
+ "grad_norm": 0.19921875,
"learning_rate": 0.00022600000000000002,
- "loss": 0.0933,
+ "loss": 0.0974,
"macro_f1": 0.32098764181137085,
"num_tokens": 185624.0,
"repeat_count": 0.0,
- "routers_loss": 0.09992363303899765,
+ "routers_loss": 0.09657823294401169,
"skip_count": 2.0,
"step": 114,
"text_loss": 0.43858134746551514
@@ -1100,13 +1100,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.3046875,
"learning_rate": 0.00023,
- "loss": 0.0762,
+ "loss": 0.0753,
"macro_f1": 0.3333333432674408,
"num_tokens": 188155.0,
"repeat_count": 0.0,
- "routers_loss": 0.014119029976427555,
+ "routers_loss": 0.01463601179420948,
"skip_count": 0.0,
"step": 116,
"text_loss": 0.392981618642807
@@ -1119,13 +1119,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.423828125,
+ "grad_norm": 0.439453125,
"learning_rate": 0.00023400000000000002,
- "loss": 0.0842,
+ "loss": 0.0843,
"macro_f1": 0.3333333432674408,
"num_tokens": 190970.0,
"repeat_count": 0.0,
- "routers_loss": 0.03976766765117645,
+ "routers_loss": 0.03859659656882286,
"skip_count": 0.0,
"step": 118,
"text_loss": 0.309179425239563
@@ -1138,13 +1138,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.2255859375,
"learning_rate": 0.00023799999999999998,
- "loss": 0.0517,
+ "loss": 0.053,
"macro_f1": 0.3333333432674408,
"num_tokens": 193988.0,
"repeat_count": 0.0,
- "routers_loss": 0.017428619787096977,
+ "routers_loss": 0.019092386588454247,
"skip_count": 0.0,
"step": 120,
"text_loss": 0.48543134331703186
@@ -1157,13 +1157,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.296875,
+ "grad_norm": 0.35546875,
"learning_rate": 0.000242,
- "loss": 0.1134,
+ "loss": 0.1203,
"macro_f1": 0.3272727429866791,
"num_tokens": 196475.0,
"repeat_count": 0.0,
- "routers_loss": 0.06965513527393341,
+ "routers_loss": 0.0619138665497303,
"skip_count": 1.0,
"step": 122,
"text_loss": 0.4615364074707031
@@ -1176,13 +1176,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1875,
"learning_rate": 0.000246,
- "loss": 0.0984,
+ "loss": 0.1002,
"macro_f1": 0.3272727429866791,
"num_tokens": 200045.0,
"repeat_count": 1.0,
- "routers_loss": 0.10476501286029816,
+ "routers_loss": 0.09752107411623001,
"skip_count": 0.0,
"step": 124,
"text_loss": 0.15802054107189178
@@ -1195,13 +1195,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.00025,
- "loss": 0.0771,
+ "loss": 0.0773,
"macro_f1": 0.3333333432674408,
"num_tokens": 203214.0,
"repeat_count": 0.0,
- "routers_loss": 0.028317544609308243,
+ "routers_loss": 0.02896115928888321,
"skip_count": 0.0,
"step": 126,
"text_loss": 0.4543360471725464
@@ -1214,13 +1214,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.390625,
+ "grad_norm": 0.4296875,
"learning_rate": 0.000254,
- "loss": 0.0933,
+ "loss": 0.0973,
"macro_f1": 0.3333333432674408,
"num_tokens": 206168.0,
"repeat_count": 0.0,
- "routers_loss": 0.012766432017087936,
+ "routers_loss": 0.011423567309975624,
"skip_count": 0.0,
"step": 128,
"text_loss": 0.4730179011821747
@@ -1233,13 +1233,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.365234375,
"learning_rate": 0.00025800000000000004,
- "loss": 0.0989,
+ "loss": 0.099,
"macro_f1": 0.3333333432674408,
"num_tokens": 209907.0,
"repeat_count": 0.0,
- "routers_loss": 0.021400077268481255,
+ "routers_loss": 0.01957600563764572,
"skip_count": 0.0,
"step": 130,
"text_loss": 0.45122358202934265
@@ -1252,13 +1252,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.2060546875,
"learning_rate": 0.000262,
- "loss": 0.0873,
+ "loss": 0.0868,
"macro_f1": 0.3272727429866791,
"num_tokens": 213521.0,
"repeat_count": 0.0,
- "routers_loss": 0.05025051161646843,
+ "routers_loss": 0.04882373288273811,
"skip_count": 1.0,
"step": 132,
"text_loss": 0.4341491758823395
@@ -1271,13 +1271,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.000266,
- "loss": 0.085,
+ "loss": 0.0834,
"macro_f1": 0.3333333432674408,
"num_tokens": 216484.0,
"repeat_count": 0.0,
- "routers_loss": 0.017420046031475067,
+ "routers_loss": 0.016083380207419395,
"skip_count": 0.0,
"step": 134,
"text_loss": 0.46990111470222473
@@ -1290,13 +1290,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2041015625,
+ "grad_norm": 0.220703125,
"learning_rate": 0.00027,
- "loss": 0.086,
+ "loss": 0.0863,
"macro_f1": 0.3333333432674408,
"num_tokens": 219398.0,
"repeat_count": 0.0,
- "routers_loss": 0.018217921257019043,
+ "routers_loss": 0.01733536459505558,
"skip_count": 0.0,
"step": 136,
"text_loss": 0.4455361068248749
@@ -1309,13 +1309,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.00027400000000000005,
- "loss": 0.0985,
+ "loss": 0.0997,
"macro_f1": 0.3333333432674408,
"num_tokens": 222430.0,
"repeat_count": 0.0,
- "routers_loss": 0.012350660748779774,
+ "routers_loss": 0.01332803163677454,
"skip_count": 0.0,
"step": 138,
"text_loss": 0.47699397802352905
@@ -1328,13 +1328,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.302734375,
+ "grad_norm": 0.333984375,
"learning_rate": 0.00027800000000000004,
"loss": 0.0922,
"macro_f1": 0.3144654333591461,
"num_tokens": 225458.0,
"repeat_count": 1.0,
- "routers_loss": 0.14993029832839966,
+ "routers_loss": 0.14924728870391846,
"skip_count": 2.0,
"step": 140,
"text_loss": 0.5858222842216492
@@ -1347,13 +1347,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.251953125,
+ "grad_norm": 0.25,
"learning_rate": 0.00028199999999999997,
- "loss": 0.0791,
+ "loss": 0.0798,
"macro_f1": 0.3144654333591461,
"num_tokens": 229365.0,
"repeat_count": 1.0,
- "routers_loss": 0.17921413481235504,
+ "routers_loss": 0.1860177218914032,
"skip_count": 2.0,
"step": 142,
"text_loss": 0.5003137588500977
@@ -1366,13 +1366,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.00028599999999999996,
- "loss": 0.0535,
+ "loss": 0.054,
"macro_f1": 0.32098764181137085,
"num_tokens": 231787.0,
"repeat_count": 1.0,
- "routers_loss": 0.1420905590057373,
+ "routers_loss": 0.16498211026191711,
"skip_count": 1.0,
"step": 144,
"text_loss": 0.5026470422744751
@@ -1385,13 +1385,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.29296875,
+ "grad_norm": 0.306640625,
"learning_rate": 0.00029,
- "loss": 0.0956,
+ "loss": 0.0936,
"macro_f1": 0.32098764181137085,
"num_tokens": 235014.0,
"repeat_count": 1.0,
- "routers_loss": 0.12468750029802322,
+ "routers_loss": 0.11801310628652573,
"skip_count": 1.0,
"step": 146,
"text_loss": 0.611888587474823
@@ -1404,13 +1404,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.000294,
- "loss": 0.0879,
+ "loss": 0.0878,
"macro_f1": 0.3333333432674408,
"num_tokens": 238210.0,
"repeat_count": 0.0,
- "routers_loss": 0.024295611307024956,
+ "routers_loss": 0.02422776259481907,
"skip_count": 0.0,
"step": 148,
"text_loss": 0.2876914143562317
@@ -1423,13 +1423,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.000298,
- "loss": 0.087,
+ "loss": 0.0858,
"macro_f1": 0.32098764181137085,
"num_tokens": 241582.0,
"repeat_count": 0.0,
- "routers_loss": 0.07016433775424957,
+ "routers_loss": 0.07282499223947525,
"skip_count": 2.0,
"step": 150,
"text_loss": 0.3919292390346527
@@ -1442,13 +1442,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3828125,
+ "grad_norm": 0.37890625,
"learning_rate": 0.000302,
- "loss": 0.0782,
+ "loss": 0.0797,
"macro_f1": 0.32098764181137085,
"num_tokens": 244621.0,
"repeat_count": 1.0,
- "routers_loss": 0.18942493200302124,
+ "routers_loss": 0.20659038424491882,
"skip_count": 1.0,
"step": 152,
"text_loss": 0.4294498860836029
@@ -1461,13 +1461,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1787109375,
"learning_rate": 0.000306,
- "loss": 0.0713,
+ "loss": 0.072,
"macro_f1": 0.3333333432674408,
"num_tokens": 247833.0,
"repeat_count": 0.0,
- "routers_loss": 0.02319060079753399,
+ "routers_loss": 0.02428400330245495,
"skip_count": 0.0,
"step": 154,
"text_loss": 0.5930765867233276
@@ -1480,13 +1480,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15234375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.00031,
- "loss": 0.0778,
+ "loss": 0.0772,
"macro_f1": 0.3333333432674408,
"num_tokens": 251349.0,
"repeat_count": 0.0,
- "routers_loss": 0.01764747127890587,
+ "routers_loss": 0.0167869683355093,
"skip_count": 0.0,
"step": 156,
"text_loss": 0.41063904762268066
@@ -1499,13 +1499,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.000314,
- "loss": 0.0829,
+ "loss": 0.0821,
"macro_f1": 0.3333333432674408,
"num_tokens": 254886.0,
"repeat_count": 0.0,
- "routers_loss": 0.02268100716173649,
+ "routers_loss": 0.02531604655086994,
"skip_count": 0.0,
"step": 158,
"text_loss": 0.6739020347595215
@@ -1518,13 +1518,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.201171875,
"learning_rate": 0.00031800000000000003,
- "loss": 0.0889,
+ "loss": 0.09,
"macro_f1": 0.3333333432674408,
"num_tokens": 258260.0,
"repeat_count": 0.0,
- "routers_loss": 0.016952091827988625,
+ "routers_loss": 0.017772775143384933,
"skip_count": 0.0,
"step": 160,
"text_loss": 0.46873849630355835
@@ -1537,13 +1537,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2216796875,
+ "grad_norm": 0.224609375,
"learning_rate": 0.000322,
- "loss": 0.0923,
+ "loss": 0.0893,
"macro_f1": 0.3272727429866791,
"num_tokens": 261846.0,
"repeat_count": 0.0,
- "routers_loss": 0.03669808804988861,
+ "routers_loss": 0.034902360290288925,
"skip_count": 1.0,
"step": 162,
"text_loss": 0.3727971017360687
@@ -1556,13 +1556,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000326,
- "loss": 0.0769,
+ "loss": 0.076,
"macro_f1": 0.3333333432674408,
"num_tokens": 264348.0,
"repeat_count": 0.0,
- "routers_loss": 0.012101447209715843,
+ "routers_loss": 0.013553355820477009,
"skip_count": 0.0,
"step": 164,
"text_loss": 0.5798237323760986
@@ -1575,13 +1575,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.37109375,
+ "grad_norm": 0.408203125,
"learning_rate": 0.00033,
- "loss": 0.0897,
+ "loss": 0.0926,
"macro_f1": 0.32098764181137085,
"num_tokens": 267479.0,
"repeat_count": 1.0,
- "routers_loss": 0.1562056541442871,
+ "routers_loss": 0.13571743667125702,
"skip_count": 1.0,
"step": 166,
"text_loss": 0.8084776997566223
@@ -1594,13 +1594,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2431640625,
"learning_rate": 0.00033400000000000004,
- "loss": 0.0829,
+ "loss": 0.0817,
"macro_f1": 0.32098764181137085,
"num_tokens": 270268.0,
"repeat_count": 2.0,
- "routers_loss": 0.20807914435863495,
+ "routers_loss": 0.19884146749973297,
"skip_count": 0.0,
"step": 168,
"text_loss": 0.7366134524345398
@@ -1613,13 +1613,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.267578125,
"learning_rate": 0.00033800000000000003,
- "loss": 0.0987,
+ "loss": 0.1022,
"macro_f1": 0.32098764181137085,
"num_tokens": 273518.0,
"repeat_count": 1.0,
- "routers_loss": 0.1530539095401764,
+ "routers_loss": 0.15469175577163696,
"skip_count": 1.0,
"step": 170,
"text_loss": 0.27204006910324097
@@ -1632,13 +1632,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000342,
- "loss": 0.087,
+ "loss": 0.0865,
"macro_f1": 0.32098764181137085,
"num_tokens": 277210.0,
"repeat_count": 0.0,
- "routers_loss": 0.08004544675350189,
+ "routers_loss": 0.08603330701589584,
"skip_count": 2.0,
"step": 172,
"text_loss": 0.7137667536735535
@@ -1651,13 +1651,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1767578125,
+ "grad_norm": 0.189453125,
"learning_rate": 0.000346,
- "loss": 0.0916,
+ "loss": 0.0902,
"macro_f1": 0.3076923191547394,
"num_tokens": 280389.0,
"repeat_count": 0.0,
- "routers_loss": 0.19228078424930573,
+ "routers_loss": 0.17851492762565613,
"skip_count": 4.0,
"step": 174,
"text_loss": 0.5148105621337891
@@ -1670,13 +1670,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.00035,
- "loss": 0.0863,
+ "loss": 0.0853,
"macro_f1": 0.3333333432674408,
"num_tokens": 283501.0,
"repeat_count": 0.0,
- "routers_loss": 0.024507170543074608,
+ "routers_loss": 0.021331604570150375,
"skip_count": 0.0,
"step": 176,
"text_loss": 0.301013320684433
@@ -1689,13 +1689,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000354,
- "loss": 0.0898,
+ "loss": 0.0911,
"macro_f1": 0.32098764181137085,
"num_tokens": 287154.0,
"repeat_count": 0.0,
- "routers_loss": 0.05055495724081993,
+ "routers_loss": 0.057273946702480316,
"skip_count": 2.0,
"step": 178,
"text_loss": 0.4740981459617615
@@ -1708,13 +1708,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.240234375,
"learning_rate": 0.000358,
- "loss": 0.0865,
+ "loss": 0.0904,
"macro_f1": 0.3272727429866791,
"num_tokens": 289929.0,
"repeat_count": 0.0,
- "routers_loss": 0.03999815881252289,
+ "routers_loss": 0.04116598889231682,
"skip_count": 1.0,
"step": 180,
"text_loss": 0.4838573932647705
@@ -1727,13 +1727,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.14453125,
"learning_rate": 0.000362,
- "loss": 0.0983,
+ "loss": 0.0991,
"macro_f1": 0.3333333432674408,
"num_tokens": 294293.0,
"repeat_count": 0.0,
- "routers_loss": 0.025158070027828217,
+ "routers_loss": 0.027111956849694252,
"skip_count": 0.0,
"step": 182,
"text_loss": 0.7495553493499756
@@ -1746,32 +1746,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.158203125,
"learning_rate": 0.000366,
- "loss": 0.1015,
+ "loss": 0.1038,
"macro_f1": 0.3333333432674408,
"num_tokens": 297730.0,
"repeat_count": 0.0,
- "routers_loss": 0.01825365424156189,
+ "routers_loss": 0.019166452810168266,
"skip_count": 0.0,
"step": 184,
"text_loss": 0.534831166267395
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 0.8734957440563546,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2236328125,
"learning_rate": 0.00037,
- "loss": 0.0736,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0784,
+ "macro_f1": 0.5427350401878357,
"num_tokens": 300593.0,
"repeat_count": 1.0,
- "routers_loss": 0.22729666531085968,
+ "routers_loss": 0.2349659502506256,
"skip_count": 2.0,
"step": 186,
"text_loss": 0.3549048602581024
@@ -1784,13 +1784,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.2041015625,
"learning_rate": 0.000374,
- "loss": 0.0838,
+ "loss": 0.0827,
"macro_f1": 0.3076923191547394,
"num_tokens": 303456.0,
"repeat_count": 2.0,
- "routers_loss": 0.24516475200653076,
+ "routers_loss": 0.22502389550209045,
"skip_count": 2.0,
"step": 188,
"text_loss": 0.8837642073631287
@@ -1803,13 +1803,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2470703125,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000378,
- "loss": 0.1056,
+ "loss": 0.1085,
"macro_f1": 0.3272727429866791,
"num_tokens": 306241.0,
"repeat_count": 1.0,
- "routers_loss": 0.1307530701160431,
+ "routers_loss": 0.12291611731052399,
"skip_count": 0.0,
"step": 190,
"text_loss": 0.73353511095047
@@ -1822,13 +1822,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.15625,
"learning_rate": 0.000382,
- "loss": 0.0961,
+ "loss": 0.0969,
"macro_f1": 0.3272727429866791,
"num_tokens": 310606.0,
"repeat_count": 0.0,
- "routers_loss": 0.06541688740253448,
+ "routers_loss": 0.055988848209381104,
"skip_count": 1.0,
"step": 192,
"text_loss": 0.6261917352676392
@@ -1841,13 +1841,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.333984375,
+ "grad_norm": 0.34375,
"learning_rate": 0.000386,
- "loss": 0.1058,
+ "loss": 0.1055,
"macro_f1": 0.3144654333591461,
"num_tokens": 313564.0,
"repeat_count": 0.0,
- "routers_loss": 0.12492545694112778,
+ "routers_loss": 0.12363404780626297,
"skip_count": 3.0,
"step": 194,
"text_loss": 0.2790874242782593
@@ -1860,13 +1860,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28515625,
+ "grad_norm": 0.27734375,
"learning_rate": 0.00039000000000000005,
- "loss": 0.0966,
+ "loss": 0.0964,
"macro_f1": 0.3076923191547394,
"num_tokens": 316958.0,
"repeat_count": 2.0,
- "routers_loss": 0.2838033139705658,
+ "routers_loss": 0.2718356251716614,
"skip_count": 2.0,
"step": 196,
"text_loss": 0.14428086578845978
@@ -1881,11 +1881,11 @@
"f1_skip": 0.0,
"grad_norm": 0.2021484375,
"learning_rate": 0.00039400000000000004,
- "loss": 0.0929,
+ "loss": 0.0917,
"macro_f1": 0.32098764181137085,
"num_tokens": 320103.0,
"repeat_count": 0.0,
- "routers_loss": 0.07692629098892212,
+ "routers_loss": 0.07188102602958679,
"skip_count": 2.0,
"step": 198,
"text_loss": 0.27155816555023193
@@ -1898,13 +1898,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.201171875,
"learning_rate": 0.000398,
"loss": 0.0809,
"macro_f1": 0.32098764181137085,
"num_tokens": 323566.0,
"repeat_count": 1.0,
- "routers_loss": 0.18504399061203003,
+ "routers_loss": 0.18038256466388702,
"skip_count": 1.0,
"step": 200,
"text_loss": 0.8453494310379028
@@ -1917,13 +1917,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.000402,
- "loss": 0.078,
+ "loss": 0.0801,
"macro_f1": 0.3333333432674408,
"num_tokens": 326385.0,
"repeat_count": 0.0,
- "routers_loss": 0.014647359028458595,
+ "routers_loss": 0.014639763161540031,
"skip_count": 0.0,
"step": 202,
"text_loss": 0.5733131766319275
@@ -1936,13 +1936,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2041015625,
+ "grad_norm": 0.21875,
"learning_rate": 0.00040600000000000006,
- "loss": 0.1028,
+ "loss": 0.104,
"macro_f1": 0.3333333432674408,
"num_tokens": 329266.0,
"repeat_count": 0.0,
- "routers_loss": 0.017848484218120575,
+ "routers_loss": 0.015269627794623375,
"skip_count": 0.0,
"step": 204,
"text_loss": 0.7355639934539795
@@ -1955,13 +1955,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.27734375,
"learning_rate": 0.00041,
- "loss": 0.0832,
+ "loss": 0.0833,
"macro_f1": 0.3333333432674408,
"num_tokens": 332984.0,
"repeat_count": 0.0,
- "routers_loss": 0.01900508813560009,
+ "routers_loss": 0.018046971410512924,
"skip_count": 0.0,
"step": 206,
"text_loss": 0.587641179561615
@@ -1974,13 +1974,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.166015625,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000414,
"loss": 0.0588,
"macro_f1": 0.3272727429866791,
"num_tokens": 335739.0,
"repeat_count": 1.0,
- "routers_loss": 0.13018715381622314,
+ "routers_loss": 0.12791286408901215,
"skip_count": 0.0,
"step": 208,
"text_loss": 0.6538406610488892
@@ -1993,13 +1993,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.24609375,
"learning_rate": 0.00041799999999999997,
- "loss": 0.0697,
+ "loss": 0.0732,
"macro_f1": 0.3272727429866791,
"num_tokens": 338966.0,
"repeat_count": 0.0,
- "routers_loss": 0.055288366973400116,
+ "routers_loss": 0.050490595400333405,
"skip_count": 1.0,
"step": 210,
"text_loss": 0.4188295602798462
@@ -2012,13 +2012,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000422,
- "loss": 0.0576,
+ "loss": 0.0588,
"macro_f1": 0.3144654333591461,
"num_tokens": 342063.0,
"repeat_count": 0.0,
- "routers_loss": 0.10952572524547577,
+ "routers_loss": 0.11652113497257233,
"skip_count": 3.0,
"step": 212,
"text_loss": 0.21822240948677063
@@ -2031,13 +2031,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.197265625,
+ "grad_norm": 0.2060546875,
"learning_rate": 0.000426,
- "loss": 0.062,
+ "loss": 0.0621,
"macro_f1": 0.3333333432674408,
"num_tokens": 344887.0,
"repeat_count": 0.0,
- "routers_loss": 0.02415696159005165,
+ "routers_loss": 0.023898238316178322,
"skip_count": 0.0,
"step": 214,
"text_loss": 0.24692800641059875
@@ -2050,13 +2050,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.3671875,
"learning_rate": 0.00043,
- "loss": 0.1011,
+ "loss": 0.1005,
"macro_f1": 0.3272727429866791,
"num_tokens": 348700.0,
"repeat_count": 1.0,
- "routers_loss": 0.06956391036510468,
+ "routers_loss": 0.06414655596017838,
"skip_count": 0.0,
"step": 216,
"text_loss": 0.4744548797607422
@@ -2069,13 +2069,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.00043400000000000003,
- "loss": 0.076,
+ "loss": 0.0753,
"macro_f1": 0.32098764181137085,
"num_tokens": 351507.0,
"repeat_count": 1.0,
- "routers_loss": 0.1140352189540863,
+ "routers_loss": 0.11702914535999298,
"skip_count": 1.0,
"step": 218,
"text_loss": 0.5614864826202393
@@ -2090,11 +2090,11 @@
"f1_skip": 0.0,
"grad_norm": 0.189453125,
"learning_rate": 0.000438,
- "loss": 0.0788,
+ "loss": 0.0792,
"macro_f1": 0.3333333432674408,
"num_tokens": 354484.0,
"repeat_count": 0.0,
- "routers_loss": 0.011621571145951748,
+ "routers_loss": 0.014991643838584423,
"skip_count": 0.0,
"step": 220,
"text_loss": 0.47209832072257996
@@ -2107,13 +2107,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.251953125,
"learning_rate": 0.000442,
"loss": 0.106,
"macro_f1": 0.3272727429866791,
"num_tokens": 357954.0,
"repeat_count": 0.0,
- "routers_loss": 0.05813701078295708,
+ "routers_loss": 0.04747112840414047,
"skip_count": 1.0,
"step": 222,
"text_loss": 0.2968728244304657
@@ -2126,13 +2126,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.357421875,
+ "grad_norm": 0.40234375,
"learning_rate": 0.000446,
- "loss": 0.0827,
+ "loss": 0.0853,
"macro_f1": 0.32098764181137085,
"num_tokens": 360547.0,
"repeat_count": 0.0,
- "routers_loss": 0.0646885335445404,
+ "routers_loss": 0.06754162162542343,
"skip_count": 2.0,
"step": 224,
"text_loss": 0.2364148646593094
@@ -2145,13 +2145,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.244140625,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.00045000000000000004,
- "loss": 0.1011,
+ "loss": 0.1016,
"macro_f1": 0.3272727429866791,
"num_tokens": 364529.0,
"repeat_count": 0.0,
- "routers_loss": 0.07224348932504654,
+ "routers_loss": 0.07830183953046799,
"skip_count": 1.0,
"step": 226,
"text_loss": 0.4787476360797882
@@ -2164,13 +2164,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.1953125,
"learning_rate": 0.00045400000000000003,
- "loss": 0.0781,
+ "loss": 0.0792,
"macro_f1": 0.3333333432674408,
"num_tokens": 367683.0,
"repeat_count": 0.0,
- "routers_loss": 0.015971746295690536,
+ "routers_loss": 0.015735948458313942,
"skip_count": 0.0,
"step": 228,
"text_loss": 0.37148505449295044
@@ -2183,13 +2183,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.25,
"learning_rate": 0.000458,
- "loss": 0.099,
+ "loss": 0.0995,
"macro_f1": 0.3333333432674408,
"num_tokens": 371402.0,
"repeat_count": 0.0,
- "routers_loss": 0.017818331718444824,
+ "routers_loss": 0.013354359194636345,
"skip_count": 0.0,
"step": 230,
"text_loss": 0.7464763522148132
@@ -2202,13 +2202,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.000462,
- "loss": 0.0757,
+ "loss": 0.0731,
"macro_f1": 0.3333333432674408,
"num_tokens": 374587.0,
"repeat_count": 0.0,
- "routers_loss": 0.01582280732691288,
+ "routers_loss": 0.013763721100986004,
"skip_count": 0.0,
"step": 232,
"text_loss": 0.8754443526268005
@@ -2221,13 +2221,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.42578125,
+ "grad_norm": 0.3984375,
"learning_rate": 0.00046600000000000005,
- "loss": 0.0876,
+ "loss": 0.0861,
"macro_f1": 0.3333333432674408,
"num_tokens": 377513.0,
"repeat_count": 0.0,
- "routers_loss": 0.011417915113270283,
+ "routers_loss": 0.010075435042381287,
"skip_count": 0.0,
"step": 234,
"text_loss": 0.31534913182258606
@@ -2240,13 +2240,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.17578125,
"learning_rate": 0.00047,
- "loss": 0.0801,
+ "loss": 0.0791,
"macro_f1": 0.3272727429866791,
"num_tokens": 380736.0,
"repeat_count": 0.0,
- "routers_loss": 0.05787832289934158,
+ "routers_loss": 0.059825167059898376,
"skip_count": 1.0,
"step": 236,
"text_loss": 0.5936337113380432
@@ -2259,13 +2259,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.236328125,
+ "grad_norm": 0.267578125,
"learning_rate": 0.000474,
- "loss": 0.0508,
+ "loss": 0.0514,
"macro_f1": 0.32098764181137085,
"num_tokens": 383236.0,
"repeat_count": 0.0,
- "routers_loss": 0.09476690739393234,
+ "routers_loss": 0.09134846180677414,
"skip_count": 2.0,
"step": 238,
"text_loss": 0.5976157784461975
@@ -2278,13 +2278,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.208984375,
"learning_rate": 0.00047799999999999996,
- "loss": 0.0833,
+ "loss": 0.0858,
"macro_f1": 0.32098764181137085,
"num_tokens": 385778.0,
"repeat_count": 1.0,
- "routers_loss": 0.1099705696105957,
+ "routers_loss": 0.11989791691303253,
"skip_count": 1.0,
"step": 240,
"text_loss": 0.3554210960865021
@@ -2297,13 +2297,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.171875,
"learning_rate": 0.000482,
- "loss": 0.0745,
+ "loss": 0.0734,
"macro_f1": 0.3333333432674408,
"num_tokens": 388777.0,
"repeat_count": 0.0,
- "routers_loss": 0.01269970741122961,
+ "routers_loss": 0.013591105118393898,
"skip_count": 0.0,
"step": 242,
"text_loss": 0.4829460382461548
@@ -2316,13 +2316,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11962890625,
+ "grad_norm": 0.12060546875,
"learning_rate": 0.000486,
- "loss": 0.061,
+ "loss": 0.0625,
"macro_f1": 0.32098764181137085,
"num_tokens": 391797.0,
"repeat_count": 0.0,
- "routers_loss": 0.08505752682685852,
+ "routers_loss": 0.0920003354549408,
"skip_count": 2.0,
"step": 244,
"text_loss": 0.3085818886756897
@@ -2335,13 +2335,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.00049,
- "loss": 0.0504,
+ "loss": 0.0501,
"macro_f1": 0.3333333432674408,
"num_tokens": 396485.0,
"repeat_count": 0.0,
- "routers_loss": 0.012750142253935337,
+ "routers_loss": 0.0129330949857831,
"skip_count": 0.0,
"step": 246,
"text_loss": 0.42803969979286194
@@ -2354,13 +2354,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.291015625,
+ "grad_norm": 0.296875,
"learning_rate": 0.000494,
- "loss": 0.0962,
+ "loss": 0.0945,
"macro_f1": 0.3144654333591461,
"num_tokens": 399923.0,
"repeat_count": 0.0,
- "routers_loss": 0.11287309974431992,
+ "routers_loss": 0.10677755624055862,
"skip_count": 3.0,
"step": 248,
"text_loss": 0.2908555567264557
@@ -2373,32 +2373,32 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.203125,
"learning_rate": 0.000498,
- "loss": 0.0821,
+ "loss": 0.0812,
"macro_f1": 0.3144654333591461,
"num_tokens": 403647.0,
"repeat_count": 0.0,
- "routers_loss": 0.1486474722623825,
+ "routers_loss": 0.1504337340593338,
"skip_count": 3.0,
"step": 250,
"text_loss": 0.333095908164978
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 1.183152333431171,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
+ "f1_skip": 0.0,
"grad_norm": 0.22265625,
"learning_rate": 0.0005020000000000001,
- "loss": 0.0832,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0828,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 409147.0,
"repeat_count": 0.0,
- "routers_loss": 0.06636594980955124,
+ "routers_loss": 0.06503184884786606,
"skip_count": 2.0,
"step": 252,
"text_loss": 0.16117942333221436
@@ -2411,13 +2411,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.267578125,
+ "grad_norm": 0.287109375,
"learning_rate": 0.000506,
- "loss": 0.1,
+ "loss": 0.0995,
"macro_f1": 0.3333333432674408,
"num_tokens": 412072.0,
"repeat_count": 0.0,
- "routers_loss": 0.015062150545418262,
+ "routers_loss": 0.016280122101306915,
"skip_count": 0.0,
"step": 254,
"text_loss": 0.4217492640018463
@@ -2430,13 +2430,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2138671875,
+ "grad_norm": 0.21484375,
"learning_rate": 0.00051,
- "loss": 0.0808,
+ "loss": 0.0803,
"macro_f1": 0.3144654333591461,
"num_tokens": 415052.0,
"repeat_count": 2.0,
- "routers_loss": 0.2051105946302414,
+ "routers_loss": 0.2117508500814438,
"skip_count": 1.0,
"step": 256,
"text_loss": 0.5795308947563171
@@ -2449,13 +2449,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "grad_norm": 0.2421875,
"learning_rate": 0.000514,
- "loss": 0.068,
+ "loss": 0.0668,
"macro_f1": 0.3272727429866791,
"num_tokens": 418099.0,
"repeat_count": 1.0,
- "routers_loss": 0.1467045396566391,
+ "routers_loss": 0.15002092719078064,
"skip_count": 0.0,
"step": 258,
"text_loss": 0.4840938448905945
@@ -2468,13 +2468,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.000518,
- "loss": 0.0543,
+ "loss": 0.0538,
"macro_f1": 0.3333333432674408,
"num_tokens": 422526.0,
"repeat_count": 0.0,
- "routers_loss": 0.013022038154304028,
+ "routers_loss": 0.012834074907004833,
"skip_count": 0.0,
"step": 260,
"text_loss": 0.36141225695610046
@@ -2487,13 +2487,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.000522,
- "loss": 0.0848,
+ "loss": 0.085,
"macro_f1": 0.3076923191547394,
"num_tokens": 425765.0,
"repeat_count": 2.0,
- "routers_loss": 0.2575930058956146,
+ "routers_loss": 0.23808011412620544,
"skip_count": 2.0,
"step": 262,
"text_loss": 0.27572691440582275
@@ -2506,13 +2506,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000526,
- "loss": 0.07,
+ "loss": 0.0708,
"macro_f1": 0.3272727429866791,
"num_tokens": 429048.0,
"repeat_count": 0.0,
- "routers_loss": 0.0558602549135685,
+ "routers_loss": 0.055687375366687775,
"skip_count": 1.0,
"step": 264,
"text_loss": 0.37020301818847656
@@ -2525,13 +2525,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.0005300000000000001,
- "loss": 0.082,
+ "loss": 0.0839,
"macro_f1": 0.3272727429866791,
"num_tokens": 431784.0,
"repeat_count": 0.0,
- "routers_loss": 0.09126655012369156,
+ "routers_loss": 0.0872957780957222,
"skip_count": 1.0,
"step": 266,
"text_loss": 0.5937283039093018
@@ -2544,13 +2544,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.263671875,
"learning_rate": 0.0005340000000000001,
- "loss": 0.0764,
+ "loss": 0.0733,
"macro_f1": 0.32098764181137085,
"num_tokens": 434297.0,
"repeat_count": 2.0,
- "routers_loss": 0.24805288016796112,
+ "routers_loss": 0.23507654666900635,
"skip_count": 0.0,
"step": 268,
"text_loss": 0.3367372453212738
@@ -2563,13 +2563,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.2431640625,
"learning_rate": 0.0005380000000000001,
- "loss": 0.0686,
+ "loss": 0.0708,
"macro_f1": 0.32098764181137085,
"num_tokens": 437586.0,
"repeat_count": 0.0,
- "routers_loss": 0.13135533034801483,
+ "routers_loss": 0.12860390543937683,
"skip_count": 2.0,
"step": 270,
"text_loss": 0.7149854302406311
@@ -2582,13 +2582,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.2451171875,
"learning_rate": 0.0005420000000000001,
- "loss": 0.1083,
+ "loss": 0.1072,
"macro_f1": 0.3272727429866791,
"num_tokens": 440649.0,
"repeat_count": 0.0,
- "routers_loss": 0.04991440102458,
+ "routers_loss": 0.044308312237262726,
"skip_count": 1.0,
"step": 272,
"text_loss": 0.26778292655944824
@@ -2601,13 +2601,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.455078125,
+ "grad_norm": 0.44921875,
"learning_rate": 0.000546,
- "loss": 0.0991,
+ "loss": 0.0938,
"macro_f1": 0.3144654333591461,
"num_tokens": 443907.0,
"repeat_count": 0.0,
- "routers_loss": 0.12236632406711578,
+ "routers_loss": 0.11514109373092651,
"skip_count": 3.0,
"step": 274,
"text_loss": 0.23578761518001556
@@ -2620,13 +2620,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.25,
+ "grad_norm": 0.2578125,
"learning_rate": 0.00055,
- "loss": 0.0936,
+ "loss": 0.0932,
"macro_f1": 0.5492662787437439,
"num_tokens": 447147.0,
"repeat_count": 0.0,
- "routers_loss": 0.053506772965192795,
+ "routers_loss": 0.055705297738313675,
"skip_count": 2.0,
"step": 276,
"text_loss": 0.2513524889945984
@@ -2639,13 +2639,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.265625,
+ "grad_norm": 0.29296875,
"learning_rate": 0.000554,
- "loss": 0.066,
+ "loss": 0.0667,
"macro_f1": 0.32098764181137085,
"num_tokens": 450032.0,
"repeat_count": 0.0,
- "routers_loss": 0.13446088135242462,
+ "routers_loss": 0.13778971135616302,
"skip_count": 2.0,
"step": 278,
"text_loss": 0.4857243597507477
@@ -2658,32 +2658,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000558,
- "loss": 0.0682,
+ "loss": 0.0672,
"macro_f1": 0.3272727429866791,
"num_tokens": 453195.0,
"repeat_count": 1.0,
- "routers_loss": 0.07270720601081848,
+ "routers_loss": 0.0700262188911438,
"skip_count": 0.0,
"step": 280,
"text_loss": 0.7589789628982544
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 1.3240387437628411,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.28125,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25,
"learning_rate": 0.0005620000000000001,
- "loss": 0.0648,
- "macro_f1": 0.5427350401878357,
+ "loss": 0.0603,
+ "macro_f1": 0.3144654333591461,
"num_tokens": 455942.0,
"repeat_count": 1.0,
- "routers_loss": 0.13866399228572845,
+ "routers_loss": 0.11706235259771347,
"skip_count": 2.0,
"step": 282,
"text_loss": 0.4783432185649872
@@ -2696,13 +2696,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.236328125,
+ "grad_norm": 0.265625,
"learning_rate": 0.000566,
- "loss": 0.0782,
+ "loss": 0.0793,
"macro_f1": 0.3272727429866791,
"num_tokens": 458932.0,
"repeat_count": 0.0,
- "routers_loss": 0.0645354762673378,
+ "routers_loss": 0.07073967158794403,
"skip_count": 1.0,
"step": 284,
"text_loss": 0.7117193937301636
@@ -2715,13 +2715,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.00057,
- "loss": 0.0892,
+ "loss": 0.0915,
"macro_f1": 0.3272727429866791,
"num_tokens": 462650.0,
"repeat_count": 0.0,
- "routers_loss": 0.05967628210783005,
+ "routers_loss": 0.05301115661859512,
"skip_count": 1.0,
"step": 286,
"text_loss": 0.4175460636615753
@@ -2734,13 +2734,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23828125,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000574,
- "loss": 0.0676,
+ "loss": 0.0675,
"macro_f1": 0.3272727429866791,
"num_tokens": 466290.0,
"repeat_count": 0.0,
- "routers_loss": 0.06438407301902771,
+ "routers_loss": 0.06356479972600937,
"skip_count": 1.0,
"step": 288,
"text_loss": 0.5832946300506592
@@ -2753,13 +2753,13 @@
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.28515625,
"learning_rate": 0.000578,
- "loss": 0.0781,
+ "loss": 0.0805,
"macro_f1": 0.3006536066532135,
"num_tokens": 469296.0,
"repeat_count": 1.0,
- "routers_loss": 0.21225209534168243,
+ "routers_loss": 0.21032999455928802,
"skip_count": 3.0,
"step": 290,
"text_loss": 0.36023473739624023
@@ -2772,13 +2772,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.244140625,
+ "grad_norm": 0.27734375,
"learning_rate": 0.0005819999999999999,
- "loss": 0.0664,
+ "loss": 0.0685,
"macro_f1": 0.32098764181137085,
"num_tokens": 472272.0,
"repeat_count": 1.0,
- "routers_loss": 0.08085516840219498,
+ "routers_loss": 0.08062280714511871,
"skip_count": 1.0,
"step": 292,
"text_loss": 0.37197956442832947
@@ -2791,13 +2791,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.28125,
"learning_rate": 0.0005859999999999999,
- "loss": 0.0874,
+ "loss": 0.0878,
"macro_f1": 0.32098764181137085,
"num_tokens": 475864.0,
"repeat_count": 0.0,
- "routers_loss": 0.05378658324480057,
+ "routers_loss": 0.05023600533604622,
"skip_count": 2.0,
"step": 294,
"text_loss": 0.4765273630619049
@@ -2810,13 +2810,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.00059,
- "loss": 0.0715,
+ "loss": 0.0728,
"macro_f1": 0.3333333432674408,
"num_tokens": 478916.0,
"repeat_count": 0.0,
- "routers_loss": 0.01145261898636818,
+ "routers_loss": 0.011689410544931889,
"skip_count": 0.0,
"step": 296,
"text_loss": 0.5878773927688599
@@ -2831,11 +2831,11 @@
"f1_skip": 0.0,
"grad_norm": 0.15625,
"learning_rate": 0.000594,
- "loss": 0.0737,
+ "loss": 0.0727,
"macro_f1": 0.3333333432674408,
"num_tokens": 482369.0,
"repeat_count": 0.0,
- "routers_loss": 0.009397956542670727,
+ "routers_loss": 0.010772093199193478,
"skip_count": 0.0,
"step": 298,
"text_loss": 0.4424116313457489
@@ -2848,13 +2848,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.181640625,
"learning_rate": 0.000598,
- "loss": 0.0802,
+ "loss": 0.0787,
"macro_f1": 0.3076923191547394,
"num_tokens": 486049.0,
"repeat_count": 2.0,
- "routers_loss": 0.2389357089996338,
+ "routers_loss": 0.23482851684093475,
"skip_count": 2.0,
"step": 300,
"text_loss": 0.21217775344848633
@@ -2862,18 +2862,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 1.417963017317288,
- "f1_execute": 0.9019607901573181,
+ "f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.000602,
- "loss": 0.0745,
- "macro_f1": 0.3006536066532135,
+ "loss": 0.073,
+ "macro_f1": 0.3076923191547394,
"num_tokens": 488683.0,
"repeat_count": 1.0,
- "routers_loss": 0.18252353370189667,
+ "routers_loss": 0.18843084573745728,
"skip_count": 3.0,
"step": 302,
"text_loss": 0.2109498232603073
@@ -2886,13 +2886,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.27734375,
+ "grad_norm": 0.279296875,
"learning_rate": 0.000606,
- "loss": 0.0935,
+ "loss": 0.0945,
"macro_f1": 0.3144654333591461,
"num_tokens": 492010.0,
"repeat_count": 0.0,
- "routers_loss": 0.18185268342494965,
+ "routers_loss": 0.17861786484718323,
"skip_count": 3.0,
"step": 304,
"text_loss": 0.8446305394172668
@@ -2905,13 +2905,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.00061,
- "loss": 0.0853,
+ "loss": 0.0827,
"macro_f1": 0.3333333432674408,
"num_tokens": 494764.0,
"repeat_count": 0.0,
- "routers_loss": 0.013210167177021503,
+ "routers_loss": 0.014124520123004913,
"skip_count": 0.0,
"step": 306,
"text_loss": 0.742735743522644
@@ -2924,13 +2924,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.26953125,
"learning_rate": 0.000614,
- "loss": 0.1089,
+ "loss": 0.1071,
"macro_f1": 0.3333333432674408,
"num_tokens": 497820.0,
"repeat_count": 0.0,
- "routers_loss": 0.016936838626861572,
+ "routers_loss": 0.017968112602829933,
"skip_count": 0.0,
"step": 308,
"text_loss": 0.28305482864379883
@@ -2943,13 +2943,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.0006180000000000001,
- "loss": 0.077,
+ "loss": 0.0775,
"macro_f1": 0.32098764181137085,
"num_tokens": 500694.0,
"repeat_count": 0.0,
- "routers_loss": 0.08630389720201492,
+ "routers_loss": 0.08593655377626419,
"skip_count": 2.0,
"step": 310,
"text_loss": 0.3496848940849304
@@ -2962,13 +2962,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.19140625,
"learning_rate": 0.000622,
- "loss": 0.0602,
+ "loss": 0.061,
"macro_f1": 0.3333333432674408,
"num_tokens": 503871.0,
"repeat_count": 0.0,
- "routers_loss": 0.013665963895618916,
+ "routers_loss": 0.016449492424726486,
"skip_count": 0.0,
"step": 312,
"text_loss": 0.6691372990608215
@@ -2981,13 +2981,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.205078125,
"learning_rate": 0.000626,
- "loss": 0.0794,
+ "loss": 0.0815,
"macro_f1": 0.3333333432674408,
"num_tokens": 506730.0,
"repeat_count": 0.0,
- "routers_loss": 0.01584783010184765,
+ "routers_loss": 0.014532964676618576,
"skip_count": 0.0,
"step": 314,
"text_loss": 0.6118118166923523
@@ -3000,13 +3000,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.224609375,
+ "grad_norm": 0.2216796875,
"learning_rate": 0.00063,
- "loss": 0.0762,
+ "loss": 0.0742,
"macro_f1": 0.3333333432674408,
"num_tokens": 510323.0,
"repeat_count": 0.0,
- "routers_loss": 0.01368923019617796,
+ "routers_loss": 0.013093139044940472,
"skip_count": 0.0,
"step": 316,
"text_loss": 0.38126271963119507
@@ -3019,13 +3019,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.388671875,
+ "grad_norm": 0.400390625,
"learning_rate": 0.000634,
- "loss": 0.0908,
+ "loss": 0.0915,
"macro_f1": 0.3333333432674408,
"num_tokens": 514075.0,
"repeat_count": 0.0,
- "routers_loss": 0.009135022759437561,
+ "routers_loss": 0.008627045899629593,
"skip_count": 0.0,
"step": 318,
"text_loss": 0.5983037948608398
@@ -3038,13 +3038,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000638,
- "loss": 0.0949,
+ "loss": 0.1008,
"macro_f1": 0.3272727429866791,
"num_tokens": 517418.0,
"repeat_count": 0.0,
- "routers_loss": 0.046641621738672256,
+ "routers_loss": 0.04561378434300423,
"skip_count": 1.0,
"step": 320,
"text_loss": 0.767257034778595
@@ -3052,18 +3052,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.5118872908717347,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23046875,
+ "grad_norm": 0.259765625,
"learning_rate": 0.000642,
- "loss": 0.0925,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0926,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 520443.0,
"repeat_count": 0.0,
- "routers_loss": 0.020637936890125275,
+ "routers_loss": 0.024372953921556473,
"skip_count": 0.0,
"step": 322,
"text_loss": 0.6572105884552002
@@ -3076,13 +3076,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26953125,
+ "grad_norm": 0.30078125,
"learning_rate": 0.000646,
"loss": 0.0822,
"macro_f1": 0.3272727429866791,
"num_tokens": 523317.0,
"repeat_count": 1.0,
- "routers_loss": 0.08289298415184021,
+ "routers_loss": 0.08099937438964844,
"skip_count": 0.0,
"step": 324,
"text_loss": 0.205499529838562
@@ -3090,18 +3090,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.530672145582624,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23828125,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.0006500000000000001,
- "loss": 0.0823,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0809,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 526355.0,
"repeat_count": 0.0,
- "routers_loss": 0.06960040330886841,
+ "routers_loss": 0.0657225176692009,
"skip_count": 1.0,
"step": 326,
"text_loss": 0.2587239742279053
@@ -3114,13 +3114,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1162109375,
+ "grad_norm": 0.111328125,
"learning_rate": 0.0006540000000000001,
- "loss": 0.0799,
+ "loss": 0.0779,
"macro_f1": 0.3333333432674408,
"num_tokens": 529689.0,
"repeat_count": 0.0,
- "routers_loss": 0.02087482251226902,
+ "routers_loss": 0.01849208027124405,
"skip_count": 0.0,
"step": 328,
"text_loss": 0.2172023057937622
@@ -3133,13 +3133,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.1845703125,
"learning_rate": 0.0006580000000000001,
- "loss": 0.0757,
+ "loss": 0.0758,
"macro_f1": 0.3333333432674408,
"num_tokens": 532603.0,
"repeat_count": 0.0,
- "routers_loss": 0.016592051833868027,
+ "routers_loss": 0.016184113919734955,
"skip_count": 0.0,
"step": 330,
"text_loss": 0.5980568528175354
@@ -3152,32 +3152,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.220703125,
"learning_rate": 0.000662,
- "loss": 0.0438,
+ "loss": 0.0439,
"macro_f1": 0.3333333432674408,
"num_tokens": 536056.0,
"repeat_count": 0.0,
- "routers_loss": 0.012950568459928036,
+ "routers_loss": 0.01303898449987173,
"skip_count": 0.0,
"step": 332,
"text_loss": 0.5421966314315796
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 1.5682418550044028,
- "f1_execute": 0.8799999952316284,
+ "f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.310546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.296875,
"learning_rate": 0.000666,
- "loss": 0.0964,
- "macro_f1": 0.29333335161209106,
+ "loss": 0.0963,
+ "macro_f1": 0.465986430644989,
"num_tokens": 539231.0,
"repeat_count": 3.0,
- "routers_loss": 0.3373340964317322,
+ "routers_loss": 0.3075675964355469,
"skip_count": 3.0,
"step": 334,
"text_loss": 0.19719554483890533
@@ -3190,13 +3190,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.171875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.00067,
"loss": 0.0706,
"macro_f1": 0.3333333432674408,
"num_tokens": 542038.0,
"repeat_count": 0.0,
- "routers_loss": 0.008110735565423965,
+ "routers_loss": 0.009116224013268948,
"skip_count": 0.0,
"step": 336,
"text_loss": 0.3407036066055298
@@ -3209,13 +3209,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.248046875,
+ "grad_norm": 0.2421875,
"learning_rate": 0.000674,
- "loss": 0.0771,
+ "loss": 0.0768,
"macro_f1": 0.3333333432674408,
"num_tokens": 545019.0,
"repeat_count": 0.0,
- "routers_loss": 0.01841609925031662,
+ "routers_loss": 0.021463042125105858,
"skip_count": 0.0,
"step": 338,
"text_loss": 0.24486012756824493
@@ -3228,13 +3228,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.0006780000000000001,
- "loss": 0.0894,
+ "loss": 0.0889,
"macro_f1": 0.3333333432674408,
"num_tokens": 548036.0,
"repeat_count": 0.0,
- "routers_loss": 0.01612614095211029,
+ "routers_loss": 0.01857556402683258,
"skip_count": 0.0,
"step": 340,
"text_loss": 0.28140124678611755
@@ -3247,13 +3247,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0006820000000000001,
- "loss": 0.0611,
+ "loss": 0.0617,
"macro_f1": 0.3006536364555359,
"num_tokens": 551419.0,
"repeat_count": 2.0,
- "routers_loss": 0.26202192902565,
+ "routers_loss": 0.27090007066726685,
"skip_count": 3.0,
"step": 342,
"text_loss": 0.20690307021141052
@@ -3266,13 +3266,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.3046875,
"learning_rate": 0.0006860000000000001,
- "loss": 0.1013,
+ "loss": 0.1047,
"macro_f1": 0.32098764181137085,
"num_tokens": 554037.0,
"repeat_count": 0.0,
- "routers_loss": 0.09235779196023941,
+ "routers_loss": 0.09231195598840714,
"skip_count": 2.0,
"step": 344,
"text_loss": 0.4479128420352936
@@ -3285,13 +3285,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.255859375,
"learning_rate": 0.00069,
- "loss": 0.0856,
+ "loss": 0.0883,
"macro_f1": 0.3333333432674408,
"num_tokens": 556672.0,
"repeat_count": 0.0,
- "routers_loss": 0.010735333897173405,
+ "routers_loss": 0.00935924518853426,
"skip_count": 0.0,
"step": 346,
"text_loss": 0.6377320289611816
@@ -3304,13 +3304,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.000694,
- "loss": 0.0778,
+ "loss": 0.0781,
"macro_f1": 0.32098764181137085,
"num_tokens": 559756.0,
"repeat_count": 0.0,
- "routers_loss": 0.14742356538772583,
+ "routers_loss": 0.17641772329807281,
"skip_count": 2.0,
"step": 348,
"text_loss": 0.6097636222839355
@@ -3323,13 +3323,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.30859375,
+ "grad_norm": 0.30078125,
"learning_rate": 0.0006979999999999999,
- "loss": 0.0614,
+ "loss": 0.0616,
"macro_f1": 0.5492662787437439,
"num_tokens": 563415.0,
"repeat_count": 0.0,
- "routers_loss": 0.06606879830360413,
+ "routers_loss": 0.06240406632423401,
"skip_count": 2.0,
"step": 350,
"text_loss": 0.5291631817817688
@@ -3342,13 +3342,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.322265625,
+ "grad_norm": 0.296875,
"learning_rate": 0.0007019999999999999,
- "loss": 0.1033,
+ "loss": 0.1026,
"macro_f1": 0.3333333432674408,
"num_tokens": 566357.0,
"repeat_count": 0.0,
- "routers_loss": 0.012873432599008083,
+ "routers_loss": 0.012269247323274612,
"skip_count": 0.0,
"step": 352,
"text_loss": 0.5170195698738098
@@ -3361,13 +3361,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0007059999999999999,
- "loss": 0.0819,
+ "loss": 0.0815,
"macro_f1": 0.32098764181137085,
"num_tokens": 569449.0,
"repeat_count": 0.0,
- "routers_loss": 0.07853665202856064,
+ "routers_loss": 0.07515309751033783,
"skip_count": 2.0,
"step": 354,
"text_loss": 0.34507250785827637
@@ -3380,13 +3380,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.251953125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.00071,
- "loss": 0.0804,
+ "loss": 0.0791,
"macro_f1": 0.3144654333591461,
"num_tokens": 572761.0,
"repeat_count": 1.0,
- "routers_loss": 0.2216549813747406,
+ "routers_loss": 0.20768006145954132,
"skip_count": 2.0,
"step": 356,
"text_loss": 0.3158532381057739
@@ -3399,13 +3399,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.185546875,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.000714,
- "loss": 0.0675,
+ "loss": 0.0682,
"macro_f1": 0.3333333432674408,
"num_tokens": 575909.0,
"repeat_count": 0.0,
- "routers_loss": 0.02423691377043724,
+ "routers_loss": 0.025329967960715294,
"skip_count": 0.0,
"step": 358,
"text_loss": 0.21455390751361847
@@ -3413,18 +3413,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.6903434106251836,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.21484375,
"learning_rate": 0.000718,
- "loss": 0.0781,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0775,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 579186.0,
"repeat_count": 1.0,
- "routers_loss": 0.07496294379234314,
+ "routers_loss": 0.07676175981760025,
"skip_count": 0.0,
"step": 360,
"text_loss": 0.61895352602005
@@ -3437,13 +3437,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2138671875,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000722,
- "loss": 0.0778,
+ "loss": 0.0781,
"macro_f1": 0.32098767161369324,
"num_tokens": 582437.0,
"repeat_count": 0.0,
- "routers_loss": 0.08181872963905334,
+ "routers_loss": 0.08070661872625351,
"skip_count": 1.0,
"step": 362,
"text_loss": 0.20557661354541779
@@ -3456,13 +3456,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2216796875,
"learning_rate": 0.000726,
- "loss": 0.1112,
+ "loss": 0.11,
"macro_f1": 0.3333333432674408,
"num_tokens": 586096.0,
"repeat_count": 0.0,
- "routers_loss": 0.016959719359874725,
+ "routers_loss": 0.015891313552856445,
"skip_count": 0.0,
"step": 364,
"text_loss": 0.597991943359375
@@ -3475,13 +3475,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.15625,
"learning_rate": 0.00073,
- "loss": 0.0577,
+ "loss": 0.0573,
"macro_f1": 0.3076923191547394,
"num_tokens": 589520.0,
"repeat_count": 1.0,
- "routers_loss": 0.13295969367027283,
+ "routers_loss": 0.12844261527061462,
"skip_count": 3.0,
"step": 366,
"text_loss": 0.2944789230823517
@@ -3494,13 +3494,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1455078125,
+ "grad_norm": 0.150390625,
"learning_rate": 0.000734,
- "loss": 0.0986,
+ "loss": 0.1005,
"macro_f1": 0.3333333432674408,
"num_tokens": 592691.0,
"repeat_count": 0.0,
- "routers_loss": 0.02476893551647663,
+ "routers_loss": 0.02382199838757515,
"skip_count": 0.0,
"step": 368,
"text_loss": 0.23989969491958618
@@ -3513,13 +3513,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1796875,
"learning_rate": 0.000738,
- "loss": 0.0682,
+ "loss": 0.0661,
"macro_f1": 0.3333333432674408,
"num_tokens": 596004.0,
"repeat_count": 0.0,
- "routers_loss": 0.019863395020365715,
+ "routers_loss": 0.018812084570527077,
"skip_count": 0.0,
"step": 370,
"text_loss": 0.22111408412456512
@@ -3532,13 +3532,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.000742,
- "loss": 0.0663,
+ "loss": 0.0666,
"macro_f1": 0.3272727429866791,
"num_tokens": 599087.0,
"repeat_count": 0.0,
- "routers_loss": 0.07230417430400848,
+ "routers_loss": 0.08290331065654755,
"skip_count": 1.0,
"step": 372,
"text_loss": 0.2567356526851654
@@ -3551,13 +3551,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.000746,
- "loss": 0.0986,
+ "loss": 0.0941,
"macro_f1": 0.32098764181137085,
"num_tokens": 602330.0,
"repeat_count": 1.0,
- "routers_loss": 0.11727793514728546,
+ "routers_loss": 0.11482042074203491,
"skip_count": 1.0,
"step": 374,
"text_loss": 0.7217292785644531
@@ -3570,13 +3570,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.224609375,
+ "grad_norm": 0.2265625,
"learning_rate": 0.00075,
- "loss": 0.0724,
+ "loss": 0.0728,
"macro_f1": 0.3272727429866791,
"num_tokens": 605503.0,
"repeat_count": 1.0,
- "routers_loss": 0.13495951890945435,
+ "routers_loss": 0.11849870532751083,
"skip_count": 0.0,
"step": 376,
"text_loss": 0.5122153759002686
@@ -3589,13 +3589,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23046875,
+ "grad_norm": 0.2333984375,
"learning_rate": 0.000754,
- "loss": 0.0823,
+ "loss": 0.0835,
"macro_f1": 0.32098767161369324,
"num_tokens": 608505.0,
"repeat_count": 0.0,
- "routers_loss": 0.07612533867359161,
+ "routers_loss": 0.07090992480516434,
"skip_count": 1.0,
"step": 378,
"text_loss": 0.2204965502023697
@@ -3608,13 +3608,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.1826171875,
"learning_rate": 0.000758,
- "loss": 0.0803,
+ "loss": 0.0794,
"macro_f1": 0.3272727429866791,
"num_tokens": 611193.0,
"repeat_count": 0.0,
- "routers_loss": 0.0484120175242424,
+ "routers_loss": 0.03812089189887047,
"skip_count": 1.0,
"step": 380,
"text_loss": 0.44909021258354187
@@ -3627,13 +3627,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.000762,
- "loss": 0.0866,
+ "loss": 0.0882,
"macro_f1": 0.3272727429866791,
"num_tokens": 614231.0,
"repeat_count": 1.0,
- "routers_loss": 0.10939671844244003,
+ "routers_loss": 0.10270529240369797,
"skip_count": 0.0,
"step": 382,
"text_loss": 0.13624964654445648
@@ -3646,13 +3646,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.326171875,
+ "grad_norm": 0.330078125,
"learning_rate": 0.0007660000000000001,
- "loss": 0.1083,
+ "loss": 0.1107,
"macro_f1": 0.32098764181137085,
"num_tokens": 617090.0,
"repeat_count": 1.0,
- "routers_loss": 0.11382336914539337,
+ "routers_loss": 0.11624004691839218,
"skip_count": 1.0,
"step": 384,
"text_loss": 0.7314052581787109
@@ -3667,11 +3667,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1396484375,
"learning_rate": 0.0007700000000000001,
- "loss": 0.0616,
+ "loss": 0.0628,
"macro_f1": 0.32098764181137085,
"num_tokens": 620596.0,
"repeat_count": 0.0,
- "routers_loss": 0.07494530081748962,
+ "routers_loss": 0.07114322483539581,
"skip_count": 2.0,
"step": 386,
"text_loss": 0.503322958946228
@@ -3684,13 +3684,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.298828125,
+ "grad_norm": 0.306640625,
"learning_rate": 0.0007740000000000001,
- "loss": 0.0816,
+ "loss": 0.0829,
"macro_f1": 0.32098764181137085,
"num_tokens": 624108.0,
"repeat_count": 0.0,
- "routers_loss": 0.05718417093157768,
+ "routers_loss": 0.06061873584985733,
"skip_count": 2.0,
"step": 388,
"text_loss": 0.11481904983520508
@@ -3703,13 +3703,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.2099609375,
"learning_rate": 0.000778,
- "loss": 0.0783,
+ "loss": 0.0791,
"macro_f1": 0.3006536364555359,
"num_tokens": 626895.0,
"repeat_count": 1.0,
- "routers_loss": 0.2848989963531494,
+ "routers_loss": 0.2921771705150604,
"skip_count": 4.0,
"step": 390,
"text_loss": 0.3069624602794647
@@ -3722,13 +3722,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.30078125,
+ "grad_norm": 0.30859375,
"learning_rate": 0.000782,
- "loss": 0.0608,
+ "loss": 0.0605,
"macro_f1": 0.3076923191547394,
"num_tokens": 630204.0,
"repeat_count": 0.0,
- "routers_loss": 0.2050076276063919,
+ "routers_loss": 0.202707901597023,
"skip_count": 4.0,
"step": 392,
"text_loss": 0.6022785305976868
@@ -3741,13 +3741,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.29296875,
"learning_rate": 0.000786,
- "loss": 0.0863,
+ "loss": 0.0877,
"macro_f1": 0.3333333432674408,
"num_tokens": 634373.0,
"repeat_count": 0.0,
- "routers_loss": 0.020946886390447617,
+ "routers_loss": 0.0221510399132967,
"skip_count": 0.0,
"step": 394,
"text_loss": 0.26787394285202026
@@ -3760,13 +3760,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.376953125,
+ "grad_norm": 0.37890625,
"learning_rate": 0.00079,
- "loss": 0.0798,
+ "loss": 0.0805,
"macro_f1": 0.32098764181137085,
"num_tokens": 637442.0,
"repeat_count": 2.0,
- "routers_loss": 0.1270289123058319,
+ "routers_loss": 0.12636390328407288,
"skip_count": 0.0,
"step": 396,
"text_loss": 0.2799781560897827
@@ -3779,13 +3779,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.0007940000000000001,
- "loss": 0.0701,
+ "loss": 0.0724,
"macro_f1": 0.32098764181137085,
"num_tokens": 641231.0,
"repeat_count": 0.0,
- "routers_loss": 0.08012636005878448,
+ "routers_loss": 0.07933453470468521,
"skip_count": 2.0,
"step": 398,
"text_loss": 0.2507784366607666
@@ -3798,13 +3798,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.0007980000000000001,
- "loss": 0.0901,
+ "loss": 0.0909,
"macro_f1": 0.3272727429866791,
"num_tokens": 644560.0,
"repeat_count": 1.0,
- "routers_loss": 0.09315784275531769,
+ "routers_loss": 0.10324911028146744,
"skip_count": 0.0,
"step": 400,
"text_loss": 0.7756280303001404
@@ -3817,13 +3817,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0008020000000000001,
- "loss": 0.078,
+ "loss": 0.0783,
"macro_f1": 0.3144654333591461,
"num_tokens": 647393.0,
"repeat_count": 1.0,
- "routers_loss": 0.18492189049720764,
+ "routers_loss": 0.18546262383460999,
"skip_count": 2.0,
"step": 402,
"text_loss": 0.5013328194618225
@@ -3836,13 +3836,13 @@
"f1_execute": 0.8571428656578064,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0008060000000000001,
- "loss": 0.0801,
+ "loss": 0.0787,
"macro_f1": 0.2857142984867096,
"num_tokens": 650355.0,
"repeat_count": 3.0,
- "routers_loss": 0.32641324400901794,
+ "routers_loss": 0.3280293643474579,
"skip_count": 4.0,
"step": 404,
"text_loss": 0.2842077314853668
@@ -3855,13 +3855,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2080078125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.0008100000000000001,
- "loss": 0.0905,
+ "loss": 0.0901,
"macro_f1": 0.3333333432674408,
"num_tokens": 654280.0,
"repeat_count": 0.0,
- "routers_loss": 0.02722037397325039,
+ "routers_loss": 0.02623247355222702,
"skip_count": 0.0,
"step": 406,
"text_loss": 0.46742817759513855
@@ -3874,13 +3874,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.216796875,
"learning_rate": 0.0008139999999999999,
- "loss": 0.0958,
+ "loss": 0.0945,
"macro_f1": 0.3333333432674408,
"num_tokens": 657568.0,
"repeat_count": 0.0,
- "routers_loss": 0.010129833593964577,
+ "routers_loss": 0.009744114242494106,
"skip_count": 0.0,
"step": 408,
"text_loss": 0.7168047428131104
@@ -3893,13 +3893,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2373046875,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.0008179999999999999,
- "loss": 0.1084,
+ "loss": 0.1065,
"macro_f1": 0.32098764181137085,
"num_tokens": 660593.0,
"repeat_count": 0.0,
- "routers_loss": 0.07298308610916138,
+ "routers_loss": 0.07591600716114044,
"skip_count": 2.0,
"step": 410,
"text_loss": 0.449823260307312
@@ -3912,13 +3912,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15625,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0008219999999999999,
- "loss": 0.0802,
+ "loss": 0.0795,
"macro_f1": 0.3333333432674408,
"num_tokens": 663916.0,
"repeat_count": 0.0,
- "routers_loss": 0.024257874116301537,
+ "routers_loss": 0.02076602540910244,
"skip_count": 0.0,
"step": 412,
"text_loss": 0.4764713943004608
@@ -3931,13 +3931,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.000826,
- "loss": 0.0842,
+ "loss": 0.0836,
"macro_f1": 0.3272727429866791,
"num_tokens": 667502.0,
"repeat_count": 0.0,
- "routers_loss": 0.048864223062992096,
+ "routers_loss": 0.049170155078172684,
"skip_count": 1.0,
"step": 414,
"text_loss": 0.30333325266838074
@@ -3950,13 +3950,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1513671875,
"learning_rate": 0.00083,
- "loss": 0.1026,
+ "loss": 0.1021,
"macro_f1": 0.3272727429866791,
"num_tokens": 670510.0,
"repeat_count": 1.0,
- "routers_loss": 0.1592330038547516,
+ "routers_loss": 0.15554003417491913,
"skip_count": 0.0,
"step": 416,
"text_loss": 0.3691870868206024
@@ -3969,13 +3969,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000834,
- "loss": 0.0963,
+ "loss": 0.1013,
"macro_f1": 0.3333333432674408,
"num_tokens": 674761.0,
"repeat_count": 0.0,
- "routers_loss": 0.02291976846754551,
+ "routers_loss": 0.024516675621271133,
"skip_count": 0.0,
"step": 418,
"text_loss": 0.32850381731987
@@ -3988,13 +3988,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.000838,
- "loss": 0.0634,
+ "loss": 0.0649,
"macro_f1": 0.3333333432674408,
"num_tokens": 678055.0,
"repeat_count": 0.0,
- "routers_loss": 0.010272650048136711,
+ "routers_loss": 0.011026890948414803,
"skip_count": 0.0,
"step": 420,
"text_loss": 0.6637290716171265
@@ -4007,13 +4007,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000842,
- "loss": 0.0786,
+ "loss": 0.0771,
"macro_f1": 0.3272727429866791,
"num_tokens": 680979.0,
"repeat_count": 0.0,
- "routers_loss": 0.0692613497376442,
+ "routers_loss": 0.07451887428760529,
"skip_count": 1.0,
"step": 422,
"text_loss": 0.27131685614585876
@@ -4026,13 +4026,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12890625,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.000846,
- "loss": 0.0706,
+ "loss": 0.0714,
"macro_f1": 0.32098764181137085,
"num_tokens": 684144.0,
"repeat_count": 1.0,
- "routers_loss": 0.12713804841041565,
+ "routers_loss": 0.11341800540685654,
"skip_count": 1.0,
"step": 424,
"text_loss": 0.652126669883728
@@ -4045,13 +4045,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.00085,
- "loss": 0.0758,
+ "loss": 0.0754,
"macro_f1": 0.3272727429866791,
"num_tokens": 687004.0,
"repeat_count": 1.0,
- "routers_loss": 0.08670130372047424,
+ "routers_loss": 0.08985847979784012,
"skip_count": 0.0,
"step": 426,
"text_loss": 0.2589428424835205
@@ -4064,13 +4064,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.23828125,
"learning_rate": 0.000854,
- "loss": 0.0857,
+ "loss": 0.0866,
"macro_f1": 0.3333333432674408,
"num_tokens": 689702.0,
"repeat_count": 0.0,
- "routers_loss": 0.01053862925618887,
+ "routers_loss": 0.011355436407029629,
"skip_count": 0.0,
"step": 428,
"text_loss": 0.8909716010093689
@@ -4083,13 +4083,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.000858,
- "loss": 0.0615,
+ "loss": 0.0623,
"macro_f1": 0.3333333432674408,
"num_tokens": 692698.0,
"repeat_count": 0.0,
- "routers_loss": 0.012946994043886662,
+ "routers_loss": 0.013788948766887188,
"skip_count": 0.0,
"step": 430,
"text_loss": 0.19141142070293427
@@ -4102,13 +4102,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.000862,
- "loss": 0.0498,
+ "loss": 0.0499,
"macro_f1": 0.32098764181137085,
"num_tokens": 696007.0,
"repeat_count": 0.0,
- "routers_loss": 0.08222822099924088,
+ "routers_loss": 0.07998392730951309,
"skip_count": 2.0,
"step": 432,
"text_loss": 0.1611809879541397
@@ -4121,13 +4121,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.173828125,
"learning_rate": 0.000866,
- "loss": 0.0532,
+ "loss": 0.0541,
"macro_f1": 0.32098764181137085,
"num_tokens": 700271.0,
"repeat_count": 0.0,
- "routers_loss": 0.07086442410945892,
+ "routers_loss": 0.06988382339477539,
"skip_count": 2.0,
"step": 434,
"text_loss": 0.37254223227500916
@@ -4140,13 +4140,13 @@
"f1_execute": 0.8333333730697632,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.00087,
- "loss": 0.0825,
+ "loss": 0.0834,
"macro_f1": 0.2777777910232544,
"num_tokens": 703519.0,
"repeat_count": 3.0,
- "routers_loss": 0.29007306694984436,
+ "routers_loss": 0.28240787982940674,
"skip_count": 5.0,
"step": 436,
"text_loss": 0.29636648297309875
@@ -4159,13 +4159,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.396484375,
+ "grad_norm": 0.423828125,
"learning_rate": 0.000874,
- "loss": 0.0658,
+ "loss": 0.0657,
"macro_f1": 0.3333333432674408,
"num_tokens": 706826.0,
"repeat_count": 0.0,
- "routers_loss": 0.014652491547167301,
+ "routers_loss": 0.013924967497587204,
"skip_count": 0.0,
"step": 438,
"text_loss": 0.20867908000946045
@@ -4178,13 +4178,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000878,
- "loss": 0.0685,
+ "loss": 0.0657,
"macro_f1": 0.3333333432674408,
"num_tokens": 710530.0,
"repeat_count": 0.0,
- "routers_loss": 0.013720969669520855,
+ "routers_loss": 0.01170142088085413,
"skip_count": 0.0,
"step": 440,
"text_loss": 0.7273373007774353
@@ -4197,13 +4197,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.173828125,
+ "grad_norm": 0.171875,
"learning_rate": 0.000882,
- "loss": 0.0771,
+ "loss": 0.076,
"macro_f1": 0.3333333432674408,
"num_tokens": 713503.0,
"repeat_count": 0.0,
- "routers_loss": 0.011687638238072395,
+ "routers_loss": 0.011930872686207294,
"skip_count": 0.0,
"step": 442,
"text_loss": 0.39314430952072144
@@ -4216,13 +4216,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0008860000000000001,
- "loss": 0.0604,
+ "loss": 0.0592,
"macro_f1": 0.3333333432674408,
"num_tokens": 716582.0,
"repeat_count": 0.0,
- "routers_loss": 0.007869532331824303,
+ "routers_loss": 0.008630385622382164,
"skip_count": 0.0,
"step": 444,
"text_loss": 0.5925271511077881
@@ -4230,18 +4230,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.0939242735544465,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.23046875,
"learning_rate": 0.0008900000000000001,
- "loss": 0.0797,
- "macro_f1": 0.3076923191547394,
+ "loss": 0.0811,
+ "macro_f1": 0.3006536066532135,
"num_tokens": 719941.0,
"repeat_count": 3.0,
- "routers_loss": 0.3034668564796448,
+ "routers_loss": 0.3015584945678711,
"skip_count": 1.0,
"step": 446,
"text_loss": 0.5059905052185059
@@ -4254,13 +4254,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2314453125,
+ "grad_norm": 0.203125,
"learning_rate": 0.000894,
- "loss": 0.0823,
+ "loss": 0.0822,
"macro_f1": 0.31446540355682373,
"num_tokens": 723113.0,
"repeat_count": 1.0,
- "routers_loss": 0.11066079139709473,
+ "routers_loss": 0.10897493362426758,
"skip_count": 1.0,
"step": 448,
"text_loss": 0.19616436958312988
@@ -4273,13 +4273,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.33984375,
"learning_rate": 0.000898,
- "loss": 0.0773,
+ "loss": 0.0782,
"macro_f1": 0.32098764181137085,
"num_tokens": 726193.0,
"repeat_count": 0.0,
- "routers_loss": 0.0755370482802391,
+ "routers_loss": 0.07236456125974655,
"skip_count": 2.0,
"step": 450,
"text_loss": 0.1773054152727127
@@ -4292,13 +4292,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.3203125,
"learning_rate": 0.000902,
- "loss": 0.0596,
+ "loss": 0.058,
"macro_f1": 0.3272727429866791,
"num_tokens": 729275.0,
"repeat_count": 1.0,
- "routers_loss": 0.08470689505338669,
+ "routers_loss": 0.08184371143579483,
"skip_count": 0.0,
"step": 452,
"text_loss": 0.4927310049533844
@@ -4311,13 +4311,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19921875,
+ "grad_norm": 0.1953125,
"learning_rate": 0.000906,
- "loss": 0.0608,
+ "loss": 0.0607,
"macro_f1": 0.3333333432674408,
"num_tokens": 731948.0,
"repeat_count": 0.0,
- "routers_loss": 0.0130238626152277,
+ "routers_loss": 0.014033539220690727,
"skip_count": 0.0,
"step": 454,
"text_loss": 0.4745742678642273
@@ -4330,13 +4330,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.00091,
- "loss": 0.0652,
+ "loss": 0.0651,
"macro_f1": 0.3333333432674408,
"num_tokens": 735351.0,
"repeat_count": 0.0,
- "routers_loss": 0.007108641788363457,
+ "routers_loss": 0.0071774693205952644,
"skip_count": 0.0,
"step": 456,
"text_loss": 0.18523462116718292
@@ -4351,11 +4351,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.400390625,
"learning_rate": 0.0009140000000000001,
- "loss": 0.0746,
+ "loss": 0.0738,
"macro_f1": 0.5492662787437439,
"num_tokens": 738587.0,
"repeat_count": 0.0,
- "routers_loss": 0.06834109872579575,
+ "routers_loss": 0.07781517505645752,
"skip_count": 2.0,
"step": 458,
"text_loss": 0.3459635376930237
@@ -4368,13 +4368,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.28125,
"learning_rate": 0.0009180000000000001,
- "loss": 0.0733,
+ "loss": 0.0723,
"macro_f1": 0.3076923191547394,
"num_tokens": 741779.0,
"repeat_count": 0.0,
- "routers_loss": 0.10230778902769089,
+ "routers_loss": 0.09529037028551102,
"skip_count": 2.0,
"step": 460,
"text_loss": 0.20197433233261108
@@ -4387,13 +4387,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.1865234375,
"learning_rate": 0.0009220000000000001,
- "loss": 0.0528,
+ "loss": 0.0519,
"macro_f1": 0.3333333432674408,
"num_tokens": 745355.0,
"repeat_count": 0.0,
- "routers_loss": 0.009987542405724525,
+ "routers_loss": 0.009765669703483582,
"skip_count": 0.0,
"step": 462,
"text_loss": 0.7031404376029968
@@ -4406,13 +4406,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.125,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009260000000000001,
- "loss": 0.0536,
+ "loss": 0.0527,
"macro_f1": 0.3272727429866791,
"num_tokens": 748628.0,
"repeat_count": 0.0,
- "routers_loss": 0.03448869287967682,
+ "routers_loss": 0.03344850242137909,
"skip_count": 1.0,
"step": 464,
"text_loss": 0.21274663507938385
@@ -4425,13 +4425,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.173828125,
"learning_rate": 0.00093,
- "loss": 0.053,
+ "loss": 0.0534,
"macro_f1": 0.3076923191547394,
"num_tokens": 751472.0,
"repeat_count": 2.0,
- "routers_loss": 0.13631699979305267,
+ "routers_loss": 0.1354292333126068,
"skip_count": 2.0,
"step": 466,
"text_loss": 0.5350717306137085
@@ -4444,13 +4444,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.142578125,
"learning_rate": 0.000934,
- "loss": 0.06,
+ "loss": 0.0598,
"macro_f1": 0.3272727429866791,
"num_tokens": 754479.0,
"repeat_count": 0.0,
- "routers_loss": 0.053951870650053024,
+ "routers_loss": 0.056420840322971344,
"skip_count": 1.0,
"step": 468,
"text_loss": 0.28153330087661743
@@ -4463,13 +4463,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.228515625,
+ "grad_norm": 0.234375,
"learning_rate": 0.0009379999999999999,
- "loss": 0.059,
+ "loss": 0.0597,
"macro_f1": 0.31446540355682373,
"num_tokens": 757872.0,
"repeat_count": 1.0,
- "routers_loss": 0.14479905366897583,
+ "routers_loss": 0.1622387170791626,
"skip_count": 1.0,
"step": 470,
"text_loss": 0.22956843674182892
@@ -4482,13 +4482,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.44140625,
+ "grad_norm": 0.5,
"learning_rate": 0.000942,
- "loss": 0.0913,
+ "loss": 0.0953,
"macro_f1": 0.32098764181137085,
"num_tokens": 760468.0,
"repeat_count": 0.0,
- "routers_loss": 0.056221429258584976,
+ "routers_loss": 0.05146972835063934,
"skip_count": 2.0,
"step": 472,
"text_loss": 0.4513966739177704
@@ -4501,13 +4501,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000946,
- "loss": 0.0591,
+ "loss": 0.0592,
"macro_f1": 0.3272727429866791,
"num_tokens": 763519.0,
"repeat_count": 1.0,
- "routers_loss": 0.09729792177677155,
+ "routers_loss": 0.09022669494152069,
"skip_count": 0.0,
"step": 474,
"text_loss": 0.25758957862854004
@@ -4520,13 +4520,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12158203125,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.00095,
- "loss": 0.0496,
+ "loss": 0.0498,
"macro_f1": 0.3272727429866791,
"num_tokens": 767391.0,
"repeat_count": 0.0,
- "routers_loss": 0.029447713866829872,
+ "routers_loss": 0.03044828027486801,
"skip_count": 1.0,
"step": 476,
"text_loss": 0.21366681158542633
@@ -4539,13 +4539,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.271484375,
+ "grad_norm": 0.291015625,
"learning_rate": 0.000954,
- "loss": 0.0801,
+ "loss": 0.0802,
"macro_f1": 0.3272727429866791,
"num_tokens": 770338.0,
"repeat_count": 0.0,
- "routers_loss": 0.09337342530488968,
+ "routers_loss": 0.10397060960531235,
"skip_count": 1.0,
"step": 478,
"text_loss": 1.0396177768707275
@@ -4560,11 +4560,11 @@
"f1_skip": 0.0,
"grad_norm": 0.267578125,
"learning_rate": 0.000958,
- "loss": 0.1102,
+ "loss": 0.1099,
"macro_f1": 0.285714328289032,
"num_tokens": 773699.0,
"repeat_count": 2.0,
- "routers_loss": 0.23193210363388062,
+ "routers_loss": 0.22604143619537354,
"skip_count": 4.0,
"step": 480,
"text_loss": 0.2570283114910126
@@ -4572,18 +4572,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.2629879659524508,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.146484375,
"learning_rate": 0.000962,
- "loss": 0.0669,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0667,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 777473.0,
"repeat_count": 0.0,
- "routers_loss": 0.046257760375738144,
+ "routers_loss": 0.048258859664201736,
"skip_count": 1.0,
"step": 482,
"text_loss": 0.2540103495121002
@@ -4596,13 +4596,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1708984375,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000966,
- "loss": 0.0552,
+ "loss": 0.0592,
"macro_f1": 0.3333333432674408,
"num_tokens": 780833.0,
"repeat_count": 0.0,
- "routers_loss": 0.01683143898844719,
+ "routers_loss": 0.023018671199679375,
"skip_count": 0.0,
"step": 484,
"text_loss": 0.38524550199508667
@@ -4615,13 +4615,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.326171875,
+ "grad_norm": 0.314453125,
"learning_rate": 0.0009699999999999999,
- "loss": 0.071,
+ "loss": 0.0709,
"macro_f1": 0.3272727429866791,
"num_tokens": 783656.0,
"repeat_count": 0.0,
- "routers_loss": 0.04129387438297272,
+ "routers_loss": 0.044845327734947205,
"skip_count": 1.0,
"step": 486,
"text_loss": 0.5859048366546631
@@ -4634,13 +4634,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000974,
- "loss": 0.0605,
+ "loss": 0.0615,
"macro_f1": 0.3333333432674408,
"num_tokens": 787173.0,
"repeat_count": 0.0,
- "routers_loss": 0.01262948103249073,
+ "routers_loss": 0.010898692533373833,
"skip_count": 0.0,
"step": 488,
"text_loss": 0.3456067442893982
@@ -4653,13 +4653,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000978,
- "loss": 0.081,
+ "loss": 0.0796,
"macro_f1": 0.32098764181137085,
"num_tokens": 790395.0,
"repeat_count": 0.0,
- "routers_loss": 0.07404553890228271,
+ "routers_loss": 0.06497956812381744,
"skip_count": 2.0,
"step": 490,
"text_loss": 0.3751123249530792
@@ -4672,13 +4672,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000982,
- "loss": 0.0751,
+ "loss": 0.0772,
"macro_f1": 0.3272727429866791,
"num_tokens": 793137.0,
"repeat_count": 0.0,
- "routers_loss": 0.06795930862426758,
+ "routers_loss": 0.07763728499412537,
"skip_count": 1.0,
"step": 492,
"text_loss": 0.43296709656715393
@@ -4691,13 +4691,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1416015625,
"learning_rate": 0.0009860000000000001,
- "loss": 0.0804,
+ "loss": 0.0819,
"macro_f1": 0.3333333432674408,
"num_tokens": 796497.0,
"repeat_count": 0.0,
- "routers_loss": 0.02233024686574936,
+ "routers_loss": 0.02127906307578087,
"skip_count": 0.0,
"step": 494,
"text_loss": 0.4841311275959015
@@ -4710,13 +4710,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.00099,
- "loss": 0.0731,
+ "loss": 0.073,
"macro_f1": 0.3272727429866791,
"num_tokens": 799361.0,
"repeat_count": 1.0,
- "routers_loss": 0.07979031652212143,
+ "routers_loss": 0.09518691152334213,
"skip_count": 0.0,
"step": 496,
"text_loss": 0.5094487071037292
@@ -4729,13 +4729,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.000994,
- "loss": 0.0795,
+ "loss": 0.0789,
"macro_f1": 0.5492662787437439,
"num_tokens": 802629.0,
"repeat_count": 0.0,
- "routers_loss": 0.045646365731954575,
+ "routers_loss": 0.0563947930932045,
"skip_count": 2.0,
"step": 498,
"text_loss": 0.42783617973327637
@@ -4748,13 +4748,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.1865234375,
"learning_rate": 0.000998,
"loss": 0.0476,
"macro_f1": 0.3272727429866791,
"num_tokens": 805881.0,
"repeat_count": 1.0,
- "routers_loss": 0.09717849642038345,
+ "routers_loss": 0.10570426285266876,
"skip_count": 0.0,
"step": 500,
"text_loss": 0.28395503759384155
@@ -4767,13 +4767,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.30078125,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009999999760498814,
- "loss": 0.0894,
+ "loss": 0.0849,
"macro_f1": 0.5492662787437439,
"num_tokens": 809283.0,
"repeat_count": 0.0,
- "routers_loss": 0.03948225453495979,
+ "routers_loss": 0.031202208250761032,
"skip_count": 2.0,
"step": 502,
"text_loss": 0.32970911264419556
@@ -4786,13 +4786,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15625,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009999997844489475,
- "loss": 0.0557,
+ "loss": 0.0574,
"macro_f1": 0.3272727429866791,
"num_tokens": 812440.0,
"repeat_count": 0.0,
- "routers_loss": 0.0742638111114502,
+ "routers_loss": 0.07647835463285446,
"skip_count": 1.0,
"step": 504,
"text_loss": 0.4901447296142578
@@ -4805,13 +4805,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.25,
"learning_rate": 0.000999999401247153,
- "loss": 0.0682,
+ "loss": 0.0668,
"macro_f1": 0.32098764181137085,
"num_tokens": 815716.0,
"repeat_count": 0.0,
- "routers_loss": 0.08293049037456512,
+ "routers_loss": 0.08515176922082901,
"skip_count": 2.0,
"step": 506,
"text_loss": 0.6157599687576294
@@ -4824,13 +4824,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.25390625,
"learning_rate": 0.0009999988264446445,
- "loss": 0.0697,
+ "loss": 0.0686,
"macro_f1": 0.3333333432674408,
"num_tokens": 819086.0,
"repeat_count": 0.0,
- "routers_loss": 0.010080376639962196,
+ "routers_loss": 0.00946938619017601,
"skip_count": 0.0,
"step": 508,
"text_loss": 0.5053519010543823
@@ -4843,13 +4843,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009999980600416424,
- "loss": 0.0611,
+ "loss": 0.0574,
"macro_f1": 0.3333333432674408,
"num_tokens": 822268.0,
"repeat_count": 0.0,
- "routers_loss": 0.009179878048598766,
+ "routers_loss": 0.01058756373822689,
"skip_count": 0.0,
"step": 510,
"text_loss": 0.5570021867752075
@@ -4862,13 +4862,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11083984375,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.000999997102038441,
- "loss": 0.0689,
+ "loss": 0.0678,
"macro_f1": 0.3333333432674408,
"num_tokens": 825728.0,
"repeat_count": 0.0,
- "routers_loss": 0.006718529388308525,
+ "routers_loss": 0.008705209009349346,
"skip_count": 0.0,
"step": 512,
"text_loss": 0.6519040465354919
@@ -4881,13 +4881,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.220703125,
"learning_rate": 0.0009999959524354064,
- "loss": 0.0826,
+ "loss": 0.083,
"macro_f1": 0.3272727429866791,
"num_tokens": 829459.0,
"repeat_count": 0.0,
- "routers_loss": 0.049344487488269806,
+ "routers_loss": 0.04024193435907364,
"skip_count": 1.0,
"step": 514,
"text_loss": 0.5290043950080872
@@ -4900,13 +4900,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.259765625,
+ "grad_norm": 0.25390625,
"learning_rate": 0.00099999461123298,
- "loss": 0.0739,
+ "loss": 0.0727,
"macro_f1": 0.3333333432674408,
"num_tokens": 832291.0,
"repeat_count": 0.0,
- "routers_loss": 0.013402626849710941,
+ "routers_loss": 0.015742862597107887,
"skip_count": 0.0,
"step": 516,
"text_loss": 0.7910057902336121
@@ -4919,13 +4919,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.000999993078431675,
- "loss": 0.0761,
+ "loss": 0.0759,
"macro_f1": 0.3076923191547394,
"num_tokens": 835399.0,
"repeat_count": 1.0,
- "routers_loss": 0.16964484751224518,
+ "routers_loss": 0.16753782331943512,
"skip_count": 3.0,
"step": 518,
"text_loss": 0.45196083188056946
@@ -4938,13 +4938,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.236328125,
"learning_rate": 0.0009999913540320792,
- "loss": 0.095,
+ "loss": 0.0968,
"macro_f1": 0.31446540355682373,
"num_tokens": 838993.0,
"repeat_count": 0.0,
- "routers_loss": 0.08609295636415482,
+ "routers_loss": 0.09357143193483353,
"skip_count": 2.0,
"step": 520,
"text_loss": 0.5499435663223267
@@ -4957,13 +4957,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.2392578125,
+ "grad_norm": 0.2451171875,
"learning_rate": 0.0009999894380348536,
- "loss": 0.0816,
+ "loss": 0.0821,
"macro_f1": 0.5492662787437439,
"num_tokens": 842652.0,
"repeat_count": 0.0,
- "routers_loss": 0.05354784056544304,
+ "routers_loss": 0.056803856045007706,
"skip_count": 2.0,
"step": 522,
"text_loss": 0.197520449757576
@@ -4976,13 +4976,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.2333984375,
"learning_rate": 0.000999987330440732,
- "loss": 0.0715,
+ "loss": 0.0725,
"macro_f1": 0.4871794879436493,
"num_tokens": 847061.0,
"repeat_count": 0.0,
- "routers_loss": 0.09146631509065628,
+ "routers_loss": 0.08962195366621017,
"skip_count": 3.0,
"step": 524,
"text_loss": 0.27509039640426636
@@ -4995,13 +4995,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.189453125,
"learning_rate": 0.000999985031250522,
- "loss": 0.0574,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 850780.0,
"repeat_count": 0.0,
- "routers_loss": 0.02344255894422531,
+ "routers_loss": 0.022930558770895004,
"skip_count": 0.0,
"step": 526,
"text_loss": 0.13291706144809723
@@ -5014,13 +5014,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.197265625,
"learning_rate": 0.0009999825404651053,
- "loss": 0.0621,
+ "loss": 0.0614,
"macro_f1": 0.3333333432674408,
"num_tokens": 853886.0,
"repeat_count": 0.0,
- "routers_loss": 0.018271517008543015,
+ "routers_loss": 0.017097990959882736,
"skip_count": 0.0,
"step": 528,
"text_loss": 0.21706295013427734
@@ -5033,13 +5033,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2060546875,
+ "grad_norm": 0.212890625,
"learning_rate": 0.0009999798580854356,
- "loss": 0.0717,
+ "loss": 0.0724,
"macro_f1": 0.3333333432674408,
"num_tokens": 857364.0,
"repeat_count": 0.0,
- "routers_loss": 0.026990914717316628,
+ "routers_loss": 0.02831801027059555,
"skip_count": 0.0,
"step": 530,
"text_loss": 0.9035662412643433
@@ -5052,13 +5052,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16015625,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.000999976984112541,
- "loss": 0.0681,
+ "loss": 0.0674,
"macro_f1": 0.3333333432674408,
"num_tokens": 860661.0,
"repeat_count": 0.0,
- "routers_loss": 0.019737249240279198,
+ "routers_loss": 0.019671892747282982,
"skip_count": 0.0,
"step": 532,
"text_loss": 0.8354863524436951
@@ -5071,13 +5071,13 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.2890625,
"learning_rate": 0.0009999739185475231,
- "loss": 0.0978,
+ "loss": 0.0963,
"macro_f1": 0.47333335876464844,
"num_tokens": 864124.0,
"repeat_count": 2.0,
- "routers_loss": 0.212640181183815,
+ "routers_loss": 0.21383361518383026,
"skip_count": 3.0,
"step": 534,
"text_loss": 0.23422949016094208
@@ -5090,13 +5090,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0009999706613915565,
- "loss": 0.0602,
+ "loss": 0.0598,
"macro_f1": 0.32098767161369324,
"num_tokens": 866976.0,
"repeat_count": 0.0,
- "routers_loss": 0.07302755117416382,
+ "routers_loss": 0.07158871740102768,
"skip_count": 1.0,
"step": 536,
"text_loss": 0.11800774186849594
@@ -5109,13 +5109,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.296875,
+ "grad_norm": 0.26953125,
"learning_rate": 0.0009999672126458894,
- "loss": 0.0825,
+ "loss": 0.0822,
"macro_f1": 0.3272727429866791,
"num_tokens": 870549.0,
"repeat_count": 0.0,
- "routers_loss": 0.08667246252298355,
+ "routers_loss": 0.08185924589633942,
"skip_count": 1.0,
"step": 538,
"text_loss": 0.19232480227947235
@@ -5128,13 +5128,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1318359375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.000999963572311843,
- "loss": 0.0597,
+ "loss": 0.0604,
"macro_f1": 0.3333333432674408,
"num_tokens": 873733.0,
"repeat_count": 0.0,
- "routers_loss": 0.015047167427837849,
+ "routers_loss": 0.01633382774889469,
"skip_count": 0.0,
"step": 540,
"text_loss": 0.3725031912326813
@@ -5147,13 +5147,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009999597403908128,
- "loss": 0.076,
+ "loss": 0.0761,
"macro_f1": 0.3272727429866791,
"num_tokens": 877099.0,
"repeat_count": 0.0,
- "routers_loss": 0.07481446117162704,
+ "routers_loss": 0.0782657191157341,
"skip_count": 1.0,
"step": 542,
"text_loss": 0.17589199542999268
@@ -5166,13 +5166,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.0009999557168842669,
- "loss": 0.0724,
+ "loss": 0.0716,
"macro_f1": 0.5492662787437439,
"num_tokens": 879883.0,
"repeat_count": 0.0,
- "routers_loss": 0.049495212733745575,
+ "routers_loss": 0.05275818333029747,
"skip_count": 2.0,
"step": 544,
"text_loss": 0.26448264718055725
@@ -5185,13 +5185,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0009999515017937468,
- "loss": 0.0718,
+ "loss": 0.071,
"macro_f1": 0.32098764181137085,
"num_tokens": 882223.0,
"repeat_count": 0.0,
- "routers_loss": 0.08043002337217331,
+ "routers_loss": 0.09335892647504807,
"skip_count": 2.0,
"step": 546,
"text_loss": 0.208544060587883
@@ -5204,13 +5204,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.34765625,
+ "grad_norm": 0.376953125,
"learning_rate": 0.0009999470951208684,
- "loss": 0.086,
+ "loss": 0.0855,
"macro_f1": 0.32098764181137085,
"num_tokens": 885241.0,
"repeat_count": 2.0,
- "routers_loss": 0.22461950778961182,
+ "routers_loss": 0.22983254492282867,
"skip_count": 0.0,
"step": 548,
"text_loss": 0.6612338423728943
@@ -5223,13 +5223,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.216796875,
"learning_rate": 0.00099994249686732,
- "loss": 0.0798,
+ "loss": 0.0786,
"macro_f1": 0.3272727429866791,
"num_tokens": 887897.0,
"repeat_count": 1.0,
- "routers_loss": 0.11754962801933289,
+ "routers_loss": 0.12858282029628754,
"skip_count": 0.0,
"step": 550,
"text_loss": 0.4673548936843872
@@ -5242,13 +5242,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0009999377070348638,
- "loss": 0.0978,
+ "loss": 0.0944,
"macro_f1": 0.3333333432674408,
"num_tokens": 891224.0,
"repeat_count": 0.0,
- "routers_loss": 0.017412789165973663,
+ "routers_loss": 0.017421770840883255,
"skip_count": 0.0,
"step": 552,
"text_loss": 0.6419258117675781
@@ -5261,13 +5261,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.15625,
"learning_rate": 0.000999932725625335,
- "loss": 0.0792,
+ "loss": 0.0791,
"macro_f1": 0.32098764181137085,
"num_tokens": 894578.0,
"repeat_count": 0.0,
- "routers_loss": 0.08969525247812271,
+ "routers_loss": 0.07890026271343231,
"skip_count": 2.0,
"step": 554,
"text_loss": 0.5970752239227295
@@ -5280,13 +5280,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "grad_norm": 0.216796875,
"learning_rate": 0.0009999275526406427,
- "loss": 0.0803,
+ "loss": 0.0796,
"macro_f1": 0.31446540355682373,
"num_tokens": 897145.0,
"repeat_count": 1.0,
- "routers_loss": 0.09876437485218048,
+ "routers_loss": 0.09836960583925247,
"skip_count": 1.0,
"step": 556,
"text_loss": 0.752425491809845
@@ -5299,13 +5299,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1875,
"learning_rate": 0.0009999221880827693,
- "loss": 0.0887,
+ "loss": 0.0882,
"macro_f1": 0.3333333432674408,
"num_tokens": 900565.0,
"repeat_count": 0.0,
- "routers_loss": 0.019108204171061516,
+ "routers_loss": 0.017694659531116486,
"skip_count": 0.0,
"step": 558,
"text_loss": 0.195619136095047
@@ -5318,32 +5318,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2021484375,
"learning_rate": 0.0009999166319537703,
- "loss": 0.0573,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 903506.0,
"repeat_count": 0.0,
- "routers_loss": 0.019048813730478287,
+ "routers_loss": 0.019375264644622803,
"skip_count": 0.0,
"step": 560,
"text_loss": 0.4603337347507477
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 2.638685060170238,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1435546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.146484375,
"learning_rate": 0.0009999108842557748,
- "loss": 0.0947,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0953,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 906380.0,
"repeat_count": 0.0,
- "routers_loss": 0.11889495700597763,
+ "routers_loss": 0.12013207376003265,
"skip_count": 3.0,
"step": 562,
"text_loss": 0.6279402375221252
@@ -5356,13 +5356,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.228515625,
+ "grad_norm": 0.255859375,
"learning_rate": 0.0009999049449909854,
- "loss": 0.0771,
+ "loss": 0.0799,
"macro_f1": 0.3272727429866791,
"num_tokens": 909116.0,
"repeat_count": 0.0,
- "routers_loss": 0.06202332302927971,
+ "routers_loss": 0.06441342830657959,
"skip_count": 1.0,
"step": 564,
"text_loss": 0.23741699755191803
@@ -5375,13 +5375,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009998988141616781,
- "loss": 0.0623,
+ "loss": 0.064,
"macro_f1": 0.32098767161369324,
"num_tokens": 912189.0,
"repeat_count": 0.0,
- "routers_loss": 0.08294244855642319,
+ "routers_loss": 0.08309414982795715,
"skip_count": 1.0,
"step": 566,
"text_loss": 0.27780941128730774
@@ -5394,13 +5394,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009998924917702023,
- "loss": 0.0885,
+ "loss": 0.0876,
"macro_f1": 0.3272727429866791,
"num_tokens": 916279.0,
"repeat_count": 1.0,
- "routers_loss": 0.07545182853937149,
+ "routers_loss": 0.07197169959545135,
"skip_count": 0.0,
"step": 568,
"text_loss": 0.6371755599975586
@@ -5413,13 +5413,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2255859375,
"learning_rate": 0.0009998859778189806,
- "loss": 0.0712,
+ "loss": 0.0706,
"macro_f1": 0.3333333432674408,
"num_tokens": 919490.0,
"repeat_count": 0.0,
- "routers_loss": 0.008711219765245914,
+ "routers_loss": 0.008022273890674114,
"skip_count": 0.0,
"step": 570,
"text_loss": 0.6028938889503479
@@ -5432,13 +5432,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.000999879272310509,
- "loss": 0.0837,
+ "loss": 0.084,
"macro_f1": 0.3333333432674408,
"num_tokens": 923694.0,
"repeat_count": 0.0,
- "routers_loss": 0.01639273390173912,
+ "routers_loss": 0.01634674146771431,
"skip_count": 0.0,
"step": 572,
"text_loss": 0.7177054286003113
@@ -5451,13 +5451,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.17578125,
"learning_rate": 0.0009998723752473574,
- "loss": 0.0707,
+ "loss": 0.0716,
"macro_f1": 0.3272727429866791,
"num_tokens": 926933.0,
"repeat_count": 0.0,
- "routers_loss": 0.04997137933969498,
+ "routers_loss": 0.060559045523405075,
"skip_count": 1.0,
"step": 574,
"text_loss": 0.5203254818916321
@@ -5470,13 +5470,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1845703125,
+ "grad_norm": 0.185546875,
"learning_rate": 0.0009998652866321687,
- "loss": 0.0799,
+ "loss": 0.0801,
"macro_f1": 0.3333333432674408,
"num_tokens": 929832.0,
"repeat_count": 0.0,
- "routers_loss": 0.011360209435224533,
+ "routers_loss": 0.011485611088573933,
"skip_count": 0.0,
"step": 576,
"text_loss": 0.6147452592849731
@@ -5489,13 +5489,13 @@
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.000999858006467659,
- "loss": 0.0658,
+ "loss": 0.0649,
"macro_f1": 0.29333335161209106,
"num_tokens": 933266.0,
"repeat_count": 2.0,
- "routers_loss": 0.31349560618400574,
+ "routers_loss": 0.2929030954837799,
"skip_count": 4.0,
"step": 578,
"text_loss": 0.1720666140317917
@@ -5508,13 +5508,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.24609375,
"learning_rate": 0.0009998505347566186,
- "loss": 0.0801,
+ "loss": 0.0782,
"macro_f1": 0.32098764181137085,
"num_tokens": 937545.0,
"repeat_count": 0.0,
- "routers_loss": 0.058660347014665604,
+ "routers_loss": 0.053780000656843185,
"skip_count": 2.0,
"step": 580,
"text_loss": 0.3258405327796936
@@ -5527,13 +5527,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1416015625,
"learning_rate": 0.00099984287150191,
- "loss": 0.0578,
+ "loss": 0.0582,
"macro_f1": 0.3333333432674408,
"num_tokens": 941001.0,
"repeat_count": 0.0,
- "routers_loss": 0.025836754590272903,
+ "routers_loss": 0.02637636847794056,
"skip_count": 0.0,
"step": 582,
"text_loss": 0.23762771487236023
@@ -5546,13 +5546,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009998350167064705,
- "loss": 0.0683,
+ "loss": 0.0672,
"macro_f1": 0.3333333432674408,
"num_tokens": 943989.0,
"repeat_count": 0.0,
- "routers_loss": 0.016504868865013123,
+ "routers_loss": 0.01637580618262291,
"skip_count": 0.0,
"step": 584,
"text_loss": 0.7460582852363586
@@ -5565,13 +5565,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1787109375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009998269703733096,
- "loss": 0.0685,
+ "loss": 0.0686,
"macro_f1": 0.3272727429866791,
"num_tokens": 947245.0,
"repeat_count": 1.0,
- "routers_loss": 0.1379794180393219,
+ "routers_loss": 0.13934117555618286,
"skip_count": 0.0,
"step": 586,
"text_loss": 0.5284690260887146
@@ -5584,13 +5584,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.13671875,
"learning_rate": 0.0009998187325055106,
- "loss": 0.0657,
+ "loss": 0.0667,
"macro_f1": 0.3333333432674408,
"num_tokens": 950116.0,
"repeat_count": 0.0,
- "routers_loss": 0.01802757754921913,
+ "routers_loss": 0.02138397842645645,
"skip_count": 0.0,
"step": 588,
"text_loss": 0.3920256197452545
@@ -5603,13 +5603,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009998103031062305,
- "loss": 0.0762,
+ "loss": 0.0778,
"macro_f1": 0.3333333432674408,
"num_tokens": 953277.0,
"repeat_count": 0.0,
- "routers_loss": 0.006902900990098715,
+ "routers_loss": 0.007098200265318155,
"skip_count": 0.0,
"step": 590,
"text_loss": 0.7472905516624451
@@ -5622,13 +5622,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.318359375,
"learning_rate": 0.0009998016821786994,
- "loss": 0.0912,
+ "loss": 0.0872,
"macro_f1": 0.32098764181137085,
"num_tokens": 958229.0,
"repeat_count": 1.0,
- "routers_loss": 0.08348741382360458,
+ "routers_loss": 0.07946522533893585,
"skip_count": 1.0,
"step": 592,
"text_loss": 0.5506448745727539
@@ -5641,13 +5641,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.000999792869726221,
- "loss": 0.0527,
+ "loss": 0.0523,
"macro_f1": 0.3272727429866791,
"num_tokens": 961016.0,
"repeat_count": 0.0,
- "routers_loss": 0.08290062099695206,
+ "routers_loss": 0.0850791186094284,
"skip_count": 1.0,
"step": 594,
"text_loss": 0.3824431002140045
@@ -5660,13 +5660,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009997838657521717,
- "loss": 0.0643,
+ "loss": 0.0632,
"macro_f1": 0.3333333432674408,
"num_tokens": 963847.0,
"repeat_count": 0.0,
- "routers_loss": 0.018620988354086876,
+ "routers_loss": 0.016370445489883423,
"skip_count": 0.0,
"step": 596,
"text_loss": 0.2139475792646408
@@ -5679,13 +5679,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009997746702600026,
- "loss": 0.073,
+ "loss": 0.0702,
"macro_f1": 0.307692289352417,
"num_tokens": 966619.0,
"repeat_count": 0.0,
- "routers_loss": 0.1211671382188797,
+ "routers_loss": 0.1310746818780899,
"skip_count": 3.0,
"step": 598,
"text_loss": 0.3651018440723419
@@ -5698,13 +5698,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.23828125,
"learning_rate": 0.0009997652832532372,
- "loss": 0.079,
+ "loss": 0.0792,
"macro_f1": 0.3272727429866791,
"num_tokens": 970418.0,
"repeat_count": 1.0,
- "routers_loss": 0.15485027432441711,
+ "routers_loss": 0.14303378760814667,
"skip_count": 0.0,
"step": 600,
"text_loss": 0.7094736099243164
@@ -5717,13 +5717,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009997557047354722,
- "loss": 0.0562,
+ "loss": 0.0531,
"macro_f1": 0.3272727429866791,
"num_tokens": 973491.0,
"repeat_count": 0.0,
- "routers_loss": 0.036684274673461914,
+ "routers_loss": 0.03334212675690651,
"skip_count": 1.0,
"step": 602,
"text_loss": 0.4812237024307251
@@ -5731,18 +5731,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.835926034634576,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.302734375,
+ "grad_norm": 0.2890625,
"learning_rate": 0.0009997459347103783,
- "loss": 0.0985,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0956,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 976672.0,
"repeat_count": 0.0,
- "routers_loss": 0.026901578530669212,
+ "routers_loss": 0.02831871062517166,
"skip_count": 0.0,
"step": 604,
"text_loss": 0.21737146377563477
@@ -5755,13 +5755,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12158203125,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009997359731816998,
- "loss": 0.0632,
+ "loss": 0.0646,
"macro_f1": 0.3333333432674408,
"num_tokens": 979898.0,
"repeat_count": 0.0,
- "routers_loss": 0.01700405217707157,
+ "routers_loss": 0.017968013882637024,
"skip_count": 0.0,
"step": 606,
"text_loss": 0.5458008050918579
@@ -5774,13 +5774,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2099609375,
+ "grad_norm": 0.224609375,
"learning_rate": 0.0009997258201532536,
- "loss": 0.0758,
+ "loss": 0.0751,
"macro_f1": 0.3333333432674408,
"num_tokens": 982811.0,
"repeat_count": 0.0,
- "routers_loss": 0.015013590455055237,
+ "routers_loss": 0.016256732866168022,
"skip_count": 0.0,
"step": 608,
"text_loss": 0.8643257021903992
@@ -5793,13 +5793,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009997154756289303,
- "loss": 0.0576,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 985245.0,
"repeat_count": 0.0,
- "routers_loss": 0.02037946693599224,
+ "routers_loss": 0.021214161068201065,
"skip_count": 0.0,
"step": 610,
"text_loss": 0.2204967886209488
@@ -5812,13 +5812,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.150390625,
"learning_rate": 0.000999704939612694,
- "loss": 0.0648,
+ "loss": 0.0636,
"macro_f1": 0.3006536364555359,
"num_tokens": 988539.0,
"repeat_count": 3.0,
- "routers_loss": 0.22834022343158722,
+ "routers_loss": 0.23249399662017822,
"skip_count": 2.0,
"step": 612,
"text_loss": 0.32489025592803955
@@ -5831,13 +5831,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009996942121085824,
- "loss": 0.0449,
+ "loss": 0.0445,
"macro_f1": 0.3333333432674408,
"num_tokens": 991660.0,
"repeat_count": 0.0,
- "routers_loss": 0.009838113561272621,
+ "routers_loss": 0.010706410743296146,
"skip_count": 0.0,
"step": 614,
"text_loss": 0.4551754891872406
@@ -5850,13 +5850,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.3671875,
"learning_rate": 0.000999683293120706,
- "loss": 0.1009,
+ "loss": 0.1016,
"macro_f1": 0.3333333432674408,
"num_tokens": 994828.0,
"repeat_count": 0.0,
- "routers_loss": 0.005943270865827799,
+ "routers_loss": 0.006676184479147196,
"skip_count": 0.0,
"step": 616,
"text_loss": 0.6212068200111389
@@ -5869,13 +5869,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.38671875,
+ "grad_norm": 0.408203125,
"learning_rate": 0.0009996721826532491,
- "loss": 0.0941,
+ "loss": 0.0976,
"macro_f1": 0.3076923191547394,
"num_tokens": 997951.0,
"repeat_count": 2.0,
- "routers_loss": 0.21597740054130554,
+ "routers_loss": 0.2148125320672989,
"skip_count": 2.0,
"step": 618,
"text_loss": 0.26514527201652527
@@ -5888,13 +5888,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.1904296875,
"learning_rate": 0.000999660880710469,
- "loss": 0.0896,
+ "loss": 0.0909,
"macro_f1": 0.3333333432674408,
"num_tokens": 1001139.0,
"repeat_count": 0.0,
- "routers_loss": 0.023726588115096092,
+ "routers_loss": 0.022332455962896347,
"skip_count": 0.0,
"step": 620,
"text_loss": 0.26131340861320496
@@ -5907,13 +5907,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009996493872966971,
"loss": 0.0732,
"macro_f1": 0.3272727429866791,
"num_tokens": 1003678.0,
"repeat_count": 1.0,
- "routers_loss": 0.08467255532741547,
+ "routers_loss": 0.08348730951547623,
"skip_count": 0.0,
"step": 622,
"text_loss": 0.19151706993579865
@@ -5926,13 +5926,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009996377024163374,
- "loss": 0.0816,
+ "loss": 0.0822,
"macro_f1": 0.3333333432674408,
"num_tokens": 1007082.0,
"repeat_count": 0.0,
- "routers_loss": 0.029468854889273643,
+ "routers_loss": 0.028577150776982307,
"skip_count": 0.0,
"step": 624,
"text_loss": 0.305387407541275
@@ -5945,13 +5945,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.11279296875,
"learning_rate": 0.0009996258260738676,
- "loss": 0.0891,
+ "loss": 0.0892,
"macro_f1": 0.3272727429866791,
"num_tokens": 1010064.0,
"repeat_count": 1.0,
- "routers_loss": 0.09438466280698776,
+ "routers_loss": 0.08312026411294937,
"skip_count": 0.0,
"step": 626,
"text_loss": 0.49436143040657043
@@ -5964,13 +5964,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009996137582738388,
- "loss": 0.0581,
+ "loss": 0.0591,
"macro_f1": 0.3333333432674408,
"num_tokens": 1013462.0,
"repeat_count": 0.0,
- "routers_loss": 0.013679586350917816,
+ "routers_loss": 0.013337327167391777,
"skip_count": 0.0,
"step": 628,
"text_loss": 0.6515294313430786
@@ -5983,13 +5983,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.140625,
"learning_rate": 0.000999601499020875,
- "loss": 0.0528,
+ "loss": 0.0537,
"macro_f1": 0.3333333432674408,
"num_tokens": 1016246.0,
"repeat_count": 0.0,
- "routers_loss": 0.029532987624406815,
+ "routers_loss": 0.029126765206456184,
"skip_count": 0.0,
"step": 630,
"text_loss": 0.18834827840328217
@@ -6002,13 +6002,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009995890483196746,
- "loss": 0.0601,
+ "loss": 0.0602,
"macro_f1": 0.3272727429866791,
"num_tokens": 1019286.0,
"repeat_count": 0.0,
- "routers_loss": 0.05516733601689339,
+ "routers_loss": 0.054844800382852554,
"skip_count": 1.0,
"step": 632,
"text_loss": 0.6988179087638855
@@ -6021,13 +6021,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.357421875,
+ "grad_norm": 0.322265625,
"learning_rate": 0.0009995764061750086,
- "loss": 0.0785,
+ "loss": 0.0767,
"macro_f1": 0.3333333432674408,
"num_tokens": 1022207.0,
"repeat_count": 0.0,
- "routers_loss": 0.010254866443574429,
+ "routers_loss": 0.010095693171024323,
"skip_count": 0.0,
"step": 634,
"text_loss": 0.558451771736145
@@ -6040,13 +6040,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.2890625,
"learning_rate": 0.000999563572591721,
- "loss": 0.0518,
+ "loss": 0.0521,
"macro_f1": 0.32098764181137085,
"num_tokens": 1025319.0,
"repeat_count": 1.0,
- "routers_loss": 0.07528360933065414,
+ "routers_loss": 0.0698433518409729,
"skip_count": 1.0,
"step": 636,
"text_loss": 0.5961872935295105
@@ -6059,13 +6059,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1064453125,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009995505475747302,
- "loss": 0.0844,
+ "loss": 0.0849,
"macro_f1": 0.3272727429866791,
"num_tokens": 1028362.0,
"repeat_count": 0.0,
- "routers_loss": 0.04301584139466286,
+ "routers_loss": 0.040211405605077744,
"skip_count": 1.0,
"step": 638,
"text_loss": 0.546863317489624
@@ -6078,13 +6078,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11572265625,
+ "grad_norm": 0.119140625,
"learning_rate": 0.0009995373311290272,
- "loss": 0.0699,
+ "loss": 0.0709,
"macro_f1": 0.3144654333591461,
"num_tokens": 1032199.0,
"repeat_count": 2.0,
- "routers_loss": 0.14521080255508423,
+ "routers_loss": 0.1457643061876297,
"skip_count": 1.0,
"step": 640,
"text_loss": 0.2137298285961151
@@ -6097,13 +6097,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1328125,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.0009995239232596764,
- "loss": 0.0543,
+ "loss": 0.0545,
"macro_f1": 0.3333333432674408,
"num_tokens": 1035801.0,
"repeat_count": 0.0,
- "routers_loss": 0.01074797473847866,
+ "routers_loss": 0.011394930072128773,
"skip_count": 0.0,
"step": 642,
"text_loss": 0.43054503202438354
@@ -6116,13 +6116,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009995103239718163,
- "loss": 0.0659,
+ "loss": 0.0665,
"macro_f1": 0.3333333432674408,
"num_tokens": 1039223.0,
"repeat_count": 0.0,
- "routers_loss": 0.009271817281842232,
+ "routers_loss": 0.00997432041913271,
"skip_count": 0.0,
"step": 644,
"text_loss": 0.7749615907669067
@@ -6135,13 +6135,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009994965332706573,
- "loss": 0.0737,
+ "loss": 0.0755,
"macro_f1": 0.3144654333591461,
"num_tokens": 1042154.0,
"repeat_count": 3.0,
- "routers_loss": 0.10257050395011902,
+ "routers_loss": 0.10589150339365005,
"skip_count": 0.0,
"step": 646,
"text_loss": 0.7812211513519287
@@ -6154,13 +6154,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.0009994825511614846,
- "loss": 0.0363,
+ "loss": 0.0383,
"macro_f1": 0.3272727429866791,
"num_tokens": 1045250.0,
"repeat_count": 0.0,
- "routers_loss": 0.07091924548149109,
+ "routers_loss": 0.0748734176158905,
"skip_count": 1.0,
"step": 648,
"text_loss": 0.844803512096405
@@ -6173,13 +6173,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11572265625,
+ "grad_norm": 0.1220703125,
"learning_rate": 0.0009994683776496562,
- "loss": 0.0421,
+ "loss": 0.0433,
"macro_f1": 0.3272727429866791,
"num_tokens": 1048446.0,
"repeat_count": 0.0,
- "routers_loss": 0.034446243196725845,
+ "routers_loss": 0.03742415830492973,
"skip_count": 1.0,
"step": 650,
"text_loss": 0.2098839282989502
@@ -6192,13 +6192,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009994540127406034,
- "loss": 0.0593,
+ "loss": 0.0591,
"macro_f1": 0.32098764181137085,
"num_tokens": 1051840.0,
"repeat_count": 0.0,
- "routers_loss": 0.06077485531568527,
+ "routers_loss": 0.06025516986846924,
"skip_count": 2.0,
"step": 652,
"text_loss": 0.27727583050727844
@@ -6211,13 +6211,13 @@
"f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.181640625,
"learning_rate": 0.0009994394564398306,
- "loss": 0.0537,
+ "loss": 0.0519,
"macro_f1": 0.521541953086853,
"num_tokens": 1055142.0,
"repeat_count": 4.0,
- "routers_loss": 0.2382282167673111,
+ "routers_loss": 0.22807340323925018,
"skip_count": 2.0,
"step": 654,
"text_loss": 0.9672397971153259
@@ -6230,13 +6230,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.142578125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009994247087529158,
- "loss": 0.0613,
+ "loss": 0.0618,
"macro_f1": 0.3333333432674408,
"num_tokens": 1057698.0,
"repeat_count": 0.0,
- "routers_loss": 0.011971636675298214,
+ "routers_loss": 0.01348950993269682,
"skip_count": 0.0,
"step": 656,
"text_loss": 0.6375506520271301
@@ -6249,13 +6249,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.212890625,
+ "grad_norm": 0.1953125,
"learning_rate": 0.0009994097696855106,
- "loss": 0.0414,
+ "loss": 0.0412,
"macro_f1": 0.3333333432674408,
"num_tokens": 1060624.0,
"repeat_count": 0.0,
- "routers_loss": 0.010221127420663834,
+ "routers_loss": 0.009649243205785751,
"skip_count": 0.0,
"step": 658,
"text_loss": 0.5315385460853577
@@ -6268,13 +6268,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2265625,
+ "grad_norm": 0.2041015625,
"learning_rate": 0.0009993946392433395,
- "loss": 0.061,
+ "loss": 0.0609,
"macro_f1": 0.307692289352417,
"num_tokens": 1065076.0,
"repeat_count": 0.0,
- "routers_loss": 0.11860335618257523,
+ "routers_loss": 0.1250980943441391,
"skip_count": 3.0,
"step": 660,
"text_loss": 0.25780341029167175
@@ -6287,13 +6287,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009993793174322006,
- "loss": 0.0485,
+ "loss": 0.0471,
"macro_f1": 0.3333333432674408,
"num_tokens": 1068365.0,
"repeat_count": 0.0,
- "routers_loss": 0.011139829643070698,
+ "routers_loss": 0.011544390581548214,
"skip_count": 0.0,
"step": 662,
"text_loss": 0.34876301884651184
@@ -6306,13 +6306,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.166015625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009993638042579654,
- "loss": 0.0478,
+ "loss": 0.0473,
"macro_f1": 0.3272727429866791,
"num_tokens": 1071693.0,
"repeat_count": 0.0,
- "routers_loss": 0.03978770971298218,
+ "routers_loss": 0.03777370601892471,
"skip_count": 1.0,
"step": 664,
"text_loss": 0.21811571717262268
@@ -6327,11 +6327,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.203125,
"learning_rate": 0.0009993480997265783,
- "loss": 0.0481,
+ "loss": 0.0475,
"macro_f1": 0.5492662787437439,
"num_tokens": 1074733.0,
"repeat_count": 0.0,
- "routers_loss": 0.051231011748313904,
+ "routers_loss": 0.049949806183576584,
"skip_count": 2.0,
"step": 666,
"text_loss": 0.38410288095474243
@@ -6344,13 +6344,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.10302734375,
"learning_rate": 0.0009993322038440572,
- "loss": 0.0615,
+ "loss": 0.0605,
"macro_f1": 0.3333333432674408,
"num_tokens": 1077993.0,
"repeat_count": 0.0,
- "routers_loss": 0.024917088449001312,
+ "routers_loss": 0.0247171800583601,
"skip_count": 0.0,
"step": 668,
"text_loss": 0.25576895475387573
@@ -6363,13 +6363,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.216796875,
"learning_rate": 0.000999316116616494,
- "loss": 0.0627,
+ "loss": 0.0619,
"macro_f1": 0.3333333432674408,
"num_tokens": 1080491.0,
"repeat_count": 0.0,
- "routers_loss": 0.008834881708025932,
+ "routers_loss": 0.008118715137243271,
"skip_count": 0.0,
"step": 670,
"text_loss": 0.6269792914390564
@@ -6382,13 +6382,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009992998380500527,
"loss": 0.0462,
"macro_f1": 0.3272727429866791,
"num_tokens": 1083817.0,
"repeat_count": 0.0,
- "routers_loss": 0.033405229449272156,
+ "routers_loss": 0.03366057574748993,
"skip_count": 1.0,
"step": 672,
"text_loss": 0.26891493797302246
@@ -6401,13 +6401,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.13671875,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009992833681509716,
- "loss": 0.0523,
+ "loss": 0.0529,
"macro_f1": 0.3333333432674408,
"num_tokens": 1087368.0,
"repeat_count": 0.0,
- "routers_loss": 0.020753704011440277,
+ "routers_loss": 0.020552074536681175,
"skip_count": 0.0,
"step": 674,
"text_loss": 0.14421936869621277
@@ -6420,13 +6420,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.18359375,
"learning_rate": 0.0009992667069255619,
- "loss": 0.0698,
+ "loss": 0.0696,
"macro_f1": 0.31446540355682373,
"num_tokens": 1090452.0,
"repeat_count": 0.0,
- "routers_loss": 0.06932353973388672,
+ "routers_loss": 0.06937336176633835,
"skip_count": 2.0,
"step": 676,
"text_loss": 0.24999259412288666
@@ -6439,13 +6439,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.0009992498543802085,
- "loss": 0.059,
+ "loss": 0.0588,
"macro_f1": 0.3272727429866791,
"num_tokens": 1093996.0,
"repeat_count": 1.0,
- "routers_loss": 0.032903749495744705,
+ "routers_loss": 0.0380021296441555,
"skip_count": 0.0,
"step": 678,
"text_loss": 0.42473849654197693
@@ -6458,32 +6458,32 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.2099609375,
+ "grad_norm": 0.2119140625,
"learning_rate": 0.0009992328105213688,
- "loss": 0.0417,
+ "loss": 0.0411,
"macro_f1": 0.4400000274181366,
"num_tokens": 1096837.0,
"repeat_count": 1.0,
- "routers_loss": 0.19733747839927673,
+ "routers_loss": 0.20885063707828522,
"skip_count": 4.0,
"step": 680,
"text_loss": 0.3829527199268341
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.2019371881420606,
- "f1_execute": 1.0,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.154296875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009992155753555747,
- "loss": 0.0729,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0722,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1100320.0,
"repeat_count": 0.0,
- "routers_loss": 0.013452666811645031,
+ "routers_loss": 0.018230699002742767,
"skip_count": 2.0,
"step": 682,
"text_loss": 0.6190969944000244
@@ -6496,13 +6496,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.30859375,
"learning_rate": 0.0009991981488894303,
"loss": 0.0681,
"macro_f1": 0.32098767161369324,
"num_tokens": 1103682.0,
"repeat_count": 0.0,
- "routers_loss": 0.05302857980132103,
+ "routers_loss": 0.05550144240260124,
"skip_count": 1.0,
"step": 684,
"text_loss": 0.44418027997016907
@@ -6515,13 +6515,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.0009991805311296133,
- "loss": 0.0527,
+ "loss": 0.0507,
"macro_f1": 0.32098764181137085,
"num_tokens": 1106427.0,
"repeat_count": 0.0,
- "routers_loss": 0.08124994486570358,
+ "routers_loss": 0.07990608364343643,
"skip_count": 2.0,
"step": 686,
"text_loss": 0.5577231645584106
@@ -6534,13 +6534,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009991627220828753,
- "loss": 0.0579,
+ "loss": 0.0568,
"macro_f1": 0.32098764181137085,
"num_tokens": 1109314.0,
"repeat_count": 0.0,
- "routers_loss": 0.058633625507354736,
+ "routers_loss": 0.05167485028505325,
"skip_count": 2.0,
"step": 688,
"text_loss": 0.27325430512428284
@@ -6553,13 +6553,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009991447217560408,
- "loss": 0.0533,
+ "loss": 0.0521,
"macro_f1": 0.5492662787437439,
"num_tokens": 1112748.0,
"repeat_count": 0.0,
- "routers_loss": 0.04703643172979355,
+ "routers_loss": 0.04621964320540428,
"skip_count": 2.0,
"step": 690,
"text_loss": 0.5288321375846863
@@ -6572,13 +6572,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.000999126530156007,
- "loss": 0.0485,
+ "loss": 0.0499,
"macro_f1": 0.307692289352417,
"num_tokens": 1116965.0,
"repeat_count": 1.0,
- "routers_loss": 0.11615128815174103,
+ "routers_loss": 0.11950276792049408,
"skip_count": 2.0,
"step": 692,
"text_loss": 0.14215624332427979
@@ -6591,13 +6591,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2314453125,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.0009991081472897454,
- "loss": 0.0718,
+ "loss": 0.0722,
"macro_f1": 0.3333333432674408,
"num_tokens": 1120570.0,
"repeat_count": 0.0,
- "routers_loss": 0.017403846606612206,
+ "routers_loss": 0.01905500330030918,
"skip_count": 0.0,
"step": 694,
"text_loss": 0.41862696409225464
@@ -6610,13 +6610,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.0009990895731643002,
- "loss": 0.0444,
+ "loss": 0.0464,
"macro_f1": 0.3272727429866791,
"num_tokens": 1124009.0,
"repeat_count": 1.0,
- "routers_loss": 0.07067303359508514,
+ "routers_loss": 0.06974572688341141,
"skip_count": 0.0,
"step": 696,
"text_loss": 0.41160130500793457
@@ -6629,13 +6629,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.000999070807786789,
- "loss": 0.0527,
+ "loss": 0.0531,
"macro_f1": 0.3272727429866791,
"num_tokens": 1127370.0,
"repeat_count": 1.0,
- "routers_loss": 0.07131028175354004,
+ "routers_loss": 0.07055293023586273,
"skip_count": 0.0,
"step": 698,
"text_loss": 0.48068273067474365
@@ -6648,13 +6648,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.18359375,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000999051851164403,
- "loss": 0.0629,
+ "loss": 0.0619,
"macro_f1": 0.32098764181137085,
"num_tokens": 1130234.0,
"repeat_count": 1.0,
- "routers_loss": 0.1152748316526413,
+ "routers_loss": 0.12506946921348572,
"skip_count": 1.0,
"step": 700,
"text_loss": 0.47925490140914917
@@ -6667,13 +6667,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000999032703304406,
- "loss": 0.0663,
+ "loss": 0.0674,
"macro_f1": 0.3333333432674408,
"num_tokens": 1132874.0,
"repeat_count": 0.0,
- "routers_loss": 0.0077212234027683735,
+ "routers_loss": 0.00809287466108799,
"skip_count": 0.0,
"step": 702,
"text_loss": 0.47433632612228394
@@ -6686,13 +6686,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.1064453125,
"learning_rate": 0.0009990133642141358,
- "loss": 0.0494,
+ "loss": 0.0497,
"macro_f1": 0.5492662787437439,
"num_tokens": 1136011.0,
"repeat_count": 0.0,
- "routers_loss": 0.02726336568593979,
+ "routers_loss": 0.0319170281291008,
"skip_count": 2.0,
"step": 704,
"text_loss": 0.6574832201004028
@@ -6705,13 +6705,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.32421875,
+ "grad_norm": 0.33984375,
"learning_rate": 0.000998993833901003,
- "loss": 0.0615,
+ "loss": 0.0619,
"macro_f1": 0.32098764181137085,
"num_tokens": 1139674.0,
"repeat_count": 0.0,
- "routers_loss": 0.0958542674779892,
+ "routers_loss": 0.09850362688302994,
"skip_count": 2.0,
"step": 706,
"text_loss": 0.7660127282142639
@@ -6724,13 +6724,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009989741123724919,
- "loss": 0.0583,
+ "loss": 0.0574,
"macro_f1": 0.3333333432674408,
"num_tokens": 1143558.0,
"repeat_count": 0.0,
- "routers_loss": 0.007100600749254227,
+ "routers_loss": 0.006673311349004507,
"skip_count": 0.0,
"step": 708,
"text_loss": 0.5976111888885498
@@ -6743,13 +6743,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009989541996361594,
- "loss": 0.0445,
+ "loss": 0.045,
"macro_f1": 0.3333333432674408,
"num_tokens": 1146122.0,
"repeat_count": 0.0,
- "routers_loss": 0.0047812811098992825,
+ "routers_loss": 0.004988791421055794,
"skip_count": 0.0,
"step": 710,
"text_loss": 0.5256119966506958
@@ -6762,13 +6762,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009989340956996367,
- "loss": 0.052,
+ "loss": 0.0528,
"macro_f1": 0.3333333432674408,
"num_tokens": 1149546.0,
"repeat_count": 0.0,
- "routers_loss": 0.006643407512456179,
+ "routers_loss": 0.0067769973538815975,
"skip_count": 0.0,
"step": 712,
"text_loss": 0.5040497779846191
@@ -6781,13 +6781,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2890625,
+ "grad_norm": 0.26953125,
"learning_rate": 0.0009989138005706273,
- "loss": 0.0719,
+ "loss": 0.0735,
"macro_f1": 0.32098764181137085,
"num_tokens": 1153195.0,
"repeat_count": 0.0,
- "routers_loss": 0.0910436138510704,
+ "routers_loss": 0.09899546951055527,
"skip_count": 2.0,
"step": 714,
"text_loss": 0.20803412795066833
@@ -6800,13 +6800,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1484375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.000998893314256908,
- "loss": 0.0649,
+ "loss": 0.064,
"macro_f1": 0.3333333432674408,
"num_tokens": 1157081.0,
"repeat_count": 0.0,
- "routers_loss": 0.010978946462273598,
+ "routers_loss": 0.010492355562746525,
"skip_count": 0.0,
"step": 716,
"text_loss": 0.23077639937400818
@@ -6819,13 +6819,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009988726367663298,
- "loss": 0.0543,
+ "loss": 0.0539,
"macro_f1": 0.3333333432674408,
"num_tokens": 1160079.0,
"repeat_count": 0.0,
- "routers_loss": 0.009956461377441883,
+ "routers_loss": 0.01063773687928915,
"skip_count": 0.0,
"step": 718,
"text_loss": 0.6085864901542664
@@ -6838,13 +6838,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009988517681068163,
- "loss": 0.0412,
+ "loss": 0.0421,
"macro_f1": 0.3272727429866791,
"num_tokens": 1163249.0,
"repeat_count": 1.0,
- "routers_loss": 0.057210199534893036,
+ "routers_loss": 0.05981874838471413,
"skip_count": 0.0,
"step": 720,
"text_loss": 0.4047050476074219
@@ -6857,32 +6857,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009988307082863638,
- "loss": 0.0364,
+ "loss": 0.0361,
"macro_f1": 0.3333333432674408,
"num_tokens": 1166259.0,
"repeat_count": 0.0,
- "routers_loss": 0.01035996899008751,
+ "routers_loss": 0.009750043973326683,
"skip_count": 0.0,
"step": 722,
"text_loss": 0.5306474566459656
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.3991781626063986,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.240234375,
"learning_rate": 0.0009988094573130434,
- "loss": 0.0661,
- "macro_f1": 0.3076923191547394,
+ "loss": 0.063,
+ "macro_f1": 0.5359477400779724,
"num_tokens": 1168887.0,
"repeat_count": 2.0,
- "routers_loss": 0.18087820708751678,
+ "routers_loss": 0.18601104617118835,
"skip_count": 2.0,
"step": 724,
"text_loss": 0.53528892993927
@@ -6895,32 +6895,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009987880151949974,
- "loss": 0.0505,
+ "loss": 0.0496,
"macro_f1": 0.3272727429866791,
"num_tokens": 1172625.0,
"repeat_count": 0.0,
- "routers_loss": 0.04720238968729973,
+ "routers_loss": 0.02845010720193386,
"skip_count": 1.0,
"step": 726,
"text_loss": 0.4760453701019287
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.417963017317288,
- "f1_execute": 1.0,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.2216796875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.0009987663819404434,
- "loss": 0.0603,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.06,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1176580.0,
"repeat_count": 0.0,
- "routers_loss": 0.015407778322696686,
+ "routers_loss": 0.017596980556845665,
"skip_count": 2.0,
"step": 728,
"text_loss": 0.5146099328994751
@@ -6933,13 +6933,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.000998744557557671,
- "loss": 0.0489,
+ "loss": 0.0484,
"macro_f1": 0.3272727429866791,
"num_tokens": 1179804.0,
"repeat_count": 0.0,
- "routers_loss": 0.060891781002283096,
+ "routers_loss": 0.0625474750995636,
"skip_count": 1.0,
"step": 730,
"text_loss": 0.27738022804260254
@@ -6947,18 +6947,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.436747872028177,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.203125,
"learning_rate": 0.0009987225420550433,
- "loss": 0.0825,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0796,
+ "macro_f1": 0.307692289352417,
"num_tokens": 1182658.0,
"repeat_count": 1.0,
- "routers_loss": 0.1661442220211029,
+ "routers_loss": 0.16188351809978485,
"skip_count": 2.0,
"step": 732,
"text_loss": 0.23231445252895355
@@ -6966,18 +6966,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.446140299383622,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.2001953125,
"learning_rate": 0.0009987003354409965,
- "loss": 0.0634,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0626,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1185451.0,
"repeat_count": 0.0,
- "routers_loss": 0.02108248695731163,
+ "routers_loss": 0.02391529455780983,
"skip_count": 0.0,
"step": 734,
"text_loss": 0.4496627151966095
@@ -6990,13 +6990,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.248046875,
+ "grad_norm": 0.234375,
"learning_rate": 0.0009986779377240405,
- "loss": 0.0534,
+ "loss": 0.0513,
"macro_f1": 0.32098767161369324,
"num_tokens": 1188666.0,
"repeat_count": 0.0,
- "routers_loss": 0.08318125456571579,
+ "routers_loss": 0.08435963839292526,
"skip_count": 1.0,
"step": 736,
"text_loss": 0.4950787127017975
@@ -7009,13 +7009,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11962890625,
+ "grad_norm": 0.1220703125,
"learning_rate": 0.000998655348912758,
- "loss": 0.0514,
+ "loss": 0.0515,
"macro_f1": 0.3333333432674408,
"num_tokens": 1193035.0,
"repeat_count": 0.0,
- "routers_loss": 0.015889234840869904,
+ "routers_loss": 0.01648722216486931,
"skip_count": 0.0,
"step": 738,
"text_loss": 0.24761848151683807
@@ -7028,13 +7028,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1513671875,
"learning_rate": 0.0009986325690158051,
"loss": 0.0435,
"macro_f1": 0.3333333432674408,
"num_tokens": 1196840.0,
"repeat_count": 0.0,
- "routers_loss": 0.01378484908491373,
+ "routers_loss": 0.013143910095095634,
"skip_count": 0.0,
"step": 740,
"text_loss": 0.15662719309329987
@@ -7047,13 +7047,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1787109375,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009986095980419113,
- "loss": 0.076,
+ "loss": 0.0757,
"macro_f1": 0.3333333432674408,
"num_tokens": 1200573.0,
"repeat_count": 0.0,
- "routers_loss": 0.02673683874309063,
+ "routers_loss": 0.026706280186772346,
"skip_count": 0.0,
"step": 742,
"text_loss": 0.16725164651870728
@@ -7066,13 +7066,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.185546875,
+ "grad_norm": 0.1982421875,
"learning_rate": 0.0009985864359998787,
- "loss": 0.0778,
+ "loss": 0.0795,
"macro_f1": 0.3006536364555359,
"num_tokens": 1203589.0,
"repeat_count": 2.0,
- "routers_loss": 0.27776041626930237,
+ "routers_loss": 0.28607678413391113,
"skip_count": 3.0,
"step": 744,
"text_loss": 0.6350882053375244
@@ -7085,13 +7085,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009985630828985835,
- "loss": 0.0575,
+ "loss": 0.0572,
"macro_f1": 0.3272727429866791,
"num_tokens": 1206422.0,
"repeat_count": 0.0,
- "routers_loss": 0.0575483962893486,
+ "routers_loss": 0.05685260891914368,
"skip_count": 1.0,
"step": 746,
"text_loss": 0.33779552578926086
@@ -7104,13 +7104,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009985395387469742,
- "loss": 0.0478,
+ "loss": 0.0458,
"macro_f1": 0.5492662787437439,
"num_tokens": 1211588.0,
"repeat_count": 0.0,
- "routers_loss": 0.0458797849714756,
+ "routers_loss": 0.0437830351293087,
"skip_count": 2.0,
"step": 748,
"text_loss": 0.28664472699165344
@@ -7123,13 +7123,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.15625,
"learning_rate": 0.0009985158035540735,
- "loss": 0.0701,
+ "loss": 0.0714,
"macro_f1": 0.32098764181137085,
"num_tokens": 1214580.0,
"repeat_count": 2.0,
- "routers_loss": 0.07850238680839539,
+ "routers_loss": 0.07074898481369019,
"skip_count": 0.0,
"step": 750,
"text_loss": 0.3939313292503357
@@ -7142,13 +7142,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.21484375,
"learning_rate": 0.0009984918773289762,
- "loss": 0.0702,
+ "loss": 0.0699,
"macro_f1": 0.3333333432674408,
"num_tokens": 1217388.0,
"repeat_count": 0.0,
- "routers_loss": 0.009507967159152031,
+ "routers_loss": 0.009757856838405132,
"skip_count": 0.0,
"step": 752,
"text_loss": 0.37641215324401855
@@ -7161,13 +7161,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1484375,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009984677600808512,
- "loss": 0.0543,
+ "loss": 0.054,
"macro_f1": 0.3333333432674408,
"num_tokens": 1219960.0,
"repeat_count": 0.0,
- "routers_loss": 0.02620997279882431,
+ "routers_loss": 0.02515069581568241,
"skip_count": 0.0,
"step": 754,
"text_loss": 0.155938982963562
@@ -7180,13 +7180,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3359375,
+ "grad_norm": 0.30078125,
"learning_rate": 0.0009984434518189405,
- "loss": 0.0791,
+ "loss": 0.0764,
"macro_f1": 0.3333333432674408,
"num_tokens": 1223234.0,
"repeat_count": 0.0,
- "routers_loss": 0.02798631228506565,
+ "routers_loss": 0.025766927748918533,
"skip_count": 0.0,
"step": 756,
"text_loss": 0.691118061542511
@@ -7201,11 +7201,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1416015625,
"learning_rate": 0.0009984189525525584,
- "loss": 0.046,
+ "loss": 0.0451,
"macro_f1": 0.5359477400779724,
"num_tokens": 1225764.0,
"repeat_count": 2.0,
- "routers_loss": 0.16614431142807007,
+ "routers_loss": 0.1782722771167755,
"skip_count": 2.0,
"step": 758,
"text_loss": 0.3592209219932556
@@ -7218,13 +7218,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.193359375,
+ "grad_norm": 0.189453125,
"learning_rate": 0.0009983942622910935,
- "loss": 0.0669,
+ "loss": 0.0659,
"macro_f1": 0.3333333432674408,
"num_tokens": 1230097.0,
"repeat_count": 0.0,
- "routers_loss": 0.008541896007955074,
+ "routers_loss": 0.00825568474829197,
"skip_count": 0.0,
"step": 760,
"text_loss": 0.4646475315093994
@@ -7237,13 +7237,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009983693810440074,
- "loss": 0.0478,
+ "loss": 0.0477,
"macro_f1": 0.32098764181137085,
"num_tokens": 1233140.0,
"repeat_count": 0.0,
- "routers_loss": 0.045411624014377594,
+ "routers_loss": 0.04156976938247681,
"skip_count": 2.0,
"step": 762,
"text_loss": 0.298682302236557
@@ -7256,13 +7256,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.380859375,
+ "grad_norm": 0.3515625,
"learning_rate": 0.000998344308820834,
- "loss": 0.0689,
+ "loss": 0.0666,
"macro_f1": 0.3272727429866791,
"num_tokens": 1236305.0,
"repeat_count": 0.0,
- "routers_loss": 0.052299100905656815,
+ "routers_loss": 0.05697929114103317,
"skip_count": 1.0,
"step": 764,
"text_loss": 0.5249121189117432
@@ -7275,13 +7275,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.18359375,
"learning_rate": 0.0009983190456311817,
- "loss": 0.0602,
+ "loss": 0.0592,
"macro_f1": 0.3144654333591461,
"num_tokens": 1239673.0,
"repeat_count": 0.0,
- "routers_loss": 0.09140212833881378,
+ "routers_loss": 0.09547408670186996,
"skip_count": 3.0,
"step": 766,
"text_loss": 0.41277334094047546
@@ -7294,13 +7294,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.201171875,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000998293591484731,
- "loss": 0.0475,
+ "loss": 0.0484,
"macro_f1": 0.5492662787437439,
"num_tokens": 1242292.0,
"repeat_count": 0.0,
- "routers_loss": 0.030750583857297897,
+ "routers_loss": 0.030693158507347107,
"skip_count": 2.0,
"step": 768,
"text_loss": 0.1583656519651413
@@ -7313,13 +7313,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000998267946391236,
- "loss": 0.052,
+ "loss": 0.051,
"macro_f1": 0.3333333432674408,
"num_tokens": 1244661.0,
"repeat_count": 0.0,
- "routers_loss": 0.010202950797975063,
+ "routers_loss": 0.01211300864815712,
"skip_count": 0.0,
"step": 770,
"text_loss": 0.4629349112510681
@@ -7332,13 +7332,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0009982421103605238,
- "loss": 0.0434,
+ "loss": 0.0441,
"macro_f1": 0.32098764181137085,
"num_tokens": 1248688.0,
"repeat_count": 0.0,
- "routers_loss": 0.07364192605018616,
+ "routers_loss": 0.0665968507528305,
"skip_count": 2.0,
"step": 772,
"text_loss": 0.4019293785095215
@@ -7353,11 +7353,11 @@
"f1_skip": 0.0,
"grad_norm": 0.2890625,
"learning_rate": 0.000998216083402495,
- "loss": 0.0606,
+ "loss": 0.0613,
"macro_f1": 0.32098764181137085,
"num_tokens": 1251395.0,
"repeat_count": 0.0,
- "routers_loss": 0.06553081423044205,
+ "routers_loss": 0.07186859846115112,
"skip_count": 2.0,
"step": 774,
"text_loss": 0.4659276604652405
@@ -7370,13 +7370,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.302734375,
"learning_rate": 0.0009981898655271235,
- "loss": 0.0475,
+ "loss": 0.0488,
"macro_f1": 0.3333333432674408,
"num_tokens": 1254888.0,
"repeat_count": 0.0,
- "routers_loss": 0.008751659654080868,
+ "routers_loss": 0.007823926396667957,
"skip_count": 0.0,
"step": 776,
"text_loss": 0.5160359740257263
@@ -7389,13 +7389,13 @@
"f1_execute": 0.9130434989929199,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.11962890625,
"learning_rate": 0.0009981634567444557,
- "loss": 0.0777,
+ "loss": 0.0775,
"macro_f1": 0.590062141418457,
"num_tokens": 1258250.0,
"repeat_count": 3.0,
- "routers_loss": 0.24522721767425537,
+ "routers_loss": 0.24624499678611755,
"skip_count": 4.0,
"step": 778,
"text_loss": 0.29319918155670166
@@ -7408,13 +7408,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.0009981368570646115,
"loss": 0.0885,
"macro_f1": 0.3272727429866791,
"num_tokens": 1260916.0,
"repeat_count": 0.0,
- "routers_loss": 0.03767623379826546,
+ "routers_loss": 0.030730176717042923,
"skip_count": 1.0,
"step": 780,
"text_loss": 0.624981164932251
@@ -7427,13 +7427,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009981100664977838,
- "loss": 0.0708,
+ "loss": 0.0699,
"macro_f1": 0.3333333432674408,
"num_tokens": 1264004.0,
"repeat_count": 0.0,
- "routers_loss": 0.006098059006035328,
+ "routers_loss": 0.006829176563769579,
"skip_count": 0.0,
"step": 782,
"text_loss": 0.6137266159057617
@@ -7446,13 +7446,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009980830850542391,
- "loss": 0.0589,
+ "loss": 0.058,
"macro_f1": 0.3333333432674408,
"num_tokens": 1267130.0,
"repeat_count": 0.0,
- "routers_loss": 0.01731623336672783,
+ "routers_loss": 0.018471000716090202,
"skip_count": 0.0,
"step": 784,
"text_loss": 0.15213175117969513
@@ -7465,13 +7465,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.0009980559127443166,
- "loss": 0.0526,
+ "loss": 0.052,
"macro_f1": 0.3333333432674408,
"num_tokens": 1271129.0,
"repeat_count": 0.0,
- "routers_loss": 0.0076471962966024876,
+ "routers_loss": 0.007903140969574451,
"skip_count": 0.0,
"step": 786,
"text_loss": 0.5768613219261169
@@ -7484,13 +7484,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.130859375,
"learning_rate": 0.000998028549578429,
- "loss": 0.0745,
+ "loss": 0.0719,
"macro_f1": 0.307692289352417,
"num_tokens": 1274232.0,
"repeat_count": 0.0,
- "routers_loss": 0.0637628585100174,
+ "routers_loss": 0.06737866252660751,
"skip_count": 3.0,
"step": 788,
"text_loss": 0.2877073585987091
@@ -7503,13 +7503,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009980009955670615,
- "loss": 0.0699,
+ "loss": 0.0698,
"macro_f1": 0.3144654333591461,
"num_tokens": 1277193.0,
"repeat_count": 0.0,
- "routers_loss": 0.10882514715194702,
+ "routers_loss": 0.10194934904575348,
"skip_count": 3.0,
"step": 790,
"text_loss": 0.11860492825508118
@@ -7522,13 +7522,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.126953125,
"learning_rate": 0.000997973250720773,
- "loss": 0.056,
+ "loss": 0.0552,
"macro_f1": 0.32098764181137085,
"num_tokens": 1280960.0,
"repeat_count": 0.0,
- "routers_loss": 0.10924118757247925,
+ "routers_loss": 0.10297708213329315,
"skip_count": 2.0,
"step": 792,
"text_loss": 0.13477706909179688
@@ -7541,13 +7541,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009979453150501954,
- "loss": 0.0664,
+ "loss": 0.0663,
"macro_f1": 0.32098764181137085,
"num_tokens": 1284611.0,
"repeat_count": 1.0,
- "routers_loss": 0.06571807712316513,
+ "routers_loss": 0.06122037023305893,
"skip_count": 1.0,
"step": 794,
"text_loss": 0.40569379925727844
@@ -7560,13 +7560,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1181640625,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.000997917188566034,
- "loss": 0.0616,
+ "loss": 0.062,
"macro_f1": 0.32098764181137085,
"num_tokens": 1287834.0,
"repeat_count": 0.0,
- "routers_loss": 0.058966971933841705,
+ "routers_loss": 0.061135001480579376,
"skip_count": 2.0,
"step": 796,
"text_loss": 0.2829287648200989
@@ -7579,32 +7579,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.109375,
"learning_rate": 0.0009978888712790664,
- "loss": 0.067,
+ "loss": 0.0654,
"macro_f1": 0.3272727429866791,
"num_tokens": 1291666.0,
"repeat_count": 0.0,
- "routers_loss": 0.04844636470079422,
+ "routers_loss": 0.04841872677206993,
"skip_count": 1.0,
"step": 798,
"text_loss": 1.011757254600525
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.4000000059604645,
- "avg_layers": 26.0,
+ "acc_skip": 0.20000000298023224,
+ "avg_layers": 27.0,
"epoch": 3.756090402113296,
- "f1_execute": 0.9166666865348816,
+ "f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
- "f1_skip": 0.5714285969734192,
- "grad_norm": 0.1416015625,
+ "f1_skip": 0.3333333134651184,
+ "grad_norm": 0.14453125,
"learning_rate": 0.0009978603632001444,
- "loss": 0.0634,
- "macro_f1": 0.4960317611694336,
+ "loss": 0.0636,
+ "macro_f1": 0.4104308485984802,
"num_tokens": 1294627.0,
"repeat_count": 1.0,
- "routers_loss": 0.1591777801513672,
+ "routers_loss": 0.15698759257793427,
"skip_count": 5.0,
"step": 800,
"text_loss": 0.4457623362541199
@@ -7617,13 +7617,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0009978316643401916,
- "loss": 0.0694,
+ "loss": 0.0688,
"macro_f1": 0.3333333432674408,
"num_tokens": 1297711.0,
"repeat_count": 0.0,
- "routers_loss": 0.017735568806529045,
+ "routers_loss": 0.018952010199427605,
"skip_count": 0.0,
"step": 802,
"text_loss": 0.2069481462240219
@@ -7636,13 +7636,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.14453125,
"learning_rate": 0.0009978027747102062,
- "loss": 0.0477,
+ "loss": 0.0479,
"macro_f1": 0.3333333432674408,
"num_tokens": 1300569.0,
"repeat_count": 0.0,
- "routers_loss": 0.012401525862514973,
+ "routers_loss": 0.014538386836647987,
"skip_count": 0.0,
"step": 804,
"text_loss": 0.4983852505683899
@@ -7655,13 +7655,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2080078125,
+ "grad_norm": 0.2109375,
"learning_rate": 0.0009977736943212584,
- "loss": 0.0735,
+ "loss": 0.0721,
"macro_f1": 0.32098764181137085,
"num_tokens": 1303969.0,
"repeat_count": 0.0,
- "routers_loss": 0.10736164450645447,
+ "routers_loss": 0.11164087057113647,
"skip_count": 2.0,
"step": 806,
"text_loss": 0.2910642921924591
@@ -7674,13 +7674,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2001953125,
+ "grad_norm": 0.1826171875,
"learning_rate": 0.000997744423184492,
- "loss": 0.0428,
+ "loss": 0.0424,
"macro_f1": 0.3272727429866791,
"num_tokens": 1307263.0,
"repeat_count": 0.0,
- "routers_loss": 0.0595436617732048,
+ "routers_loss": 0.06073406711220741,
"skip_count": 1.0,
"step": 808,
"text_loss": 0.18831779062747955
@@ -7693,13 +7693,13 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.26171875,
"learning_rate": 0.0009977149613111236,
- "loss": 0.0494,
+ "loss": 0.0486,
"macro_f1": 0.4400000274181366,
"num_tokens": 1309953.0,
"repeat_count": 1.0,
- "routers_loss": 0.12617000937461853,
+ "routers_loss": 0.11035524308681488,
"skip_count": 4.0,
"step": 810,
"text_loss": 0.7872759699821472
@@ -7712,13 +7712,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009976853087124433,
- "loss": 0.0537,
+ "loss": 0.0536,
"macro_f1": 0.3333333432674408,
"num_tokens": 1313243.0,
"repeat_count": 0.0,
- "routers_loss": 0.021242506802082062,
+ "routers_loss": 0.021804286167025566,
"skip_count": 0.0,
"step": 812,
"text_loss": 0.22349292039871216
@@ -7731,13 +7731,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.318359375,
+ "grad_norm": 0.28125,
"learning_rate": 0.0009976554653998138,
- "loss": 0.0617,
+ "loss": 0.0612,
"macro_f1": 0.31446540355682373,
"num_tokens": 1316165.0,
"repeat_count": 0.0,
- "routers_loss": 0.10387415438890457,
+ "routers_loss": 0.10715524107217789,
"skip_count": 2.0,
"step": 814,
"text_loss": 0.18035532534122467
@@ -7750,13 +7750,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.000997625431384671,
- "loss": 0.0565,
+ "loss": 0.0564,
"macro_f1": 0.3333333432674408,
"num_tokens": 1319206.0,
"repeat_count": 0.0,
- "routers_loss": 0.007816939614713192,
+ "routers_loss": 0.007173649035394192,
"skip_count": 0.0,
"step": 816,
"text_loss": 0.48928648233413696
@@ -7769,13 +7769,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.0009975952066785243,
- "loss": 0.0654,
+ "loss": 0.0655,
"macro_f1": 0.3006536364555359,
"num_tokens": 1322549.0,
"repeat_count": 1.0,
- "routers_loss": 0.22526368498802185,
+ "routers_loss": 0.22308112680912018,
"skip_count": 4.0,
"step": 818,
"text_loss": 0.5211259722709656
@@ -7788,13 +7788,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.0009975647912929557,
- "loss": 0.056,
+ "loss": 0.0564,
"macro_f1": 0.3333333432674408,
"num_tokens": 1325213.0,
"repeat_count": 0.0,
- "routers_loss": 0.010998851619660854,
+ "routers_loss": 0.00998698640614748,
"skip_count": 0.0,
"step": 820,
"text_loss": 0.7117052674293518
@@ -7807,13 +7807,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009975341852396205,
- "loss": 0.0712,
+ "loss": 0.0723,
"macro_f1": 0.32098764181137085,
"num_tokens": 1328383.0,
"repeat_count": 0.0,
- "routers_loss": 0.07115054875612259,
+ "routers_loss": 0.07454588264226913,
"skip_count": 2.0,
"step": 822,
"text_loss": 0.34539610147476196
@@ -7826,13 +7826,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009975033885302469,
- "loss": 0.0611,
+ "loss": 0.0604,
"macro_f1": 0.3333333432674408,
"num_tokens": 1331406.0,
"repeat_count": 0.0,
- "routers_loss": 0.008062695153057575,
+ "routers_loss": 0.009157589636743069,
"skip_count": 0.0,
"step": 824,
"text_loss": 0.7484824657440186
@@ -7845,13 +7845,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.0009974724011766363,
- "loss": 0.0496,
+ "loss": 0.0474,
"macro_f1": 0.3272727429866791,
"num_tokens": 1334410.0,
"repeat_count": 1.0,
- "routers_loss": 0.16666285693645477,
+ "routers_loss": 0.17149391770362854,
"skip_count": 0.0,
"step": 826,
"text_loss": 0.5913820266723633
@@ -7864,13 +7864,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1708984375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009974412231906632,
- "loss": 0.0567,
+ "loss": 0.058,
"macro_f1": 0.32098764181137085,
"num_tokens": 1337653.0,
"repeat_count": 1.0,
- "routers_loss": 0.0908689796924591,
+ "routers_loss": 0.09743282198905945,
"skip_count": 1.0,
"step": 828,
"text_loss": 0.2505693733692169
@@ -7883,13 +7883,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16015625,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009974098545842748,
- "loss": 0.0648,
+ "loss": 0.0638,
"macro_f1": 0.3272727429866791,
"num_tokens": 1340860.0,
"repeat_count": 0.0,
- "routers_loss": 0.04364728182554245,
+ "routers_loss": 0.041490405797958374,
"skip_count": 1.0,
"step": 830,
"text_loss": 0.5585370063781738
@@ -7897,18 +7897,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.906369239800411,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2060546875,
+ "grad_norm": 0.193359375,
"learning_rate": 0.0009973782953694918,
- "loss": 0.0772,
- "macro_f1": 0.3076923191547394,
+ "loss": 0.0746,
+ "macro_f1": 0.3006536066532135,
"num_tokens": 1344232.0,
"repeat_count": 1.0,
- "routers_loss": 0.15315109491348267,
+ "routers_loss": 0.16080693900585175,
"skip_count": 3.0,
"step": 832,
"text_loss": 0.4782734513282776
@@ -7921,13 +7921,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.000997346545558408,
- "loss": 0.0527,
+ "loss": 0.0522,
"macro_f1": 0.3333333432674408,
"num_tokens": 1347667.0,
"repeat_count": 0.0,
- "routers_loss": 0.01342768594622612,
+ "routers_loss": 0.01173500344157219,
"skip_count": 0.0,
"step": 834,
"text_loss": 0.25036177039146423
@@ -7940,13 +7940,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1748046875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009973146051631895,
- "loss": 0.0513,
+ "loss": 0.0522,
"macro_f1": 0.3333333432674408,
"num_tokens": 1350707.0,
"repeat_count": 0.0,
- "routers_loss": 0.01158806961029768,
+ "routers_loss": 0.011477196589112282,
"skip_count": 0.0,
"step": 836,
"text_loss": 0.5482863187789917
@@ -7959,13 +7959,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009972824741960764,
- "loss": 0.0549,
+ "loss": 0.0536,
"macro_f1": 0.3333333432674408,
"num_tokens": 1353704.0,
"repeat_count": 0.0,
- "routers_loss": 0.01255605649203062,
+ "routers_loss": 0.010528896935284138,
"skip_count": 0.0,
"step": 838,
"text_loss": 0.6732596158981323
@@ -7978,13 +7978,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12255859375,
+ "grad_norm": 0.1181640625,
"learning_rate": 0.000997250152669381,
- "loss": 0.0578,
+ "loss": 0.0573,
"macro_f1": 0.3333333432674408,
"num_tokens": 1356608.0,
"repeat_count": 0.0,
- "routers_loss": 0.010225459933280945,
+ "routers_loss": 0.010678744874894619,
"skip_count": 0.0,
"step": 840,
"text_loss": 0.5479338765144348
@@ -7997,13 +7997,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.181640625,
"learning_rate": 0.000997217640595489,
- "loss": 0.0633,
+ "loss": 0.0631,
"macro_f1": 0.3333333432674408,
"num_tokens": 1359809.0,
"repeat_count": 0.0,
- "routers_loss": 0.007837744429707527,
+ "routers_loss": 0.00835978239774704,
"skip_count": 0.0,
"step": 842,
"text_loss": 0.42543259263038635
@@ -8016,13 +8016,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.0009971849379868593,
- "loss": 0.0674,
+ "loss": 0.0653,
"macro_f1": 0.3333333432674408,
"num_tokens": 1362201.0,
"repeat_count": 0.0,
- "routers_loss": 0.008631376549601555,
+ "routers_loss": 0.009930923581123352,
"skip_count": 0.0,
"step": 844,
"text_loss": 0.720462441444397
@@ -8035,13 +8035,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009971520448560235,
- "loss": 0.0612,
+ "loss": 0.0615,
"macro_f1": 0.3272727429866791,
"num_tokens": 1365790.0,
"repeat_count": 0.0,
- "routers_loss": 0.06206027418375015,
+ "routers_loss": 0.06344373524188995,
"skip_count": 1.0,
"step": 846,
"text_loss": 0.8423607349395752
@@ -8049,18 +8049,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 3.9815086586439685,
- "f1_execute": 0.9411765336990356,
+ "f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
- "f1_skip": 0.5,
- "grad_norm": 0.16015625,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.16796875,
"learning_rate": 0.000997118961215586,
- "loss": 0.0678,
- "macro_f1": 0.480392187833786,
+ "loss": 0.0674,
+ "macro_f1": 0.4533333480358124,
"num_tokens": 1368387.0,
"repeat_count": 1.0,
- "routers_loss": 0.1463794708251953,
+ "routers_loss": 0.14688406884670258,
"skip_count": 3.0,
"step": 848,
"text_loss": 0.3933577537536621
@@ -8073,13 +8073,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000997085687078225,
- "loss": 0.052,
+ "loss": 0.0518,
"macro_f1": 0.3333333432674408,
"num_tokens": 1371189.0,
"repeat_count": 0.0,
- "routers_loss": 0.01140492781996727,
+ "routers_loss": 0.009953443892300129,
"skip_count": 0.0,
"step": 850,
"text_loss": 0.41469162702560425
@@ -8092,13 +8092,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.15625,
"learning_rate": 0.0009970522224566909,
- "loss": 0.0563,
+ "loss": 0.0555,
"macro_f1": 0.32098767161369324,
"num_tokens": 1374008.0,
"repeat_count": 0.0,
- "routers_loss": 0.05136030167341232,
+ "routers_loss": 0.048870690166950226,
"skip_count": 1.0,
"step": 852,
"text_loss": 0.613615870475769
@@ -8111,32 +8111,32 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0009970185673638075,
- "loss": 0.0627,
+ "loss": 0.0629,
"macro_f1": 0.32098764181137085,
"num_tokens": 1376662.0,
"repeat_count": 1.0,
- "routers_loss": 0.07274381071329117,
+ "routers_loss": 0.06865929812192917,
"skip_count": 1.0,
"step": 854,
"text_loss": 0.4392736256122589
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 4.01878485471089,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.162109375,
"learning_rate": 0.0009969847218124716,
- "loss": 0.0503,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0506,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1380049.0,
"repeat_count": 0.0,
- "routers_loss": 0.024335317313671112,
+ "routers_loss": 0.02382219396531582,
"skip_count": 1.0,
"step": 856,
"text_loss": 0.19115346670150757
@@ -8149,13 +8149,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009969506858156527,
- "loss": 0.0359,
+ "loss": 0.0344,
"macro_f1": 0.3272727429866791,
"num_tokens": 1383008.0,
"repeat_count": 0.0,
- "routers_loss": 0.046614740043878555,
+ "routers_loss": 0.03907281160354614,
"skip_count": 1.0,
"step": 858,
"text_loss": 0.34842637181282043
@@ -8168,13 +8168,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.12060546875,
"learning_rate": 0.0009969164593863935,
- "loss": 0.0372,
+ "loss": 0.0365,
"macro_f1": 0.3333333432674408,
"num_tokens": 1387051.0,
"repeat_count": 0.0,
- "routers_loss": 0.006380240898579359,
+ "routers_loss": 0.007645803038030863,
"skip_count": 0.0,
"step": 860,
"text_loss": 0.3810436725616455
@@ -8187,13 +8187,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009968820425378098,
- "loss": 0.0473,
+ "loss": 0.0463,
"macro_f1": 0.3272727429866791,
"num_tokens": 1390244.0,
"repeat_count": 1.0,
- "routers_loss": 0.04770716652274132,
+ "routers_loss": 0.04435238987207413,
"skip_count": 0.0,
"step": 862,
"text_loss": 0.34853485226631165
@@ -8206,32 +8206,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3359375,
+ "grad_norm": 0.28515625,
"learning_rate": 0.00099684743528309,
- "loss": 0.0434,
+ "loss": 0.0424,
"macro_f1": 0.3333333432674408,
"num_tokens": 1392976.0,
"repeat_count": 0.0,
- "routers_loss": 0.006983708590269089,
+ "routers_loss": 0.006071661598980427,
"skip_count": 0.0,
"step": 864,
"text_loss": 0.6395178437232971
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 4.065746991488113,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.080078125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.0009968126376354958,
- "loss": 0.0476,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0477,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1396061.0,
"repeat_count": 0.0,
- "routers_loss": 0.046313900500535965,
+ "routers_loss": 0.05011235550045967,
"skip_count": 2.0,
"step": 866,
"text_loss": 0.09103966504335403
@@ -8244,32 +8244,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009967776496083616,
"loss": 0.0509,
"macro_f1": 0.3272727429866791,
"num_tokens": 1398993.0,
"repeat_count": 1.0,
- "routers_loss": 0.0401870422065258,
+ "routers_loss": 0.03979124873876572,
"skip_count": 0.0,
"step": 868,
"text_loss": 0.27257058024406433
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 4.084531846199002,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.14453125,
"learning_rate": 0.000996742471215095,
- "loss": 0.0505,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0516,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1402080.0,
"repeat_count": 0.0,
- "routers_loss": 0.03313451260328293,
+ "routers_loss": 0.030823837965726852,
"skip_count": 2.0,
"step": 870,
"text_loss": 0.7047103047370911
@@ -8282,13 +8282,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009967071024691763,
- "loss": 0.0468,
+ "loss": 0.0461,
"macro_f1": 0.3333333432674408,
"num_tokens": 1404890.0,
"repeat_count": 0.0,
- "routers_loss": 0.010118982754647732,
+ "routers_loss": 0.009721715934574604,
"skip_count": 0.0,
"step": 872,
"text_loss": 0.959106981754303
@@ -8301,13 +8301,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.000996671543384159,
- "loss": 0.0498,
+ "loss": 0.05,
"macro_f1": 0.3333333432674408,
"num_tokens": 1407853.0,
"repeat_count": 0.0,
- "routers_loss": 0.005856200121343136,
+ "routers_loss": 0.006025883834809065,
"skip_count": 0.0,
"step": 874,
"text_loss": 0.47571972012519836
@@ -8320,13 +8320,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.09765625,
"learning_rate": 0.0009966357939736692,
- "loss": 0.0417,
+ "loss": 0.0416,
"macro_f1": 0.3272727429866791,
"num_tokens": 1410723.0,
"repeat_count": 0.0,
- "routers_loss": 0.02768322452902794,
+ "routers_loss": 0.025964925065636635,
"skip_count": 0.0,
"step": 876,
"text_loss": 0.4964611530303955
@@ -8339,13 +8339,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1025390625,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009965998542514065,
- "loss": 0.0419,
+ "loss": 0.0415,
"macro_f1": 0.32098764181137085,
"num_tokens": 1414008.0,
"repeat_count": 0.0,
- "routers_loss": 0.09382032603025436,
+ "routers_loss": 0.09509637206792831,
"skip_count": 2.0,
"step": 878,
"text_loss": 0.621494710445404
@@ -8358,32 +8358,32 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009965637242311427,
- "loss": 0.0466,
+ "loss": 0.0472,
"macro_f1": 0.542222261428833,
"num_tokens": 1417447.0,
"repeat_count": 0.0,
- "routers_loss": 0.026867631822824478,
+ "routers_loss": 0.02520318515598774,
"skip_count": 4.0,
"step": 880,
"text_loss": 0.40209758281707764
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6666666865348816,
- "avg_layers": 24.0,
+ "acc_skip": 0.5,
+ "avg_layers": 25.0,
"epoch": 4.14088641033167,
- "f1_execute": 0.95652174949646,
+ "f1_execute": 0.936170220375061,
"f1_repeat": 0.0,
- "f1_skip": 0.800000011920929,
- "grad_norm": 0.26171875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000996527403926723,
- "loss": 0.0496,
- "macro_f1": 0.5855072736740112,
+ "loss": 0.0495,
+ "macro_f1": 0.5342789888381958,
"num_tokens": 1419905.0,
"repeat_count": 0.0,
- "routers_loss": 0.12731307744979858,
+ "routers_loss": 0.13183781504631042,
"skip_count": 6.0,
"step": 882,
"text_loss": 0.642185389995575
@@ -8396,13 +8396,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009964908933520655,
- "loss": 0.039,
+ "loss": 0.0375,
"macro_f1": 0.3333333432674408,
"num_tokens": 1423436.0,
"repeat_count": 0.0,
- "routers_loss": 0.008483970537781715,
+ "routers_loss": 0.009429510682821274,
"skip_count": 0.0,
"step": 884,
"text_loss": 0.48232755064964294
@@ -8415,13 +8415,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.18359375,
+ "grad_norm": 0.1669921875,
"learning_rate": 0.0009964541925211613,
- "loss": 0.0348,
+ "loss": 0.0349,
"macro_f1": 0.32098764181137085,
"num_tokens": 1426842.0,
"repeat_count": 0.0,
- "routers_loss": 0.07847871631383896,
+ "routers_loss": 0.07629609107971191,
"skip_count": 2.0,
"step": 886,
"text_loss": 0.16620934009552002
@@ -8434,13 +8434,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0009964173014480738,
- "loss": 0.036,
+ "loss": 0.0348,
"macro_f1": 0.5492662787437439,
"num_tokens": 1430430.0,
"repeat_count": 0.0,
- "routers_loss": 0.04574459046125412,
+ "routers_loss": 0.036814019083976746,
"skip_count": 2.0,
"step": 888,
"text_loss": 0.4866008758544922
@@ -8453,13 +8453,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10595703125,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009963802201469398,
- "loss": 0.0485,
+ "loss": 0.0476,
"macro_f1": 0.3333333432674408,
"num_tokens": 1433821.0,
"repeat_count": 0.0,
- "routers_loss": 0.004683624487370253,
+ "routers_loss": 0.0041250260546803474,
"skip_count": 0.0,
"step": 890,
"text_loss": 0.578216552734375
@@ -8472,13 +8472,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "grad_norm": 0.2373046875,
"learning_rate": 0.0009963429486319693,
- "loss": 0.0476,
+ "loss": 0.0463,
"macro_f1": 0.32098764181137085,
"num_tokens": 1436976.0,
"repeat_count": 0.0,
- "routers_loss": 0.06499828398227692,
+ "routers_loss": 0.06213559955358505,
"skip_count": 2.0,
"step": 892,
"text_loss": 0.221701517701149
@@ -8486,18 +8486,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
- "avg_layers": 25.0,
+ "avg_layers": 26.0,
"epoch": 4.197240974464338,
- "f1_execute": 0.9411764740943909,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.4000000059604645,
- "grad_norm": 0.310546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.361328125,
"learning_rate": 0.0009963054869174446,
- "loss": 0.0326,
- "macro_f1": 0.44705885648727417,
+ "loss": 0.0313,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 1440397.0,
"repeat_count": 0.0,
- "routers_loss": 0.08285653591156006,
+ "routers_loss": 0.07532428950071335,
"skip_count": 2.0,
"step": 894,
"text_loss": 0.6922838091850281
@@ -8510,13 +8510,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.0009962678350177209,
- "loss": 0.0497,
+ "loss": 0.0472,
"macro_f1": 0.3272727429866791,
"num_tokens": 1443604.0,
"repeat_count": 0.0,
- "routers_loss": 0.04252336546778679,
+ "routers_loss": 0.0419243648648262,
"skip_count": 1.0,
"step": 896,
"text_loss": 0.22092342376708984
@@ -8524,18 +8524,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 4.216025829175227,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009962299929472268,
- "loss": 0.0349,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.034,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 1446257.0,
"repeat_count": 2.0,
- "routers_loss": 0.126711905002594,
+ "routers_loss": 0.10849297791719437,
"skip_count": 0.0,
"step": 898,
"text_loss": 0.26394811272621155
@@ -8548,13 +8548,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.000996191960720463,
- "loss": 0.0392,
+ "loss": 0.0394,
"macro_f1": 0.3333333432674408,
"num_tokens": 1449669.0,
"repeat_count": 0.0,
- "routers_loss": 0.00955706462264061,
+ "routers_loss": 0.0092767970636487,
"skip_count": 0.0,
"step": 900,
"text_loss": 0.5338577628135681
@@ -8567,13 +8567,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009961537383520042,
- "loss": 0.0377,
+ "loss": 0.0354,
"macro_f1": 0.3272727429866791,
"num_tokens": 1452450.0,
"repeat_count": 1.0,
- "routers_loss": 0.03127318620681763,
+ "routers_loss": 0.02985367365181446,
"skip_count": 0.0,
"step": 902,
"text_loss": 0.5875228047370911
@@ -8586,13 +8586,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.0009961153258564966,
- "loss": 0.0389,
+ "loss": 0.0378,
"macro_f1": 0.3144654333591461,
"num_tokens": 1456909.0,
"repeat_count": 0.0,
- "routers_loss": 0.06743519753217697,
+ "routers_loss": 0.06794842332601547,
"skip_count": 3.0,
"step": 904,
"text_loss": 0.40959444642066956
@@ -8605,13 +8605,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009960767232486604,
- "loss": 0.0477,
+ "loss": 0.0476,
"macro_f1": 0.3333333432674408,
"num_tokens": 1461712.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025313226506114006,
+ "routers_loss": 0.0023562447167932987,
"skip_count": 0.0,
"step": 906,
"text_loss": 0.3932875096797943
@@ -8624,13 +8624,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.08203125,
"learning_rate": 0.000996037930543288,
- "loss": 0.052,
+ "loss": 0.0505,
"macro_f1": 0.3272727429866791,
"num_tokens": 1464817.0,
"repeat_count": 0.0,
- "routers_loss": 0.037147488445043564,
+ "routers_loss": 0.03880339860916138,
"skip_count": 1.0,
"step": 908,
"text_loss": 0.17482402920722961
@@ -8643,13 +8643,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2119140625,
"learning_rate": 0.000995998947755245,
- "loss": 0.0501,
+ "loss": 0.0479,
"macro_f1": 0.3272727429866791,
"num_tokens": 1467810.0,
"repeat_count": 0.0,
- "routers_loss": 0.021232586354017258,
+ "routers_loss": 0.01736828312277794,
"skip_count": 1.0,
"step": 910,
"text_loss": 0.4140470325946808
@@ -8662,13 +8662,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009959597748994695,
- "loss": 0.0759,
+ "loss": 0.0752,
"macro_f1": 0.3333333432674408,
"num_tokens": 1470802.0,
"repeat_count": 0.0,
- "routers_loss": 0.010563847608864307,
+ "routers_loss": 0.011824851855635643,
"skip_count": 0.0,
"step": 912,
"text_loss": 0.7153383493423462
@@ -8681,13 +8681,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009959204119909726,
- "loss": 0.0425,
+ "loss": 0.0421,
"macro_f1": 0.3272727429866791,
"num_tokens": 1474539.0,
"repeat_count": 0.0,
- "routers_loss": 0.0267612524330616,
+ "routers_loss": 0.025456594303250313,
"skip_count": 0.0,
"step": 914,
"text_loss": 0.42812058329582214
@@ -8700,13 +8700,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009958808590448385,
- "loss": 0.0501,
+ "loss": 0.0489,
"macro_f1": 0.3333333432674408,
"num_tokens": 1477552.0,
"repeat_count": 0.0,
- "routers_loss": 0.005838244222104549,
+ "routers_loss": 0.006795851048082113,
"skip_count": 0.0,
"step": 916,
"text_loss": 0.5402814149856567
@@ -8719,13 +8719,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009958411160762234,
- "loss": 0.0383,
+ "loss": 0.039,
"macro_f1": 0.3333333432674408,
"num_tokens": 1482547.0,
"repeat_count": 0.0,
- "routers_loss": 0.014642171561717987,
+ "routers_loss": 0.015615932643413544,
"skip_count": 0.0,
"step": 918,
"text_loss": 0.3836168050765991
@@ -8738,32 +8738,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009958011831003577,
- "loss": 0.0457,
+ "loss": 0.0448,
"macro_f1": 0.3272727429866791,
"num_tokens": 1485807.0,
"repeat_count": 0.0,
- "routers_loss": 0.04119620472192764,
+ "routers_loss": 0.043541423976421356,
"skip_count": 1.0,
"step": 920,
"text_loss": 0.4333936274051666
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 4.328734957440563,
- "f1_execute": 0.943396270275116,
- "f1_repeat": 0.0,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.000995761060132543,
- "loss": 0.0433,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0418,
+ "macro_f1": 0.6538461446762085,
"num_tokens": 1488941.0,
"repeat_count": 1.0,
- "routers_loss": 0.06713195145130157,
+ "routers_loss": 0.05866432189941406,
"skip_count": 2.0,
"step": 922,
"text_loss": 0.4106994867324829
@@ -8776,13 +8776,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009957207471881552,
- "loss": 0.0533,
+ "loss": 0.0531,
"macro_f1": 0.5492662787437439,
"num_tokens": 1492026.0,
"repeat_count": 0.0,
- "routers_loss": 0.024023180827498436,
+ "routers_loss": 0.02714901603758335,
"skip_count": 2.0,
"step": 924,
"text_loss": 0.542091429233551
@@ -8795,13 +8795,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.17578125,
+ "grad_norm": 0.1796875,
"learning_rate": 0.0009956802442826415,
- "loss": 0.0373,
+ "loss": 0.0386,
"macro_f1": 0.3272727429866791,
"num_tokens": 1494543.0,
"repeat_count": 1.0,
- "routers_loss": 0.05399841442704201,
+ "routers_loss": 0.0563737191259861,
"skip_count": 0.0,
"step": 926,
"text_loss": 0.47209203243255615
@@ -8814,13 +8814,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009956395514315235,
- "loss": 0.0488,
+ "loss": 0.0496,
"macro_f1": 0.3272727429866791,
"num_tokens": 1497831.0,
"repeat_count": 1.0,
- "routers_loss": 0.0299264844506979,
+ "routers_loss": 0.03285066783428192,
"skip_count": 0.0,
"step": 928,
"text_loss": 0.6628931164741516
@@ -8833,13 +8833,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009955986686503943,
- "loss": 0.0467,
+ "loss": 0.0466,
"macro_f1": 0.3272727429866791,
"num_tokens": 1501375.0,
"repeat_count": 0.0,
- "routers_loss": 0.023478010669350624,
+ "routers_loss": 0.024297121912240982,
"skip_count": 1.0,
"step": 930,
"text_loss": 0.495676189661026
@@ -8852,13 +8852,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 1.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009955575959549202,
- "loss": 0.0447,
+ "loss": 0.0424,
"macro_f1": 0.7795917987823486,
"num_tokens": 1504363.0,
"repeat_count": 1.0,
- "routers_loss": 0.12116194516420364,
+ "routers_loss": 0.12196464836597443,
"skip_count": 4.0,
"step": 932,
"text_loss": 0.26123273372650146
@@ -8871,13 +8871,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.0009955163333608408,
- "loss": 0.053,
+ "loss": 0.0538,
"macro_f1": 0.3333333432674408,
"num_tokens": 1507178.0,
"repeat_count": 0.0,
- "routers_loss": 0.011879723519086838,
+ "routers_loss": 0.012947078794240952,
"skip_count": 0.0,
"step": 934,
"text_loss": 0.32552677392959595
@@ -8890,13 +8890,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009954748808839674,
- "loss": 0.0373,
+ "loss": 0.0379,
"macro_f1": 0.3333333432674408,
"num_tokens": 1509910.0,
"repeat_count": 0.0,
- "routers_loss": 0.009245929308235645,
+ "routers_loss": 0.008946365676820278,
"skip_count": 0.0,
"step": 936,
"text_loss": 0.533141016960144
@@ -8909,13 +8909,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.140625,
"learning_rate": 0.000995433238540185,
- "loss": 0.0461,
+ "loss": 0.0466,
"macro_f1": 0.6538461446762085,
"num_tokens": 1512826.0,
"repeat_count": 1.0,
- "routers_loss": 0.032464127987623215,
+ "routers_loss": 0.029975678771734238,
"skip_count": 1.0,
"step": 938,
"text_loss": 0.2953577935695648
@@ -8928,13 +8928,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009953914063454512,
- "loss": 0.0515,
+ "loss": 0.0497,
"macro_f1": 0.3144654333591461,
"num_tokens": 1517230.0,
"repeat_count": 1.0,
- "routers_loss": 0.08835392445325851,
+ "routers_loss": 0.0889134630560875,
"skip_count": 2.0,
"step": 940,
"text_loss": 0.5368834733963013
@@ -8947,13 +8947,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.193359375,
"learning_rate": 0.000995349384315796,
- "loss": 0.0405,
+ "loss": 0.0413,
"macro_f1": 0.3333333432674408,
"num_tokens": 1519876.0,
"repeat_count": 0.0,
- "routers_loss": 0.014307246543467045,
+ "routers_loss": 0.013458753935992718,
"skip_count": 0.0,
"step": 942,
"text_loss": 0.2005518227815628
@@ -8966,13 +8966,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.000995307172467322,
- "loss": 0.0449,
+ "loss": 0.0444,
"macro_f1": 0.31446540355682373,
"num_tokens": 1522998.0,
"repeat_count": 1.0,
- "routers_loss": 0.10261563211679459,
+ "routers_loss": 0.08850377053022385,
"skip_count": 1.0,
"step": 944,
"text_loss": 0.227926567196846
@@ -8985,13 +8985,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009952647708162054,
- "loss": 0.0507,
+ "loss": 0.0503,
"macro_f1": 0.3272727429866791,
"num_tokens": 1527100.0,
"repeat_count": 0.0,
- "routers_loss": 0.03316422924399376,
+ "routers_loss": 0.03199794515967369,
"skip_count": 1.0,
"step": 946,
"text_loss": 0.4859686493873596
@@ -9004,13 +9004,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009952221793786942,
- "loss": 0.0352,
+ "loss": 0.0354,
"macro_f1": 0.3333333432674408,
"num_tokens": 1530028.0,
"repeat_count": 0.0,
- "routers_loss": 0.00902469176799059,
+ "routers_loss": 0.006507779937237501,
"skip_count": 0.0,
"step": 948,
"text_loss": 0.6855354905128479
@@ -9023,13 +9023,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.0009951793981711097,
- "loss": 0.0581,
+ "loss": 0.0584,
"macro_f1": 0.6538461446762085,
"num_tokens": 1533254.0,
"repeat_count": 1.0,
- "routers_loss": 0.06710167229175568,
+ "routers_loss": 0.06175103038549423,
"skip_count": 1.0,
"step": 950,
"text_loss": 0.7590400576591492
@@ -9042,13 +9042,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009951364272098458,
- "loss": 0.0294,
+ "loss": 0.0295,
"macro_f1": 0.5492662787437439,
"num_tokens": 1536239.0,
"repeat_count": 0.0,
- "routers_loss": 0.04208769276738167,
+ "routers_loss": 0.03773383051156998,
"skip_count": 2.0,
"step": 952,
"text_loss": 0.669784665107727
@@ -9061,13 +9061,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009950932665113688,
- "loss": 0.0505,
+ "loss": 0.0507,
"macro_f1": 0.32098764181137085,
"num_tokens": 1539682.0,
"repeat_count": 0.0,
- "routers_loss": 0.06530380249023438,
+ "routers_loss": 0.07280613481998444,
"skip_count": 2.0,
"step": 954,
"text_loss": 0.3365570902824402
@@ -9080,13 +9080,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009950499160922184,
- "loss": 0.0545,
+ "loss": 0.0541,
"macro_f1": 0.3333333432674408,
"num_tokens": 1542875.0,
"repeat_count": 0.0,
- "routers_loss": 0.01803453080356121,
+ "routers_loss": 0.01770266517996788,
"skip_count": 0.0,
"step": 956,
"text_loss": 0.0921545997262001
@@ -9099,13 +9099,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.09375,
"learning_rate": 0.000995006375969006,
- "loss": 0.0481,
+ "loss": 0.0473,
"macro_f1": 0.3272727429866791,
"num_tokens": 1547135.0,
"repeat_count": 1.0,
- "routers_loss": 0.08461762219667435,
+ "routers_loss": 0.07672002166509628,
"skip_count": 0.0,
"step": 958,
"text_loss": 0.5887606739997864
@@ -9120,11 +9120,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1376953125,
"learning_rate": 0.0009949626461584165,
- "loss": 0.0441,
+ "loss": 0.043,
"macro_f1": 0.3333333432674408,
"num_tokens": 1550100.0,
"repeat_count": 0.0,
- "routers_loss": 0.007111486047506332,
+ "routers_loss": 0.006247182376682758,
"skip_count": 0.0,
"step": 960,
"text_loss": 0.5777931213378906
@@ -9137,13 +9137,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.119140625,
"learning_rate": 0.0009949187266772076,
- "loss": 0.0361,
+ "loss": 0.0366,
"macro_f1": 0.5492662787437439,
"num_tokens": 1553192.0,
"repeat_count": 0.0,
- "routers_loss": 0.029776185750961304,
+ "routers_loss": 0.030319908633828163,
"skip_count": 2.0,
"step": 962,
"text_loss": 0.2370252162218094
@@ -9156,13 +9156,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009948746175422088,
- "loss": 0.0506,
+ "loss": 0.0511,
"macro_f1": 0.3333333432674408,
"num_tokens": 1556318.0,
"repeat_count": 0.0,
- "routers_loss": 0.007108999416232109,
+ "routers_loss": 0.006004320923238993,
"skip_count": 0.0,
"step": 964,
"text_loss": 0.6271032094955444
@@ -9175,13 +9175,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000994830318770323,
- "loss": 0.0498,
+ "loss": 0.0514,
"macro_f1": 0.3333333432674408,
"num_tokens": 1559195.0,
"repeat_count": 0.0,
- "routers_loss": 0.01126947533339262,
+ "routers_loss": 0.011544366367161274,
"skip_count": 0.0,
"step": 966,
"text_loss": 0.47256720066070557
@@ -9194,13 +9194,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009947858303785255,
- "loss": 0.0366,
+ "loss": 0.0374,
"macro_f1": 0.6603773832321167,
"num_tokens": 1561813.0,
"repeat_count": 1.0,
- "routers_loss": 0.05142999067902565,
+ "routers_loss": 0.05258861929178238,
"skip_count": 1.0,
"step": 968,
"text_loss": 0.7703132629394531
@@ -9213,13 +9213,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.0009947411523838648,
- "loss": 0.0461,
+ "loss": 0.0453,
"macro_f1": 0.3333333432674408,
"num_tokens": 1564634.0,
"repeat_count": 0.0,
- "routers_loss": 0.010770819149911404,
+ "routers_loss": 0.011216280050575733,
"skip_count": 0.0,
"step": 970,
"text_loss": 0.4666804075241089
@@ -9232,13 +9232,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009946962848034608,
- "loss": 0.0692,
+ "loss": 0.0696,
"macro_f1": 0.3333333432674408,
"num_tokens": 1567959.0,
"repeat_count": 0.0,
- "routers_loss": 0.008775795809924603,
+ "routers_loss": 0.009387624450027943,
"skip_count": 0.0,
"step": 972,
"text_loss": 0.4067264199256897
@@ -9251,13 +9251,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.203125,
"learning_rate": 0.0009946512276545075,
- "loss": 0.0403,
+ "loss": 0.0397,
"macro_f1": 0.3272727429866791,
"num_tokens": 1571221.0,
"repeat_count": 1.0,
- "routers_loss": 0.05100395902991295,
+ "routers_loss": 0.041713520884513855,
"skip_count": 0.0,
"step": 974,
"text_loss": 0.5242366194725037
@@ -9270,13 +9270,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.228515625,
"learning_rate": 0.0009946059809542705,
- "loss": 0.0503,
+ "loss": 0.0487,
"macro_f1": 0.7644445300102234,
"num_tokens": 1575033.0,
"repeat_count": 2.0,
- "routers_loss": 0.06653711199760437,
+ "routers_loss": 0.05748331546783447,
"skip_count": 2.0,
"step": 976,
"text_loss": 0.5704690217971802
@@ -9284,18 +9284,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 4.591722923393014,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0009945605447200887,
- "loss": 0.0435,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0445,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1579050.0,
"repeat_count": 0.0,
- "routers_loss": 0.009865665808320045,
+ "routers_loss": 0.016765203326940536,
"skip_count": 0.0,
"step": 978,
"text_loss": 0.4804173707962036
@@ -9308,13 +9308,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.0009945149189693732,
- "loss": 0.0399,
+ "loss": 0.0406,
"macro_f1": 0.5492662787437439,
"num_tokens": 1582967.0,
"repeat_count": 0.0,
- "routers_loss": 0.021175632253289223,
+ "routers_loss": 0.021518222987651825,
"skip_count": 2.0,
"step": 980,
"text_loss": 0.4138598144054413
@@ -9327,32 +9327,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0009944691037196078,
- "loss": 0.0472,
+ "loss": 0.0456,
"macro_f1": 0.3333333432674408,
"num_tokens": 1586282.0,
"repeat_count": 0.0,
- "routers_loss": 0.011803832836449146,
+ "routers_loss": 0.012246460653841496,
"skip_count": 0.0,
"step": 982,
"text_loss": 0.22561736404895782
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 0.5,
"acc_skip": 0.800000011920929,
- "avg_layers": 23.0,
+ "avg_layers": 24.0,
"epoch": 4.6199002054593485,
- "f1_execute": 0.9090908765792847,
- "f1_repeat": 0.0,
+ "f1_execute": 0.930232584476471,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 0.8000000715255737,
- "grad_norm": 0.142578125,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009944230989883491,
- "loss": 0.0467,
- "macro_f1": 0.5696970224380493,
+ "loss": 0.0456,
+ "macro_f1": 0.7989664077758789,
"num_tokens": 1589279.0,
"repeat_count": 2.0,
- "routers_loss": 0.08856551349163055,
+ "routers_loss": 0.09344895929098129,
"skip_count": 5.0,
"step": 984,
"text_loss": 0.4416656494140625
@@ -9365,13 +9365,13 @@
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.111328125,
"learning_rate": 0.0009943769047932264,
- "loss": 0.0413,
+ "loss": 0.0404,
"macro_f1": 0.5359477400779724,
"num_tokens": 1592398.0,
"repeat_count": 2.0,
- "routers_loss": 0.08593414723873138,
+ "routers_loss": 0.08916857838630676,
"skip_count": 2.0,
"step": 986,
"text_loss": 0.5536438822746277
@@ -9384,13 +9384,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000994330521151941,
- "loss": 0.0399,
+ "loss": 0.039,
"macro_f1": 0.32098764181137085,
"num_tokens": 1596213.0,
"repeat_count": 1.0,
- "routers_loss": 0.07049509882926941,
+ "routers_loss": 0.06114347651600838,
"skip_count": 1.0,
"step": 988,
"text_loss": 0.5835405588150024
@@ -9403,13 +9403,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.1953125,
"learning_rate": 0.000994283948082267,
- "loss": 0.0595,
+ "loss": 0.0573,
"macro_f1": 0.3333333432674408,
"num_tokens": 1598827.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019258069805800915,
+ "routers_loss": 0.0017335431184619665,
"skip_count": 0.0,
"step": 990,
"text_loss": 0.5857380032539368
@@ -9422,13 +9422,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009942371856020522,
- "loss": 0.0335,
+ "loss": 0.0341,
"macro_f1": 0.3333333432674408,
"num_tokens": 1602915.0,
"repeat_count": 0.0,
- "routers_loss": 0.014094089157879353,
+ "routers_loss": 0.014606470242142677,
"skip_count": 0.0,
"step": 992,
"text_loss": 0.6939892768859863
@@ -9436,18 +9436,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 31.0,
"epoch": 4.666862342236572,
- "f1_execute": 0.9583333134651184,
+ "f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009941902337292155,
- "loss": 0.0603,
- "macro_f1": 0.6527777910232544,
+ "loss": 0.06,
+ "macro_f1": 0.6598639488220215,
"num_tokens": 1605776.0,
"repeat_count": 3.0,
- "routers_loss": 0.06360147893428802,
+ "routers_loss": 0.06297315657138824,
"skip_count": 1.0,
"step": 994,
"text_loss": 0.37616831064224243
@@ -9460,13 +9460,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009941430924817487,
- "loss": 0.0573,
+ "loss": 0.0572,
"macro_f1": 0.5492662787437439,
"num_tokens": 1609856.0,
"repeat_count": 0.0,
- "routers_loss": 0.0326208658516407,
+ "routers_loss": 0.03297794610261917,
"skip_count": 2.0,
"step": 996,
"text_loss": 0.2098303586244583
@@ -9479,13 +9479,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09912109375,
+ "grad_norm": 0.10107421875,
"learning_rate": 0.000994095761877717,
- "loss": 0.0502,
+ "loss": 0.0499,
"macro_f1": 0.3333333432674408,
"num_tokens": 1612904.0,
"repeat_count": 0.0,
- "routers_loss": 0.012660752050578594,
+ "routers_loss": 0.012901155278086662,
"skip_count": 0.0,
"step": 998,
"text_loss": 0.20103533565998077
@@ -9498,13 +9498,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.265625,
+ "grad_norm": 0.259765625,
"learning_rate": 0.000994048241935257,
- "loss": 0.0537,
+ "loss": 0.0535,
"macro_f1": 0.3272727429866791,
"num_tokens": 1615540.0,
"repeat_count": 0.0,
- "routers_loss": 0.021756287664175034,
+ "routers_loss": 0.020434845238924026,
"skip_count": 0.0,
"step": 1000,
"text_loss": 0.32709044218063354
@@ -9512,37 +9512,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 4.70443205165835,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1669921875,
"learning_rate": 0.0009940005326725789,
- "loss": 0.0447,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.0453,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 1618786.0,
"repeat_count": 0.0,
- "routers_loss": 0.07292548567056656,
+ "routers_loss": 0.07831378281116486,
"skip_count": 2.0,
"step": 1002,
"text_loss": 0.5789632797241211
},
{
- "acc_repeat": 0.5,
+ "acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 4.713824479013795,
- "f1_execute": 0.9811320900917053,
- "f1_repeat": 0.6666666865348816,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1787109375,
+ "grad_norm": 0.21875,
"learning_rate": 0.0009939526341079647,
- "loss": 0.0505,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0511,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 1621736.0,
"repeat_count": 2.0,
- "routers_loss": 0.03397528454661369,
+ "routers_loss": 0.04863874986767769,
"skip_count": 0.0,
"step": 1004,
"text_loss": 0.6128849387168884
@@ -9555,13 +9555,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009939045462597693,
- "loss": 0.0544,
+ "loss": 0.0538,
"macro_f1": 0.3333333432674408,
"num_tokens": 1624649.0,
"repeat_count": 0.0,
- "routers_loss": 0.005987613927572966,
+ "routers_loss": 0.00677989237010479,
"skip_count": 0.0,
"step": 1006,
"text_loss": 0.6168264150619507
@@ -9574,13 +9574,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009938562691464202,
- "loss": 0.0522,
+ "loss": 0.0524,
"macro_f1": 0.3333333432674408,
"num_tokens": 1627700.0,
"repeat_count": 0.0,
- "routers_loss": 0.021656684577465057,
+ "routers_loss": 0.019490402191877365,
"skip_count": 0.0,
"step": 1008,
"text_loss": 0.17463822662830353
@@ -9593,32 +9593,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.000993807802786417,
- "loss": 0.0487,
+ "loss": 0.0475,
"macro_f1": 0.3333333432674408,
"num_tokens": 1630714.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014992234064266086,
+ "routers_loss": 0.0019022391643375158,
"skip_count": 0.0,
"step": 1010,
"text_loss": 0.5675593018531799
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.5,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 4.751394188435574,
- "f1_execute": 0.9411764740943909,
- "f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.158203125,
+ "f1_execute": 0.9599999785423279,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009937591471983322,
- "loss": 0.0491,
- "macro_f1": 0.5359477400779724,
+ "loss": 0.0501,
+ "macro_f1": 0.7644444704055786,
"num_tokens": 1633770.0,
"repeat_count": 1.0,
- "routers_loss": 0.03448791801929474,
+ "routers_loss": 0.042485643178224564,
"skip_count": 2.0,
"step": 1012,
"text_loss": 0.42387229204177856
@@ -9631,13 +9631,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0009937103024008109,
- "loss": 0.0541,
+ "loss": 0.0545,
"macro_f1": 0.3272727429866791,
"num_tokens": 1637120.0,
"repeat_count": 0.0,
- "routers_loss": 0.08285929262638092,
+ "routers_loss": 0.09427817165851593,
"skip_count": 1.0,
"step": 1014,
"text_loss": 0.49511051177978516
@@ -9650,13 +9650,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.125,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009936612684125702,
- "loss": 0.0515,
+ "loss": 0.0503,
"macro_f1": 0.3333333432674408,
"num_tokens": 1640165.0,
"repeat_count": 0.0,
- "routers_loss": 0.00486504752188921,
+ "routers_loss": 0.005106127820909023,
"skip_count": 0.0,
"step": 1016,
"text_loss": 0.5398799180984497
@@ -9669,13 +9669,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.2734375,
"learning_rate": 0.0009936120452524004,
- "loss": 0.051,
+ "loss": 0.0506,
"macro_f1": 0.3333333432674408,
"num_tokens": 1643251.0,
"repeat_count": 0.0,
- "routers_loss": 0.017805909737944603,
+ "routers_loss": 0.016914300620555878,
"skip_count": 0.0,
"step": 1018,
"text_loss": 0.20882178843021393
@@ -9688,13 +9688,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009935626329391637,
- "loss": 0.0547,
+ "loss": 0.0537,
"macro_f1": 0.32098764181137085,
"num_tokens": 1646560.0,
"repeat_count": 0.0,
- "routers_loss": 0.12958799302577972,
+ "routers_loss": 0.13481520116329193,
"skip_count": 2.0,
"step": 1020,
"text_loss": 0.5719883441925049
@@ -9707,13 +9707,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009935130314917948,
- "loss": 0.0595,
+ "loss": 0.0602,
"macro_f1": 0.5492662787437439,
"num_tokens": 1649538.0,
"repeat_count": 0.0,
- "routers_loss": 0.07447081059217453,
+ "routers_loss": 0.07700438797473907,
"skip_count": 2.0,
"step": 1022,
"text_loss": 0.1303367167711258
@@ -9726,13 +9726,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009934632409293015,
- "loss": 0.0619,
+ "loss": 0.0611,
"macro_f1": 0.32098764181137085,
"num_tokens": 1652397.0,
"repeat_count": 1.0,
- "routers_loss": 0.12529553472995758,
+ "routers_loss": 0.11416907608509064,
"skip_count": 1.0,
"step": 1024,
"text_loss": 0.24076920747756958
@@ -9745,13 +9745,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.306640625,
"learning_rate": 0.0009934132612707631,
- "loss": 0.0491,
+ "loss": 0.0507,
"macro_f1": 0.31446540355682373,
"num_tokens": 1654938.0,
"repeat_count": 0.0,
- "routers_loss": 0.08664281666278839,
+ "routers_loss": 0.09484589844942093,
"skip_count": 2.0,
"step": 1026,
"text_loss": 0.1652517318725586
@@ -9764,13 +9764,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009933630925353324,
- "loss": 0.0394,
+ "loss": 0.0395,
"macro_f1": 0.3333333432674408,
"num_tokens": 1658536.0,
"repeat_count": 0.0,
- "routers_loss": 0.0067965323105454445,
+ "routers_loss": 0.00741987070068717,
"skip_count": 0.0,
"step": 1028,
"text_loss": 0.49296700954437256
@@ -9783,13 +9783,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1845703125,
"learning_rate": 0.0009933127347422337,
- "loss": 0.0607,
+ "loss": 0.0602,
"macro_f1": 0.32098764181137085,
"num_tokens": 1661446.0,
"repeat_count": 0.0,
- "routers_loss": 0.08319470286369324,
+ "routers_loss": 0.08399344235658646,
"skip_count": 2.0,
"step": 1030,
"text_loss": 0.22363591194152832
@@ -9802,13 +9802,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.158203125,
"learning_rate": 0.0009932621879107648,
- "loss": 0.0476,
+ "loss": 0.0475,
"macro_f1": 0.3333333432674408,
"num_tokens": 1664612.0,
"repeat_count": 0.0,
- "routers_loss": 0.002826537238433957,
+ "routers_loss": 0.0031781597062945366,
"skip_count": 0.0,
"step": 1032,
"text_loss": 0.36083245277404785
@@ -9823,11 +9823,11 @@
"f1_skip": 0.0,
"grad_norm": 0.2275390625,
"learning_rate": 0.000993211452060295,
- "loss": 0.0431,
+ "loss": 0.042,
"macro_f1": 0.3272727429866791,
"num_tokens": 1667467.0,
"repeat_count": 0.0,
- "routers_loss": 0.03491095453500748,
+ "routers_loss": 0.03595469892024994,
"skip_count": 1.0,
"step": 1034,
"text_loss": 0.16372856497764587
@@ -9840,13 +9840,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.173828125,
+ "grad_norm": 0.189453125,
"learning_rate": 0.000993160527210266,
- "loss": 0.0616,
+ "loss": 0.061,
"macro_f1": 0.3144654333591461,
"num_tokens": 1670675.0,
"repeat_count": 3.0,
- "routers_loss": 0.1828247457742691,
+ "routers_loss": 0.1597205102443695,
"skip_count": 0.0,
"step": 1036,
"text_loss": 0.6049913763999939
@@ -9859,13 +9859,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2099609375,
+ "grad_norm": 0.2197265625,
"learning_rate": 0.000993109413380193,
- "loss": 0.0563,
+ "loss": 0.0562,
"macro_f1": 0.3333333432674408,
"num_tokens": 1673477.0,
"repeat_count": 0.0,
- "routers_loss": 0.010931054130196571,
+ "routers_loss": 0.009756010957062244,
"skip_count": 0.0,
"step": 1038,
"text_loss": 0.7034620642662048
@@ -9878,13 +9878,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.158203125,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.0009930581105896624,
- "loss": 0.0569,
+ "loss": 0.0559,
"macro_f1": 0.3272727429866791,
"num_tokens": 1676809.0,
"repeat_count": 0.0,
- "routers_loss": 0.023222090676426888,
+ "routers_loss": 0.020718922838568687,
"skip_count": 0.0,
"step": 1040,
"text_loss": 0.2814720571041107
@@ -9897,13 +9897,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.0009930066188583338,
- "loss": 0.0453,
+ "loss": 0.0445,
"macro_f1": 0.32098764181137085,
"num_tokens": 1679398.0,
"repeat_count": 1.0,
- "routers_loss": 0.07085686922073364,
+ "routers_loss": 0.04755603149533272,
"skip_count": 1.0,
"step": 1042,
"text_loss": 0.5445759296417236
@@ -9916,13 +9916,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.126953125,
"learning_rate": 0.0009929549382059388,
- "loss": 0.0515,
+ "loss": 0.0509,
"macro_f1": 0.3333333432674408,
"num_tokens": 1682269.0,
"repeat_count": 0.0,
- "routers_loss": 0.010158216580748558,
+ "routers_loss": 0.01040949858725071,
"skip_count": 0.0,
"step": 1044,
"text_loss": 0.2876914143562317
@@ -9935,13 +9935,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009929030686522816,
- "loss": 0.0372,
+ "loss": 0.0363,
"macro_f1": 0.3333333432674408,
"num_tokens": 1685428.0,
"repeat_count": 0.0,
- "routers_loss": 0.007876895368099213,
+ "routers_loss": 0.008158888667821884,
"skip_count": 0.0,
"step": 1046,
"text_loss": 0.49053525924682617
@@ -9954,13 +9954,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009928510102172386,
- "loss": 0.0501,
+ "loss": 0.0498,
"macro_f1": 0.3333333432674408,
"num_tokens": 1688252.0,
"repeat_count": 0.0,
- "routers_loss": 0.004859173204749823,
+ "routers_loss": 0.005102572031319141,
"skip_count": 0.0,
"step": 1048,
"text_loss": 0.5274341106414795
@@ -9973,13 +9973,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.17578125,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0009927987629207587,
- "loss": 0.0582,
+ "loss": 0.0564,
"macro_f1": 0.3333333432674408,
"num_tokens": 1691289.0,
"repeat_count": 0.0,
- "routers_loss": 0.01798083633184433,
+ "routers_loss": 0.016768503934144974,
"skip_count": 0.0,
"step": 1050,
"text_loss": 0.9935035109519958
@@ -9987,18 +9987,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 4.939242735544467,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009927463267828634,
"loss": 0.0488,
- "macro_f1": 0.3272727429866791,
+ "macro_f1": 0.3333333432674408,
"num_tokens": 1694148.0,
"repeat_count": 0.0,
- "routers_loss": 0.014295363798737526,
+ "routers_loss": 0.010905829258263111,
"skip_count": 0.0,
"step": 1052,
"text_loss": 0.20895758271217346
@@ -10011,13 +10011,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.000992693701823646,
- "loss": 0.0635,
+ "loss": 0.0624,
"macro_f1": 0.3272727429866791,
"num_tokens": 1698543.0,
"repeat_count": 1.0,
- "routers_loss": 0.1038367822766304,
+ "routers_loss": 0.10533971339464188,
"skip_count": 0.0,
"step": 1054,
"text_loss": 0.5776236653327942
@@ -10030,13 +10030,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.255859375,
"learning_rate": 0.0009926408880632726,
- "loss": 0.057,
+ "loss": 0.0556,
"macro_f1": 0.3272727429866791,
"num_tokens": 1702460.0,
"repeat_count": 0.0,
- "routers_loss": 0.029780643060803413,
+ "routers_loss": 0.026313411071896553,
"skip_count": 1.0,
"step": 1056,
"text_loss": 0.34990596771240234
@@ -10049,13 +10049,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10107421875,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0009925878855219818,
- "loss": 0.0398,
+ "loss": 0.0391,
"macro_f1": 0.3333333432674408,
"num_tokens": 1705686.0,
"repeat_count": 0.0,
- "routers_loss": 0.008537676185369492,
+ "routers_loss": 0.007763393223285675,
"skip_count": 0.0,
"step": 1058,
"text_loss": 0.4980163276195526
@@ -10068,13 +10068,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.171875,
+ "grad_norm": 0.177734375,
"learning_rate": 0.000992534694220084,
- "loss": 0.0617,
+ "loss": 0.0613,
"macro_f1": 0.3272727429866791,
"num_tokens": 1708739.0,
"repeat_count": 0.0,
- "routers_loss": 0.03966755419969559,
+ "routers_loss": 0.03998444974422455,
"skip_count": 1.0,
"step": 1060,
"text_loss": 0.29092350602149963
@@ -10087,13 +10087,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1484375,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.000992481314177962,
- "loss": 0.0311,
+ "loss": 0.0312,
"macro_f1": 0.32098764181137085,
"num_tokens": 1711903.0,
"repeat_count": 1.0,
- "routers_loss": 0.06651833653450012,
+ "routers_loss": 0.06966045498847961,
"skip_count": 1.0,
"step": 1062,
"text_loss": 0.6267179250717163
@@ -10106,13 +10106,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2431640625,
+ "grad_norm": 0.244140625,
"learning_rate": 0.0009924277454160717,
- "loss": 0.0557,
+ "loss": 0.0548,
"macro_f1": 0.3272727429866791,
"num_tokens": 1715974.0,
"repeat_count": 0.0,
- "routers_loss": 0.05130369961261749,
+ "routers_loss": 0.05536063387989998,
"skip_count": 1.0,
"step": 1064,
"text_loss": 0.5813798904418945
@@ -10125,13 +10125,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009923739879549402,
- "loss": 0.0435,
+ "loss": 0.0423,
"macro_f1": 0.3333333432674408,
"num_tokens": 1718828.0,
"repeat_count": 0.0,
- "routers_loss": 0.020534176379442215,
+ "routers_loss": 0.020993782207369804,
"skip_count": 0.0,
"step": 1066,
"text_loss": 0.22665327787399292
@@ -10144,13 +10144,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009923200418151677,
- "loss": 0.0305,
+ "loss": 0.0301,
"macro_f1": 0.3333333432674408,
"num_tokens": 1722419.0,
"repeat_count": 0.0,
- "routers_loss": 0.007514918688684702,
+ "routers_loss": 0.007351701147854328,
"skip_count": 0.0,
"step": 1068,
"text_loss": 0.5796169638633728
@@ -10163,13 +10163,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009922659070174264,
- "loss": 0.0461,
+ "loss": 0.0452,
"macro_f1": 0.3272727429866791,
"num_tokens": 1725663.0,
"repeat_count": 1.0,
- "routers_loss": 0.024598751217126846,
+ "routers_loss": 0.026033315807580948,
"skip_count": 0.0,
"step": 1070,
"text_loss": 0.25742828845977783
@@ -10182,32 +10182,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009922115835824612,
- "loss": 0.0408,
+ "loss": 0.041,
"macro_f1": 0.3333333432674408,
"num_tokens": 1729239.0,
"repeat_count": 0.0,
- "routers_loss": 0.011866633780300617,
+ "routers_loss": 0.0118600158020854,
"skip_count": 0.0,
"step": 1072,
"text_loss": 0.21630282700061798
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 5.042265923099501,
- "f1_execute": 0.9818181991577148,
- "f1_repeat": 0.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009921570715310884,
- "loss": 0.036,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0364,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 1732507.0,
"repeat_count": 1.0,
- "routers_loss": 0.01755746826529503,
+ "routers_loss": 0.016118815168738365,
"skip_count": 0.0,
"step": 1074,
"text_loss": 0.5639925003051758
@@ -10220,13 +10220,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009921023708841974,
- "loss": 0.0415,
+ "loss": 0.0407,
"macro_f1": 0.3333333432674408,
"num_tokens": 1736182.0,
"repeat_count": 0.0,
- "routers_loss": 0.003976983483880758,
+ "routers_loss": 0.004275390412658453,
"skip_count": 0.0,
"step": 1076,
"text_loss": 0.5758615136146545
@@ -10239,13 +10239,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009920474816627496,
- "loss": 0.0378,
+ "loss": 0.037,
"macro_f1": 0.3333333432674408,
"num_tokens": 1739559.0,
"repeat_count": 0.0,
- "routers_loss": 0.013548235408961773,
+ "routers_loss": 0.01299292128533125,
"skip_count": 0.0,
"step": 1078,
"text_loss": 0.18221625685691833
@@ -10258,13 +10258,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009919924038877788,
"loss": 0.0343,
"macro_f1": 0.32098764181137085,
"num_tokens": 1742890.0,
"repeat_count": 0.0,
- "routers_loss": 0.03923165053129196,
+ "routers_loss": 0.038295745849609375,
"skip_count": 2.0,
"step": 1080,
"text_loss": 0.17354349792003632
@@ -10277,13 +10277,13 @@
"f1_execute": 0.9583333134651184,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009919371375803905,
- "loss": 0.0464,
+ "loss": 0.0455,
"macro_f1": 0.8194444179534912,
"num_tokens": 1746433.0,
"repeat_count": 2.0,
- "routers_loss": 0.046429626643657684,
+ "routers_loss": 0.04052971675992012,
"skip_count": 3.0,
"step": 1082,
"text_loss": 0.2250112146139145
@@ -10296,13 +10296,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1025390625,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009918816827617632,
- "loss": 0.0346,
+ "loss": 0.0353,
"macro_f1": 0.3333333432674408,
"num_tokens": 1750802.0,
"repeat_count": 0.0,
- "routers_loss": 0.008998732082545757,
+ "routers_loss": 0.009114136919379234,
"skip_count": 0.0,
"step": 1084,
"text_loss": 0.2526719272136688
@@ -10315,13 +10315,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.000991826039453147,
- "loss": 0.0386,
+ "loss": 0.0392,
"macro_f1": 0.3333333432674408,
"num_tokens": 1754272.0,
"repeat_count": 0.0,
- "routers_loss": 0.005173585377633572,
+ "routers_loss": 0.004904678091406822,
"skip_count": 0.0,
"step": 1086,
"text_loss": 0.7308789491653442
@@ -10334,13 +10334,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.138671875,
"learning_rate": 0.000991770207675865,
- "loss": 0.0308,
+ "loss": 0.0327,
"macro_f1": 0.6666666865348816,
"num_tokens": 1757231.0,
"repeat_count": 0.0,
- "routers_loss": 0.024098891764879227,
+ "routers_loss": 0.02129189297556877,
"skip_count": 2.0,
"step": 1088,
"text_loss": 0.21764220297336578
@@ -10353,13 +10353,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009917141874513113,
"loss": 0.0315,
"macro_f1": 0.3333333432674408,
"num_tokens": 1760003.0,
"repeat_count": 0.0,
- "routers_loss": 0.014002764597535133,
+ "routers_loss": 0.01310618408024311,
"skip_count": 0.0,
"step": 1090,
"text_loss": 0.33892181515693665
@@ -10372,32 +10372,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009916579788009537,
- "loss": 0.0462,
+ "loss": 0.0457,
"macro_f1": 0.5492662787437439,
"num_tokens": 1763052.0,
"repeat_count": 0.0,
- "routers_loss": 0.017871137708425522,
+ "routers_loss": 0.02059309557080269,
"skip_count": 2.0,
"step": 1092,
"text_loss": 0.6551769375801086
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.136190196653947,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1044921875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10546875,
"learning_rate": 0.0009916015817463312,
"loss": 0.0385,
- "macro_f1": 0.32098764181137085,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1766655.0,
"repeat_count": 0.0,
- "routers_loss": 0.033123619854450226,
+ "routers_loss": 0.0274797435849905,
"skip_count": 2.0,
"step": 1094,
"text_loss": 0.3984372019767761
@@ -10410,13 +10410,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.000991544996309055,
- "loss": 0.0267,
+ "loss": 0.0271,
"macro_f1": 0.3333333432674408,
"num_tokens": 1769997.0,
"repeat_count": 0.0,
- "routers_loss": 0.01279227901250124,
+ "routers_loss": 0.01437368243932724,
"skip_count": 0.0,
"step": 1096,
"text_loss": 0.4203338921070099
@@ -10429,13 +10429,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.000991488222510809,
- "loss": 0.0295,
+ "loss": 0.0292,
"macro_f1": 0.3333333432674408,
"num_tokens": 1773130.0,
"repeat_count": 0.0,
- "routers_loss": 0.001354650012217462,
+ "routers_loss": 0.001382062560878694,
"skip_count": 0.0,
"step": 1098,
"text_loss": 0.43132516741752625
@@ -10448,13 +10448,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.123046875,
"learning_rate": 0.000991431260373349,
- "loss": 0.0326,
+ "loss": 0.0329,
"macro_f1": 0.3144654333591461,
"num_tokens": 1775682.0,
"repeat_count": 1.0,
- "routers_loss": 0.1097714751958847,
+ "routers_loss": 0.1115434318780899,
"skip_count": 2.0,
"step": 1100,
"text_loss": 0.3218227028846741
@@ -10467,13 +10467,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.111328125,
"learning_rate": 0.000991374109918503,
- "loss": 0.0187,
+ "loss": 0.0185,
"macro_f1": 0.3333333432674408,
"num_tokens": 1778407.0,
"repeat_count": 0.0,
- "routers_loss": 0.009649592451751232,
+ "routers_loss": 0.009529678151011467,
"skip_count": 0.0,
"step": 1102,
"text_loss": 0.17183731496334076
@@ -10486,13 +10486,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.11083984375,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.000991316771168171,
- "loss": 0.0447,
+ "loss": 0.044,
"macro_f1": 0.5492662787437439,
"num_tokens": 1781518.0,
"repeat_count": 0.0,
- "routers_loss": 0.020858706906437874,
+ "routers_loss": 0.018668074160814285,
"skip_count": 2.0,
"step": 1104,
"text_loss": 1.1324785947799683
@@ -10505,13 +10505,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.125,
"learning_rate": 0.0009912592441443258,
- "loss": 0.0428,
+ "loss": 0.0411,
"macro_f1": 0.3272727429866791,
"num_tokens": 1784878.0,
"repeat_count": 0.0,
- "routers_loss": 0.048101235181093216,
+ "routers_loss": 0.04145100712776184,
"skip_count": 1.0,
"step": 1106,
"text_loss": 0.6082063317298889
@@ -10524,13 +10524,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009912015288690112,
- "loss": 0.0435,
+ "loss": 0.0421,
"macro_f1": 0.3272727429866791,
"num_tokens": 1788978.0,
"repeat_count": 0.0,
- "routers_loss": 0.02875671721994877,
+ "routers_loss": 0.021450644358992577,
"skip_count": 1.0,
"step": 1108,
"text_loss": 0.5597621202468872
@@ -10543,13 +10543,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08349609375,
+ "grad_norm": 0.083984375,
"learning_rate": 0.0009911436253643444,
- "loss": 0.0247,
+ "loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 1792321.0,
"repeat_count": 0.0,
- "routers_loss": 0.019005145877599716,
+ "routers_loss": 0.017405325546860695,
"skip_count": 0.0,
"step": 1110,
"text_loss": 0.2560598850250244
@@ -10562,13 +10562,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.0009910855336525137,
- "loss": 0.0393,
+ "loss": 0.0383,
"macro_f1": 0.3333333432674408,
"num_tokens": 1795182.0,
"repeat_count": 0.0,
- "routers_loss": 0.007238700054585934,
+ "routers_loss": 0.007162237539887428,
"skip_count": 0.0,
"step": 1112,
"text_loss": 0.3438240587711334
@@ -10581,13 +10581,13 @@
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.125,
+ "grad_norm": 0.115234375,
"learning_rate": 0.00099102725375578,
"loss": 0.0326,
"macro_f1": 0.480392187833786,
"num_tokens": 1798987.0,
"repeat_count": 1.0,
- "routers_loss": 0.12206140905618668,
+ "routers_loss": 0.11149197816848755,
"skip_count": 3.0,
"step": 1114,
"text_loss": 0.20455503463745117
@@ -10595,18 +10595,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 5.239506897563839,
- "f1_execute": 0.8799999952316284,
+ "f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.0009909687856964767,
- "loss": 0.0366,
- "macro_f1": 0.29333335161209106,
+ "loss": 0.035,
+ "macro_f1": 0.3006536364555359,
"num_tokens": 1802064.0,
"repeat_count": 2.0,
- "routers_loss": 0.15721899271011353,
+ "routers_loss": 0.12679415941238403,
"skip_count": 3.0,
"step": 1116,
"text_loss": 0.11996729671955109
@@ -10619,32 +10619,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.125,
+ "grad_norm": 0.12451171875,
"learning_rate": 0.0009909101294970082,
- "loss": 0.0366,
+ "loss": 0.0365,
"macro_f1": 0.5492662787437439,
"num_tokens": 1805412.0,
"repeat_count": 0.0,
- "routers_loss": 0.05058665946125984,
+ "routers_loss": 0.05108053982257843,
"skip_count": 2.0,
"step": 1118,
"text_loss": 0.13224145770072937
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 5.258291752274729,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.123046875,
"learning_rate": 0.0009908512851798522,
- "loss": 0.0454,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0455,
+ "macro_f1": 0.6603773832321167,
"num_tokens": 1808196.0,
"repeat_count": 1.0,
- "routers_loss": 0.023021472617983818,
+ "routers_loss": 0.02131766639649868,
"skip_count": 1.0,
"step": 1120,
"text_loss": 0.7824069261550903
@@ -10657,13 +10657,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1435546875,
+ "grad_norm": 0.138671875,
"learning_rate": 0.0009907922527675576,
- "loss": 0.0409,
+ "loss": 0.0405,
"macro_f1": 0.3333333432674408,
"num_tokens": 1811622.0,
"repeat_count": 0.0,
- "routers_loss": 0.006660689599812031,
+ "routers_loss": 0.006226244382560253,
"skip_count": 0.0,
"step": 1122,
"text_loss": 0.5419743061065674
@@ -10676,13 +10676,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.12890625,
"learning_rate": 0.000990733032282746,
- "loss": 0.0547,
+ "loss": 0.0535,
"macro_f1": 0.5492662787437439,
"num_tokens": 1814628.0,
"repeat_count": 0.0,
- "routers_loss": 0.031727343797683716,
+ "routers_loss": 0.03088250942528248,
"skip_count": 2.0,
"step": 1124,
"text_loss": 0.37100958824157715
@@ -10695,13 +10695,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.000990673623748111,
- "loss": 0.0351,
+ "loss": 0.0348,
"macro_f1": 0.32098767161369324,
"num_tokens": 1817205.0,
"repeat_count": 0.0,
- "routers_loss": 0.06140992045402527,
+ "routers_loss": 0.05495348572731018,
"skip_count": 1.0,
"step": 1126,
"text_loss": 0.20241330564022064
@@ -10709,18 +10709,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
- "avg_layers": 25.0,
+ "avg_layers": 26.0,
"epoch": 5.295861461696507,
- "f1_execute": 0.9411764740943909,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.4000000059604645,
- "grad_norm": 0.09814453125,
+ "f1_skip": 0.5,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0009906140271864173,
- "loss": 0.0436,
- "macro_f1": 0.44705885648727417,
+ "loss": 0.0433,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 1820141.0,
"repeat_count": 0.0,
- "routers_loss": 0.03872275352478027,
+ "routers_loss": 0.037809282541275024,
"skip_count": 2.0,
"step": 1128,
"text_loss": 0.32965806126594543
@@ -10728,18 +10728,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 5.305253889051952,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09228515625,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009905542426205032,
- "loss": 0.0353,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0348,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 1824011.0,
"repeat_count": 0.0,
- "routers_loss": 0.031013142317533493,
+ "routers_loss": 0.03320181369781494,
"skip_count": 1.0,
"step": 1130,
"text_loss": 0.36329755187034607
@@ -10752,13 +10752,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009904942700732777,
- "loss": 0.0333,
+ "loss": 0.0335,
"macro_f1": 0.3333333432674408,
"num_tokens": 1826873.0,
"repeat_count": 0.0,
- "routers_loss": 0.004357635974884033,
+ "routers_loss": 0.004102326463907957,
"skip_count": 0.0,
"step": 1132,
"text_loss": 0.6692602038383484
@@ -10771,13 +10771,13 @@
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11279296875,
+ "grad_norm": 0.08544921875,
"learning_rate": 0.0009904341095677226,
"loss": 0.03,
"macro_f1": 0.29333335161209106,
"num_tokens": 1830103.0,
"repeat_count": 2.0,
- "routers_loss": 0.2376353144645691,
+ "routers_loss": 0.2376193106174469,
"skip_count": 4.0,
"step": 1134,
"text_loss": 0.19212862849235535
@@ -10790,13 +10790,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10888671875,
+ "grad_norm": 0.119140625,
"learning_rate": 0.0009903737611268919,
- "loss": 0.0446,
+ "loss": 0.0445,
"macro_f1": 0.3333333432674408,
"num_tokens": 1833201.0,
"repeat_count": 0.0,
- "routers_loss": 0.004978097043931484,
+ "routers_loss": 0.005253395065665245,
"skip_count": 0.0,
"step": 1136,
"text_loss": 0.6773360371589661
@@ -10809,13 +10809,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009903132247739107,
- "loss": 0.0309,
+ "loss": 0.0305,
"macro_f1": 0.3076923191547394,
"num_tokens": 1836045.0,
"repeat_count": 1.0,
- "routers_loss": 0.14195409417152405,
+ "routers_loss": 0.14382585883140564,
"skip_count": 3.0,
"step": 1138,
"text_loss": 0.2882297933101654
@@ -10828,13 +10828,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.15234375,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009902525005319766,
- "loss": 0.0403,
+ "loss": 0.04,
"macro_f1": 0.5427350401878357,
"num_tokens": 1839721.0,
"repeat_count": 1.0,
- "routers_loss": 0.04005253314971924,
+ "routers_loss": 0.04033960774540901,
"skip_count": 2.0,
"step": 1140,
"text_loss": 0.7172559499740601
@@ -10847,13 +10847,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.12109375,
"learning_rate": 0.0009901915884243597,
- "loss": 0.0353,
+ "loss": 0.0351,
"macro_f1": 0.6666666865348816,
"num_tokens": 1842614.0,
"repeat_count": 1.0,
- "routers_loss": 0.006839688867330551,
+ "routers_loss": 0.005162308923900127,
"skip_count": 0.0,
"step": 1142,
"text_loss": 0.42892804741859436
@@ -10866,13 +10866,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.0009901304884744014,
- "loss": 0.0396,
+ "loss": 0.0386,
"macro_f1": 0.3144654333591461,
"num_tokens": 1845444.0,
"repeat_count": 1.0,
- "routers_loss": 0.10174567997455597,
+ "routers_loss": 0.10117656737565994,
"skip_count": 2.0,
"step": 1144,
"text_loss": 0.20806430280208588
@@ -10885,13 +10885,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009900692007055152,
- "loss": 0.0365,
+ "loss": 0.0357,
"macro_f1": 0.3333333432674408,
"num_tokens": 1848558.0,
"repeat_count": 0.0,
- "routers_loss": 0.014655748382210732,
+ "routers_loss": 0.014107038266956806,
"skip_count": 0.0,
"step": 1146,
"text_loss": 0.5355974435806274
@@ -10904,13 +10904,13 @@
"f1_execute": 0.9166666865348816,
"f1_repeat": 0.4000000059604645,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.158203125,
+ "grad_norm": 0.16015625,
"learning_rate": 0.000990007725141187,
- "loss": 0.0467,
+ "loss": 0.0449,
"macro_f1": 0.6611111164093018,
"num_tokens": 1852723.0,
"repeat_count": 4.0,
- "routers_loss": 0.16960746049880981,
+ "routers_loss": 0.15537866950035095,
"skip_count": 2.0,
"step": 1148,
"text_loss": 0.6388513445854187
@@ -10923,32 +10923,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1220703125,
+ "grad_norm": 0.1181640625,
"learning_rate": 0.0009899460618049741,
- "loss": 0.0399,
+ "loss": 0.0397,
"macro_f1": 0.3333333432674408,
"num_tokens": 1856181.0,
"repeat_count": 0.0,
- "routers_loss": 0.011591178365051746,
+ "routers_loss": 0.011800912208855152,
"skip_count": 0.0,
"step": 1150,
"text_loss": 0.6113069653511047
},
{
- "acc_repeat": 0.5,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 30.0,
"epoch": 5.408570589961843,
- "f1_execute": 0.9811320900917053,
- "f1_repeat": 0.6666666865348816,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09912109375,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.000989884210720506,
- "loss": 0.0332,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0331,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 1859685.0,
"repeat_count": 2.0,
- "routers_loss": 0.04036068916320801,
+ "routers_loss": 0.022900646552443504,
"skip_count": 0.0,
"step": 1152,
"text_loss": 0.25718021392822266
@@ -10961,13 +10961,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009898221719114844,
- "loss": 0.0366,
+ "loss": 0.0354,
"macro_f1": 0.3272727429866791,
"num_tokens": 1862505.0,
"repeat_count": 0.0,
- "routers_loss": 0.030165785923600197,
+ "routers_loss": 0.026814989745616913,
"skip_count": 1.0,
"step": 1154,
"text_loss": 0.5426549911499023
@@ -10980,13 +10980,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009897599454016823,
- "loss": 0.0421,
+ "loss": 0.0401,
"macro_f1": 0.3333333432674408,
"num_tokens": 1866266.0,
"repeat_count": 0.0,
- "routers_loss": 0.003615695284679532,
+ "routers_loss": 0.0032623792067170143,
"skip_count": 0.0,
"step": 1156,
"text_loss": 0.37752896547317505
@@ -10999,13 +10999,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.07080078125,
"learning_rate": 0.0009896975312149454,
- "loss": 0.0377,
+ "loss": 0.0369,
"macro_f1": 0.3333333432674408,
"num_tokens": 1870216.0,
"repeat_count": 0.0,
- "routers_loss": 0.01679840311408043,
+ "routers_loss": 0.015617577359080315,
"skip_count": 0.0,
"step": 1158,
"text_loss": 0.18207129836082458
@@ -11018,13 +11018,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009896349293751906,
- "loss": 0.0422,
+ "loss": 0.0423,
"macro_f1": 0.3272727429866791,
"num_tokens": 1873338.0,
"repeat_count": 0.0,
- "routers_loss": 0.024936161935329437,
+ "routers_loss": 0.02250153198838234,
"skip_count": 1.0,
"step": 1160,
"text_loss": 0.548884391784668
@@ -11037,13 +11037,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009895721399064072,
- "loss": 0.0407,
+ "loss": 0.0388,
"macro_f1": 0.32098764181137085,
"num_tokens": 1876470.0,
"repeat_count": 1.0,
- "routers_loss": 0.06472968310117722,
+ "routers_loss": 0.055204521864652634,
"skip_count": 1.0,
"step": 1162,
"text_loss": 0.48052409291267395
@@ -11056,13 +11056,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009895091628326564,
- "loss": 0.031,
+ "loss": 0.0293,
"macro_f1": 0.3333333432674408,
"num_tokens": 1879354.0,
"repeat_count": 0.0,
- "routers_loss": 0.009633494541049004,
+ "routers_loss": 0.009093789383769035,
"skip_count": 0.0,
"step": 1164,
"text_loss": 0.3908069431781769
@@ -11075,13 +11075,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.140625,
"learning_rate": 0.000989445998178071,
"loss": 0.0323,
"macro_f1": 0.3272727429866791,
"num_tokens": 1881941.0,
"repeat_count": 0.0,
- "routers_loss": 0.01458993274718523,
+ "routers_loss": 0.015086972154676914,
"skip_count": 1.0,
"step": 1166,
"text_loss": 0.4884725511074066
@@ -11094,13 +11094,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009893826459668558,
- "loss": 0.0389,
+ "loss": 0.0386,
"macro_f1": 0.3144654333591461,
"num_tokens": 1885374.0,
"repeat_count": 0.0,
- "routers_loss": 0.06636982411146164,
+ "routers_loss": 0.06587666273117065,
"skip_count": 3.0,
"step": 1168,
"text_loss": 0.12760137021541595
@@ -11113,13 +11113,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0009893191062232873,
- "loss": 0.0325,
+ "loss": 0.0322,
"macro_f1": 0.3333333432674408,
"num_tokens": 1888612.0,
"repeat_count": 0.0,
- "routers_loss": 0.005644182674586773,
+ "routers_loss": 0.006088624242693186,
"skip_count": 0.0,
"step": 1170,
"text_loss": 0.4821319580078125
@@ -11132,13 +11132,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.0009892553789717143,
- "loss": 0.0402,
+ "loss": 0.0389,
"macro_f1": 0.3333333432674408,
"num_tokens": 1891463.0,
"repeat_count": 0.0,
- "routers_loss": 0.010273848660290241,
+ "routers_loss": 0.010113578289747238,
"skip_count": 0.0,
"step": 1172,
"text_loss": 0.3613642454147339
@@ -11151,13 +11151,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009891914642365573,
- "loss": 0.0415,
+ "loss": 0.0404,
"macro_f1": 0.3333333432674408,
"num_tokens": 1894230.0,
"repeat_count": 0.0,
- "routers_loss": 0.004529652185738087,
+ "routers_loss": 0.004947459790855646,
"skip_count": 0.0,
"step": 1174,
"text_loss": 0.5037549138069153
@@ -11170,13 +11170,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.0009891273620423083,
- "loss": 0.045,
+ "loss": 0.0428,
"macro_f1": 0.3272727429866791,
"num_tokens": 1897294.0,
"repeat_count": 1.0,
- "routers_loss": 0.024671228602528572,
+ "routers_loss": 0.026075217872858047,
"skip_count": 0.0,
"step": 1176,
"text_loss": 0.32558977603912354
@@ -11189,13 +11189,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009890630724135314,
- "loss": 0.0354,
+ "loss": 0.0351,
"macro_f1": 0.3272727429866791,
"num_tokens": 1901553.0,
"repeat_count": 0.0,
- "routers_loss": 0.06466450542211533,
+ "routers_loss": 0.06650999188423157,
"skip_count": 1.0,
"step": 1178,
"text_loss": 0.23473620414733887
@@ -11208,13 +11208,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1767578125,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009889985953748625,
- "loss": 0.0278,
+ "loss": 0.0268,
"macro_f1": 0.6666666865348816,
"num_tokens": 1904556.0,
"repeat_count": 0.0,
- "routers_loss": 0.010566026903688908,
+ "routers_loss": 0.010361116379499435,
"skip_count": 1.0,
"step": 1180,
"text_loss": 0.6927042007446289
@@ -11227,13 +11227,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.103515625,
"learning_rate": 0.0009889339309510094,
- "loss": 0.037,
+ "loss": 0.0351,
"macro_f1": 0.3333333432674408,
"num_tokens": 1908053.0,
"repeat_count": 0.0,
- "routers_loss": 0.013842248357832432,
+ "routers_loss": 0.013286533765494823,
"skip_count": 0.0,
"step": 1182,
"text_loss": 0.19977325201034546
@@ -11246,13 +11246,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.5,
- "grad_norm": 0.07373046875,
+ "grad_norm": 0.058837890625,
"learning_rate": 0.0009888690791667518,
- "loss": 0.0215,
+ "loss": 0.0204,
"macro_f1": 0.7018141150474548,
"num_tokens": 1911754.0,
"repeat_count": 2.0,
- "routers_loss": 0.122759610414505,
+ "routers_loss": 0.11920545995235443,
"skip_count": 3.0,
"step": 1184,
"text_loss": 0.4072858691215515
@@ -11265,32 +11265,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009888040400469408,
- "loss": 0.0402,
+ "loss": 0.0391,
"macro_f1": 0.3272727429866791,
"num_tokens": 1914862.0,
"repeat_count": 0.0,
- "routers_loss": 0.035315629094839096,
+ "routers_loss": 0.03652849420905113,
"skip_count": 1.0,
"step": 1186,
"text_loss": 0.2654043138027191
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.577634282359847,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.0009887388136164996,
- "loss": 0.034,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0336,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1918542.0,
"repeat_count": 0.0,
- "routers_loss": 0.040048226714134216,
+ "routers_loss": 0.03991910070180893,
"skip_count": 2.0,
"step": 1188,
"text_loss": 0.21130657196044922
@@ -11298,18 +11298,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 5.587026709715292,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.09521484375,
"learning_rate": 0.000988673399900423,
- "loss": 0.044,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0429,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1921589.0,
"repeat_count": 0.0,
- "routers_loss": 0.012814820744097233,
+ "routers_loss": 0.014900135807693005,
"skip_count": 0.0,
"step": 1190,
"text_loss": 0.5519335865974426
@@ -11322,13 +11322,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009886077989237777,
- "loss": 0.0407,
+ "loss": 0.0405,
"macro_f1": 0.3272727429866791,
"num_tokens": 1924320.0,
"repeat_count": 0.0,
- "routers_loss": 0.05977959558367729,
+ "routers_loss": 0.06271552294492722,
"skip_count": 1.0,
"step": 1192,
"text_loss": 0.213813915848732
@@ -11341,13 +11341,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.0,
"f1_skip": 0.888888955116272,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.1875,
"learning_rate": 0.000988542010711702,
- "loss": 0.0334,
+ "loss": 0.0342,
"macro_f1": 0.6225374937057495,
"num_tokens": 1927178.0,
"repeat_count": 0.0,
- "routers_loss": 0.031448643654584885,
+ "routers_loss": 0.03081391751766205,
"skip_count": 5.0,
"step": 1194,
"text_loss": 0.7524349093437195
@@ -11360,13 +11360,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.265625,
+ "grad_norm": 0.255859375,
"learning_rate": 0.0009884760352894064,
- "loss": 0.0523,
+ "loss": 0.0518,
"macro_f1": 0.3333333432674408,
"num_tokens": 1930216.0,
"repeat_count": 0.0,
- "routers_loss": 0.008164947852492332,
+ "routers_loss": 0.008556773886084557,
"skip_count": 0.0,
"step": 1196,
"text_loss": 0.28230375051498413
@@ -11379,32 +11379,32 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.5,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.1064453125,
"learning_rate": 0.0009884098726821726,
- "loss": 0.0478,
+ "loss": 0.0472,
"macro_f1": 0.4871794879436493,
"num_tokens": 1933312.0,
"repeat_count": 3.0,
- "routers_loss": 0.04045635461807251,
+ "routers_loss": 0.05344727262854576,
"skip_count": 0.0,
"step": 1198,
"text_loss": 0.5509607195854187
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6666666865348816,
- "avg_layers": 26.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 5.633988846492516,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
- "f1_skip": 0.800000011920929,
- "grad_norm": 0.1240234375,
+ "f1_skip": 0.5,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.000988343522915354,
- "loss": 0.0447,
- "macro_f1": 0.5866667032241821,
+ "loss": 0.0441,
+ "macro_f1": 0.480392187833786,
"num_tokens": 1936160.0,
"repeat_count": 1.0,
- "routers_loss": 0.06872973591089249,
+ "routers_loss": 0.07324771583080292,
"skip_count": 3.0,
"step": 1200,
"text_loss": 0.30565372109413147
@@ -11412,18 +11412,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
- "avg_layers": 24.0,
+ "avg_layers": 25.0,
"epoch": 5.64338127384796,
- "f1_execute": 0.8695651888847351,
+ "f1_execute": 0.8936169743537903,
"f1_repeat": 0.0,
- "f1_skip": 0.4000000059604645,
- "grad_norm": 0.25390625,
+ "f1_skip": 0.444444477558136,
+ "grad_norm": 0.2470703125,
"learning_rate": 0.0009882769860143764,
- "loss": 0.0331,
- "macro_f1": 0.4231884181499481,
+ "loss": 0.0317,
+ "macro_f1": 0.4460204839706421,
"num_tokens": 1939266.0,
"repeat_count": 0.0,
- "routers_loss": 0.20964151620864868,
+ "routers_loss": 0.18620699644088745,
"skip_count": 6.0,
"step": 1202,
"text_loss": 0.976121723651886
@@ -11442,26 +11442,26 @@
"macro_f1": 0.6666666865348816,
"num_tokens": 1942173.0,
"repeat_count": 0.0,
- "routers_loss": 0.00690250750631094,
+ "routers_loss": 0.007703613489866257,
"skip_count": 1.0,
"step": 1204,
"text_loss": 0.5647401809692383
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.66216612855885,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009881433509120036,
- "loss": 0.0372,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0376,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1945071.0,
"repeat_count": 0.0,
- "routers_loss": 0.022315658628940582,
+ "routers_loss": 0.02162683941423893,
"skip_count": 2.0,
"step": 1206,
"text_loss": 0.24229218065738678
@@ -11474,13 +11474,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1083984375,
+ "grad_norm": 0.0966796875,
"learning_rate": 0.0009880762527618176,
- "loss": 0.0388,
+ "loss": 0.0383,
"macro_f1": 0.3333333432674408,
"num_tokens": 1949060.0,
"repeat_count": 0.0,
- "routers_loss": 0.017015069723129272,
+ "routers_loss": 0.017667081207036972,
"skip_count": 0.0,
"step": 1208,
"text_loss": 0.4035970866680145
@@ -11493,13 +11493,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009880089675798908,
- "loss": 0.0372,
+ "loss": 0.0367,
"macro_f1": 0.3333333432674408,
"num_tokens": 1951698.0,
"repeat_count": 0.0,
- "routers_loss": 0.006532609928399324,
+ "routers_loss": 0.006405784282833338,
"skip_count": 0.0,
"step": 1210,
"text_loss": 0.5319879055023193
@@ -11512,13 +11512,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009879414953920071,
- "loss": 0.0301,
+ "loss": 0.0294,
"macro_f1": 0.3333333432674408,
"num_tokens": 1955266.0,
"repeat_count": 0.0,
- "routers_loss": 0.009720963425934315,
+ "routers_loss": 0.009859707206487656,
"skip_count": 0.0,
"step": 1212,
"text_loss": 0.6687407493591309
@@ -11531,32 +11531,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009878738362240219,
- "loss": 0.046,
+ "loss": 0.045,
"macro_f1": 0.5492662787437439,
"num_tokens": 1958538.0,
"repeat_count": 0.0,
- "routers_loss": 0.03176085278391838,
+ "routers_loss": 0.030890554189682007,
"skip_count": 2.0,
"step": 1214,
"text_loss": 0.20820017158985138
},
{
"acc_repeat": 0.5,
- "acc_skip": 0.5,
- "avg_layers": 29.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
"epoch": 5.709128265336073,
- "f1_execute": 0.9387754797935486,
+ "f1_execute": 0.9200000166893005,
"f1_repeat": 0.5,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.2021484375,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.000987805990101862,
- "loss": 0.0323,
- "macro_f1": 0.7018141150474548,
+ "loss": 0.0317,
+ "macro_f1": 0.47333335876464844,
"num_tokens": 1961419.0,
"repeat_count": 2.0,
- "routers_loss": 0.08626245707273483,
+ "routers_loss": 0.10383198410272598,
"skip_count": 2.0,
"step": 1216,
"text_loss": 0.8664976358413696
@@ -11569,13 +11569,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009877379570515268,
- "loss": 0.0374,
+ "loss": 0.0366,
"macro_f1": 0.3333333432674408,
"num_tokens": 1964836.0,
"repeat_count": 0.0,
- "routers_loss": 0.012099343352019787,
+ "routers_loss": 0.013376163318753242,
"skip_count": 0.0,
"step": 1218,
"text_loss": 0.4223395884037018
@@ -11588,13 +11588,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.0859375,
"learning_rate": 0.0009876697370990865,
- "loss": 0.0342,
+ "loss": 0.0343,
"macro_f1": 0.3333333432674408,
"num_tokens": 1967620.0,
"repeat_count": 0.0,
- "routers_loss": 0.007713846862316132,
+ "routers_loss": 0.008577900938689709,
"skip_count": 0.0,
"step": 1220,
"text_loss": 0.4789901375770569
@@ -11607,13 +11607,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009876013302706828,
- "loss": 0.0499,
+ "loss": 0.049,
"macro_f1": 0.3333333432674408,
"num_tokens": 1971100.0,
"repeat_count": 0.0,
- "routers_loss": 0.004629489034414291,
+ "routers_loss": 0.004730266984552145,
"skip_count": 0.0,
"step": 1222,
"text_loss": 0.6799837946891785
@@ -11626,13 +11626,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009875327365925295,
- "loss": 0.035,
+ "loss": 0.0341,
"macro_f1": 0.3333333432674408,
"num_tokens": 1974408.0,
"repeat_count": 0.0,
- "routers_loss": 0.010654795914888382,
+ "routers_loss": 0.010849526152014732,
"skip_count": 0.0,
"step": 1224,
"text_loss": 0.18967926502227783
@@ -11640,18 +11640,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 5.756090402113296,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009874639560909118,
- "loss": 0.0516,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.0498,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 1977046.0,
"repeat_count": 0.0,
- "routers_loss": 0.05963074415922165,
+ "routers_loss": 0.04841252416372299,
"skip_count": 1.0,
"step": 1226,
"text_loss": 0.6133310198783875
@@ -11664,13 +11664,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1328125,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.0009873949887921867,
- "loss": 0.04,
+ "loss": 0.0402,
"macro_f1": 0.3272727429866791,
"num_tokens": 1980330.0,
"repeat_count": 0.0,
- "routers_loss": 0.028920643031597137,
+ "routers_loss": 0.029638588428497314,
"skip_count": 1.0,
"step": 1228,
"text_loss": 0.15649555623531342
@@ -11678,18 +11678,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 5.774875256824186,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10595703125,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009873258347227823,
- "loss": 0.0327,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0331,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1983173.0,
"repeat_count": 0.0,
- "routers_loss": 0.006852717138826847,
+ "routers_loss": 0.009955910965800285,
"skip_count": 0.0,
"step": 1230,
"text_loss": 0.4741005599498749
@@ -11702,13 +11702,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009872564939091989,
- "loss": 0.0346,
+ "loss": 0.0342,
"macro_f1": 0.3333333432674408,
"num_tokens": 1986825.0,
"repeat_count": 0.0,
- "routers_loss": 0.010968753136694431,
+ "routers_loss": 0.010205300524830818,
"skip_count": 0.0,
"step": 1232,
"text_loss": 0.5315462350845337
@@ -11721,13 +11721,13 @@
"f1_execute": 0.9302325248718262,
"f1_repeat": 1.0,
"f1_skip": 0.7272727489471436,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.11865234375,
"learning_rate": 0.0009871869663780077,
- "loss": 0.0344,
+ "loss": 0.0336,
"macro_f1": 0.8858351111412048,
"num_tokens": 1990448.0,
"repeat_count": 1.0,
- "routers_loss": 0.0906950980424881,
+ "routers_loss": 0.09120134264230728,
"skip_count": 7.0,
"step": 1234,
"text_loss": 0.6187508702278137
@@ -11740,13 +11740,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.125,
"learning_rate": 0.0009871172521558522,
- "loss": 0.0484,
+ "loss": 0.0475,
"macro_f1": 0.6666666865348816,
"num_tokens": 1993474.0,
"repeat_count": 0.0,
- "routers_loss": 0.016306072473526,
+ "routers_loss": 0.016188839450478554,
"skip_count": 1.0,
"step": 1236,
"text_loss": 0.20783066749572754
@@ -11759,13 +11759,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.208984375,
+ "grad_norm": 0.216796875,
"learning_rate": 0.0009870473512694465,
- "loss": 0.038,
+ "loss": 0.0373,
"macro_f1": 0.5934640765190125,
"num_tokens": 1996536.0,
"repeat_count": 0.0,
- "routers_loss": 0.05804471671581268,
+ "routers_loss": 0.05046704784035683,
"skip_count": 3.0,
"step": 1238,
"text_loss": 0.247748002409935
@@ -11773,18 +11773,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 5.821837393601409,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.091796875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.0009869772637455772,
- "loss": 0.0256,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0251,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 1999530.0,
"repeat_count": 0.0,
- "routers_loss": 0.045395996421575546,
+ "routers_loss": 0.044926248490810394,
"skip_count": 2.0,
"step": 1240,
"text_loss": 0.26001980900764465
@@ -11797,13 +11797,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11767578125,
+ "grad_norm": 0.1513671875,
"learning_rate": 0.000986906989611102,
- "loss": 0.0438,
+ "loss": 0.0446,
"macro_f1": 0.3272727429866791,
"num_tokens": 2002782.0,
"repeat_count": 0.0,
- "routers_loss": 0.020834850147366524,
+ "routers_loss": 0.025911526754498482,
"skip_count": 0.0,
"step": 1242,
"text_loss": 0.9009982943534851
@@ -11816,13 +11816,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009868365288929492,
- "loss": 0.0377,
+ "loss": 0.0371,
"macro_f1": 0.3333333432674408,
"num_tokens": 2005331.0,
"repeat_count": 0.0,
- "routers_loss": 0.005241698585450649,
+ "routers_loss": 0.0043760035187006,
"skip_count": 0.0,
"step": 1244,
"text_loss": 0.5547386407852173
@@ -11835,13 +11835,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.0009867658816181206,
- "loss": 0.038,
+ "loss": 0.0374,
"macro_f1": 0.3333333432674408,
"num_tokens": 2008115.0,
"repeat_count": 0.0,
- "routers_loss": 0.008387803100049496,
+ "routers_loss": 0.009227181784808636,
"skip_count": 0.0,
"step": 1246,
"text_loss": 1.0067731142044067
@@ -11854,13 +11854,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.126953125,
"learning_rate": 0.000986695047813688,
- "loss": 0.0256,
+ "loss": 0.0261,
"macro_f1": 0.3272727429866791,
"num_tokens": 2011137.0,
"repeat_count": 1.0,
- "routers_loss": 0.02261745184659958,
+ "routers_loss": 0.023822437971830368,
"skip_count": 0.0,
"step": 1248,
"text_loss": 0.30058956146240234
@@ -11873,32 +11873,32 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009866240275067948,
- "loss": 0.0435,
+ "loss": 0.044,
"macro_f1": 0.47333335876464844,
"num_tokens": 2014159.0,
"repeat_count": 2.0,
- "routers_loss": 0.21678555011749268,
+ "routers_loss": 0.21523773670196533,
"skip_count": 3.0,
"step": 1250,
"text_loss": 0.39072203636169434
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.878191957734077,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009865528207246563,
- "loss": 0.0358,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0351,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 2017731.0,
"repeat_count": 0.0,
- "routers_loss": 0.06554054468870163,
+ "routers_loss": 0.06184682995080948,
"skip_count": 2.0,
"step": 1252,
"text_loss": 0.35751575231552124
@@ -11911,13 +11911,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.166015625,
"learning_rate": 0.000986481427494559,
- "loss": 0.0337,
+ "loss": 0.0336,
"macro_f1": 0.3333333432674408,
"num_tokens": 2020485.0,
"repeat_count": 0.0,
- "routers_loss": 0.007237187586724758,
+ "routers_loss": 0.007573372684419155,
"skip_count": 0.0,
"step": 1254,
"text_loss": 0.4061077833175659
@@ -11930,13 +11930,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1845703125,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.000986409847843861,
- "loss": 0.0387,
+ "loss": 0.0382,
"macro_f1": 0.3272727429866791,
"num_tokens": 2024149.0,
"repeat_count": 1.0,
- "routers_loss": 0.08003793656826019,
+ "routers_loss": 0.07447971403598785,
"skip_count": 0.0,
"step": 1256,
"text_loss": 0.41876497864723206
@@ -11949,13 +11949,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000986338081799992,
- "loss": 0.0341,
+ "loss": 0.0351,
"macro_f1": 0.3333333432674408,
"num_tokens": 2026545.0,
"repeat_count": 0.0,
- "routers_loss": 0.006424390245229006,
+ "routers_loss": 0.006609147880226374,
"skip_count": 0.0,
"step": 1258,
"text_loss": 0.4673794209957123
@@ -11968,13 +11968,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009862661293904523,
- "loss": 0.0482,
+ "loss": 0.0498,
"macro_f1": 0.32098764181137085,
"num_tokens": 2029581.0,
"repeat_count": 0.0,
- "routers_loss": 0.10797854512929916,
+ "routers_loss": 0.10624702274799347,
"skip_count": 2.0,
"step": 1260,
"text_loss": 0.3483233153820038
@@ -11987,13 +11987,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.111328125,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009861939906428145,
- "loss": 0.053,
+ "loss": 0.0525,
"macro_f1": 0.3333333432674408,
"num_tokens": 2033936.0,
"repeat_count": 0.0,
- "routers_loss": 0.006734046153724194,
+ "routers_loss": 0.007944886572659016,
"skip_count": 0.0,
"step": 1262,
"text_loss": 0.16362667083740234
@@ -12006,13 +12006,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009861216655847225,
- "loss": 0.0373,
+ "loss": 0.0376,
"macro_f1": 0.6666666865348816,
"num_tokens": 2037876.0,
"repeat_count": 1.0,
- "routers_loss": 0.00564212491735816,
+ "routers_loss": 0.007004092447459698,
"skip_count": 0.0,
"step": 1264,
"text_loss": 0.43228110671043396
@@ -12025,13 +12025,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1044921875,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.0009860491542438912,
- "loss": 0.0472,
+ "loss": 0.047,
"macro_f1": 0.3272727429866791,
"num_tokens": 2040842.0,
"repeat_count": 0.0,
- "routers_loss": 0.026137735694646835,
+ "routers_loss": 0.026916226372122765,
"skip_count": 1.0,
"step": 1266,
"text_loss": 0.5901188850402832
@@ -12044,13 +12044,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.000985976456648107,
- "loss": 0.0343,
+ "loss": 0.0353,
"macro_f1": 0.3333333432674408,
"num_tokens": 2043890.0,
"repeat_count": 0.0,
- "routers_loss": 0.0069669694639742374,
+ "routers_loss": 0.007325216196477413,
"skip_count": 0.0,
"step": 1268,
"text_loss": 0.8780109882354736
@@ -12063,13 +12063,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.000985903572825228,
- "loss": 0.0323,
+ "loss": 0.0306,
"macro_f1": 0.4871794879436493,
"num_tokens": 2048848.0,
"repeat_count": 0.0,
- "routers_loss": 0.05618409812450409,
+ "routers_loss": 0.05007527023553848,
"skip_count": 2.0,
"step": 1270,
"text_loss": 0.5863722562789917
@@ -12084,11 +12084,11 @@
"f1_skip": 0.0,
"grad_norm": 0.173828125,
"learning_rate": 0.000985830502803183,
- "loss": 0.0391,
+ "loss": 0.0396,
"macro_f1": 0.3272727429866791,
"num_tokens": 2051561.0,
"repeat_count": 0.0,
- "routers_loss": 0.025900620967149734,
+ "routers_loss": 0.023995524272322655,
"skip_count": 0.0,
"step": 1272,
"text_loss": 0.7460709810256958
@@ -12101,13 +12101,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.0009857572466099732,
- "loss": 0.0426,
+ "loss": 0.0431,
"macro_f1": 0.3333333432674408,
"num_tokens": 2054752.0,
"repeat_count": 0.0,
- "routers_loss": 0.006236737594008446,
+ "routers_loss": 0.006928362417966127,
"skip_count": 0.0,
"step": 1274,
"text_loss": 0.5130293369293213
@@ -12120,13 +12120,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.171875,
+ "grad_norm": 0.162109375,
"learning_rate": 0.0009856838042736698,
- "loss": 0.0503,
+ "loss": 0.0501,
"macro_f1": 0.3333333432674408,
"num_tokens": 2058151.0,
"repeat_count": 0.0,
- "routers_loss": 0.006367063149809837,
+ "routers_loss": 0.006969396956264973,
"skip_count": 0.0,
"step": 1276,
"text_loss": 0.5911393761634827
@@ -12139,13 +12139,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.0009856101758224166,
- "loss": 0.0442,
+ "loss": 0.0441,
"macro_f1": 0.3333333432674408,
"num_tokens": 2061012.0,
"repeat_count": 0.0,
- "routers_loss": 0.003392914542928338,
+ "routers_loss": 0.003499418031424284,
"skip_count": 0.0,
"step": 1278,
"text_loss": 0.25347545742988586
@@ -12158,13 +12158,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.000985536361284428,
- "loss": 0.0231,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2064597.0,
"repeat_count": 0.0,
- "routers_loss": 0.007376343477517366,
+ "routers_loss": 0.007856054231524467,
"skip_count": 0.0,
"step": 1280,
"text_loss": 0.7476963400840759
@@ -12177,13 +12177,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009854623606879898,
- "loss": 0.0243,
+ "loss": 0.0245,
"macro_f1": 0.3272727429866791,
"num_tokens": 2067972.0,
"repeat_count": 0.0,
- "routers_loss": 0.02773376554250717,
+ "routers_loss": 0.02617792971432209,
"skip_count": 1.0,
"step": 1282,
"text_loss": 0.5775872468948364
@@ -12196,13 +12196,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.000985388174061459,
- "loss": 0.0363,
+ "loss": 0.0356,
"macro_f1": 0.32098767161369324,
"num_tokens": 2071812.0,
"repeat_count": 0.0,
- "routers_loss": 0.03535797819495201,
+ "routers_loss": 0.035979997366666794,
"skip_count": 1.0,
"step": 1284,
"text_loss": 0.2933400869369507
@@ -12215,13 +12215,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08447265625,
"learning_rate": 0.0009853138014332646,
- "loss": 0.0269,
+ "loss": 0.0273,
"macro_f1": 0.3333333432674408,
"num_tokens": 2074868.0,
"repeat_count": 0.0,
- "routers_loss": 0.004910993855446577,
+ "routers_loss": 0.005142854526638985,
"skip_count": 0.0,
"step": 1286,
"text_loss": 0.29085102677345276
@@ -12234,13 +12234,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.0009852392428319058,
- "loss": 0.0301,
+ "loss": 0.0306,
"macro_f1": 0.3333333432674408,
"num_tokens": 2078225.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032444109674543142,
+ "routers_loss": 0.0032799106556922197,
"skip_count": 0.0,
"step": 1288,
"text_loss": 0.7293626070022583
@@ -12253,13 +12253,13 @@
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009851644982859537,
- "loss": 0.0272,
+ "loss": 0.0273,
"macro_f1": 0.480392187833786,
"num_tokens": 2081495.0,
"repeat_count": 1.0,
- "routers_loss": 0.12451831251382828,
+ "routers_loss": 0.12224318832159042,
"skip_count": 3.0,
"step": 1290,
"text_loss": 0.26125892996788025
@@ -12272,13 +12272,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009850895678240508,
- "loss": 0.0289,
+ "loss": 0.0283,
"macro_f1": 0.6666666865348816,
"num_tokens": 2084390.0,
"repeat_count": 1.0,
- "routers_loss": 0.011074979789555073,
+ "routers_loss": 0.010662888176739216,
"skip_count": 0.0,
"step": 1292,
"text_loss": 0.3510764539241791
@@ -12291,13 +12291,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.0009850144514749104,
- "loss": 0.0336,
+ "loss": 0.0332,
"macro_f1": 0.5492662787437439,
"num_tokens": 2087210.0,
"repeat_count": 0.0,
- "routers_loss": 0.01774786226451397,
+ "routers_loss": 0.01979079470038414,
"skip_count": 2.0,
"step": 1294,
"text_loss": 0.40202176570892334
@@ -12310,13 +12310,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.000984939149267317,
- "loss": 0.0251,
+ "loss": 0.0253,
"macro_f1": 0.6666666865348816,
"num_tokens": 2090777.0,
"repeat_count": 0.0,
- "routers_loss": 0.0052874404937028885,
+ "routers_loss": 0.005172552540898323,
"skip_count": 1.0,
"step": 1296,
"text_loss": 0.5275651216506958
@@ -12329,13 +12329,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10107421875,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009848636612301272,
- "loss": 0.031,
+ "loss": 0.0299,
"macro_f1": 0.3333333432674408,
"num_tokens": 2094248.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034106262028217316,
+ "routers_loss": 0.0029599082190543413,
"skip_count": 0.0,
"step": 1298,
"text_loss": 0.4517653286457062
@@ -12348,13 +12348,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2177734375,
+ "grad_norm": 0.23046875,
"learning_rate": 0.0009847879873922675,
"loss": 0.0357,
"macro_f1": 0.3333333432674408,
"num_tokens": 2097139.0,
"repeat_count": 0.0,
- "routers_loss": 0.010383229702711105,
+ "routers_loss": 0.011455860920250416,
"skip_count": 0.0,
"step": 1300,
"text_loss": 0.16888445615768433
@@ -12367,13 +12367,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.09619140625,
"learning_rate": 0.0009847121277827366,
- "loss": 0.0304,
+ "loss": 0.0301,
"macro_f1": 0.3333333432674408,
"num_tokens": 2100415.0,
"repeat_count": 0.0,
- "routers_loss": 0.0076674893498420715,
+ "routers_loss": 0.008091195486485958,
"skip_count": 0.0,
"step": 1302,
"text_loss": 0.40061676502227783
@@ -12386,13 +12386,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.109375,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.000984636082430604,
- "loss": 0.0287,
+ "loss": 0.0285,
"macro_f1": 0.3333333432674408,
"num_tokens": 2103285.0,
"repeat_count": 0.0,
- "routers_loss": 0.010486516170203686,
+ "routers_loss": 0.009593960829079151,
"skip_count": 0.0,
"step": 1304,
"text_loss": 0.7211073637008667
@@ -12405,13 +12405,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.107421875,
"learning_rate": 0.0009845598513650103,
- "loss": 0.0237,
+ "loss": 0.0231,
"macro_f1": 0.3333333432674408,
"num_tokens": 2106255.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023783023934811354,
+ "routers_loss": 0.0023068038281053305,
"skip_count": 0.0,
"step": 1306,
"text_loss": 0.7077119946479797
@@ -12424,13 +12424,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009844834346151674,
- "loss": 0.044,
+ "loss": 0.043,
"macro_f1": 0.3333333432674408,
"num_tokens": 2109305.0,
"repeat_count": 0.0,
- "routers_loss": 0.006714595016092062,
+ "routers_loss": 0.007703019306063652,
"skip_count": 0.0,
"step": 1308,
"text_loss": 0.3534316122531891
@@ -12443,13 +12443,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009844068322103585,
- "loss": 0.0281,
+ "loss": 0.0287,
"macro_f1": 0.3272727429866791,
"num_tokens": 2112216.0,
"repeat_count": 0.0,
- "routers_loss": 0.022373953834176064,
+ "routers_loss": 0.023549847304821014,
"skip_count": 1.0,
"step": 1310,
"text_loss": 0.6792599558830261
@@ -12462,13 +12462,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009843300441799378,
- "loss": 0.0205,
+ "loss": 0.0211,
"macro_f1": 0.3333333432674408,
"num_tokens": 2114925.0,
"repeat_count": 0.0,
- "routers_loss": 0.007452849764376879,
+ "routers_loss": 0.007605871185660362,
"skip_count": 0.0,
"step": 1312,
"text_loss": 0.1571389138698578
@@ -12481,13 +12481,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009842530705533304,
- "loss": 0.0251,
+ "loss": 0.0253,
"macro_f1": 0.3272727429866791,
"num_tokens": 2117744.0,
"repeat_count": 0.0,
- "routers_loss": 0.016413308680057526,
+ "routers_loss": 0.014964760281145573,
"skip_count": 0.0,
"step": 1314,
"text_loss": 0.7840361595153809
@@ -12500,13 +12500,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.000984175911360033,
- "loss": 0.0243,
+ "loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 2120848.0,
"repeat_count": 0.0,
- "routers_loss": 0.004676427226513624,
+ "routers_loss": 0.004663798492401838,
"skip_count": 0.0,
"step": 1316,
"text_loss": 0.536246120929718
@@ -12519,13 +12519,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.000984098566629613,
- "loss": 0.0284,
+ "loss": 0.0288,
"macro_f1": 0.5492662787437439,
"num_tokens": 2123651.0,
"repeat_count": 0.0,
- "routers_loss": 0.024454625323414803,
+ "routers_loss": 0.022852955386042595,
"skip_count": 2.0,
"step": 1318,
"text_loss": 0.43372172117233276
@@ -12538,13 +12538,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0009840210363917087,
- "loss": 0.022,
+ "loss": 0.0216,
"macro_f1": 0.3333333432674408,
"num_tokens": 2128011.0,
"repeat_count": 0.0,
- "routers_loss": 0.013495884835720062,
+ "routers_loss": 0.012578422203660011,
"skip_count": 0.0,
"step": 1320,
"text_loss": 0.28190380334854126
@@ -12557,13 +12557,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.0009839433206760306,
- "loss": 0.0213,
+ "loss": 0.0204,
"macro_f1": 0.3333333432674408,
"num_tokens": 2131035.0,
"repeat_count": 0.0,
- "routers_loss": 0.006397814955562353,
+ "routers_loss": 0.006863643880933523,
"skip_count": 0.0,
"step": 1322,
"text_loss": 0.6340444087982178
@@ -12576,13 +12576,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1796875,
"learning_rate": 0.0009838654195123589,
- "loss": 0.0246,
+ "loss": 0.0243,
"macro_f1": 0.3333333432674408,
"num_tokens": 2133856.0,
"repeat_count": 0.0,
- "routers_loss": 0.00503434706479311,
+ "routers_loss": 0.00468854233622551,
"skip_count": 0.0,
"step": 1324,
"text_loss": 0.5138425827026367
@@ -12595,13 +12595,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009837873329305458,
- "loss": 0.0402,
+ "loss": 0.0396,
"macro_f1": 0.6666666865348816,
"num_tokens": 2136451.0,
"repeat_count": 1.0,
- "routers_loss": 0.005150494631379843,
+ "routers_loss": 0.005731126759201288,
"skip_count": 0.0,
"step": 1326,
"text_loss": 0.742124617099762
@@ -12614,13 +12614,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000983709060960514,
- "loss": 0.041,
+ "loss": 0.0416,
"macro_f1": 0.3333333432674408,
"num_tokens": 2139496.0,
"repeat_count": 0.0,
- "routers_loss": 0.004570818971842527,
+ "routers_loss": 0.0056343949399888515,
"skip_count": 0.0,
"step": 1328,
"text_loss": 0.7317464351654053
@@ -12633,13 +12633,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09326171875,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.0009836306036322576,
- "loss": 0.0314,
+ "loss": 0.0312,
"macro_f1": 0.3333333432674408,
"num_tokens": 2143120.0,
"repeat_count": 0.0,
- "routers_loss": 0.005299333017319441,
+ "routers_loss": 0.005127966403961182,
"skip_count": 0.0,
"step": 1330,
"text_loss": 0.538652241230011
@@ -12652,13 +12652,13 @@
"f1_execute": 0.9130434989929199,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.111328125,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009835519609758415,
- "loss": 0.0303,
+ "loss": 0.0301,
"macro_f1": 0.590062141418457,
"num_tokens": 2145807.0,
"repeat_count": 3.0,
- "routers_loss": 0.168672576546669,
+ "routers_loss": 0.1673707216978073,
"skip_count": 4.0,
"step": 1332,
"text_loss": 0.3498198091983795
@@ -12671,32 +12671,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009834731330214017,
- "loss": 0.0302,
+ "loss": 0.0293,
"macro_f1": 0.3272727429866791,
"num_tokens": 2148397.0,
"repeat_count": 1.0,
- "routers_loss": 0.05187409743666649,
+ "routers_loss": 0.04026653990149498,
"skip_count": 0.0,
"step": 1334,
"text_loss": 0.8153424859046936
},
{
"acc_repeat": 1.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 27.0,
"epoch": 6.272380393307896,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.8999999761581421,
"f1_repeat": 0.6666666865348816,
- "f1_skip": 0.9090909361839294,
- "grad_norm": 0.1669921875,
+ "f1_skip": 0.8000000715255737,
+ "grad_norm": 0.16015625,
"learning_rate": 0.0009833941197991455,
- "loss": 0.0339,
- "macro_f1": 0.8329448699951172,
+ "loss": 0.0329,
+ "macro_f1": 0.7888889312744141,
"num_tokens": 2152226.0,
"repeat_count": 2.0,
- "routers_loss": 0.05786697566509247,
+ "routers_loss": 0.05481519177556038,
"skip_count": 5.0,
"step": 1336,
"text_loss": 0.7802760004997253
@@ -12709,13 +12709,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009833149213393506,
- "loss": 0.0315,
+ "loss": 0.0304,
"macro_f1": 0.3272727429866791,
"num_tokens": 2156023.0,
"repeat_count": 0.0,
- "routers_loss": 0.017055779695510864,
+ "routers_loss": 0.01760484278202057,
"skip_count": 0.0,
"step": 1338,
"text_loss": 0.19721226394176483
@@ -12728,13 +12728,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.000983235537672366,
- "loss": 0.0249,
+ "loss": 0.0256,
"macro_f1": 0.3333333432674408,
"num_tokens": 2160037.0,
"repeat_count": 0.0,
- "routers_loss": 0.011614206247031689,
+ "routers_loss": 0.013206037692725658,
"skip_count": 0.0,
"step": 1340,
"text_loss": 0.5003817081451416
@@ -12747,13 +12747,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.000983155968828612,
- "loss": 0.033,
+ "loss": 0.0315,
"macro_f1": 0.6666666865348816,
"num_tokens": 2163910.0,
"repeat_count": 1.0,
- "routers_loss": 0.012611300684511662,
+ "routers_loss": 0.01256406120955944,
"skip_count": 0.0,
"step": 1342,
"text_loss": 0.5996923446655273
@@ -12766,13 +12766,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.11962890625,
"learning_rate": 0.0009830762148385793,
- "loss": 0.0315,
+ "loss": 0.0313,
"macro_f1": 0.3272727429866791,
"num_tokens": 2166921.0,
"repeat_count": 0.0,
- "routers_loss": 0.018757276237010956,
+ "routers_loss": 0.015086234547197819,
"skip_count": 1.0,
"step": 1344,
"text_loss": 0.45356282591819763
@@ -12785,13 +12785,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08447265625,
"learning_rate": 0.0009829962757328297,
- "loss": 0.0229,
+ "loss": 0.0223,
"macro_f1": 0.32098764181137085,
"num_tokens": 2170135.0,
"repeat_count": 0.0,
- "routers_loss": 0.08197146654129028,
+ "routers_loss": 0.07909081131219864,
"skip_count": 2.0,
"step": 1346,
"text_loss": 0.2874644994735718
@@ -12804,13 +12804,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0009829161515419959,
- "loss": 0.0256,
+ "loss": 0.0246,
"macro_f1": 0.6666666865348816,
"num_tokens": 2173029.0,
"repeat_count": 0.0,
- "routers_loss": 0.014122758992016315,
+ "routers_loss": 0.013569854199886322,
"skip_count": 2.0,
"step": 1348,
"text_loss": 0.25533875823020935
@@ -12823,13 +12823,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06005859375,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0009828358422967823,
- "loss": 0.0221,
+ "loss": 0.0226,
"macro_f1": 0.32098764181137085,
"num_tokens": 2176605.0,
"repeat_count": 1.0,
- "routers_loss": 0.08215996623039246,
+ "routers_loss": 0.08111091703176498,
"skip_count": 1.0,
"step": 1350,
"text_loss": 0.32827726006507874
@@ -12842,13 +12842,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09375,
+ "grad_norm": 0.091796875,
"learning_rate": 0.0009827553480279627,
- "loss": 0.0312,
+ "loss": 0.03,
"macro_f1": 0.5427350401878357,
"num_tokens": 2179406.0,
"repeat_count": 0.0,
- "routers_loss": 0.026304977014660835,
+ "routers_loss": 0.026550088077783585,
"skip_count": 2.0,
"step": 1352,
"text_loss": 0.2966301143169403
@@ -12861,13 +12861,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009826746687663832,
- "loss": 0.0302,
+ "loss": 0.0301,
"macro_f1": 0.3333333432674408,
"num_tokens": 2182353.0,
"repeat_count": 0.0,
- "routers_loss": 0.003616038942709565,
+ "routers_loss": 0.003914554137736559,
"skip_count": 0.0,
"step": 1354,
"text_loss": 0.7596251964569092
@@ -12880,13 +12880,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.0849609375,
+ "grad_norm": 0.0859375,
"learning_rate": 0.0009825938045429602,
- "loss": 0.0323,
+ "loss": 0.0324,
"macro_f1": 0.5866667032241821,
"num_tokens": 2185786.0,
"repeat_count": 1.0,
- "routers_loss": 0.060399893671274185,
+ "routers_loss": 0.059612665325403214,
"skip_count": 3.0,
"step": 1356,
"text_loss": 0.12325898557901382
@@ -12899,13 +12899,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.10009765625,
"learning_rate": 0.0009825127553886807,
- "loss": 0.0384,
+ "loss": 0.0375,
"macro_f1": 0.3333333432674408,
"num_tokens": 2190157.0,
"repeat_count": 0.0,
- "routers_loss": 0.007164204493165016,
+ "routers_loss": 0.0071132429875433445,
"skip_count": 0.0,
"step": 1358,
"text_loss": 0.9287898540496826
@@ -12918,13 +12918,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009824315213346033,
- "loss": 0.0343,
+ "loss": 0.0348,
"macro_f1": 0.3333333432674408,
"num_tokens": 2193077.0,
"repeat_count": 0.0,
- "routers_loss": 0.010965060442686081,
+ "routers_loss": 0.009611099027097225,
"skip_count": 0.0,
"step": 1360,
"text_loss": 0.20427259802818298
@@ -12937,13 +12937,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009823501024118569,
- "loss": 0.0276,
+ "loss": 0.0285,
"macro_f1": 0.3333333432674408,
"num_tokens": 2196494.0,
"repeat_count": 0.0,
- "routers_loss": 0.00784136913716793,
+ "routers_loss": 0.006913455203175545,
"skip_count": 0.0,
"step": 1362,
"text_loss": 0.574759840965271
@@ -12956,13 +12956,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009822684986516411,
- "loss": 0.0251,
+ "loss": 0.0245,
"macro_f1": 0.3333333432674408,
"num_tokens": 2199839.0,
"repeat_count": 0.0,
- "routers_loss": 0.009101065807044506,
+ "routers_loss": 0.009208920411765575,
"skip_count": 0.0,
"step": 1364,
"text_loss": 0.42422571778297424
@@ -12970,37 +12970,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 6.413266803639566,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.000982186710085227,
- "loss": 0.0206,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.0208,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 2203212.0,
"repeat_count": 1.0,
- "routers_loss": 0.05967295169830322,
+ "routers_loss": 0.059975091367959976,
"skip_count": 1.0,
"step": 1366,
"text_loss": 0.29213017225265503
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 26.0,
+ "acc_skip": 0.25,
+ "avg_layers": 27.0,
"epoch": 6.42265923099501,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.1875,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.181640625,
"learning_rate": 0.0009821047367439561,
- "loss": 0.0356,
- "macro_f1": 0.542222261428833,
+ "loss": 0.0358,
+ "macro_f1": 0.44705885648727417,
"num_tokens": 2206240.0,
"repeat_count": 0.0,
- "routers_loss": 0.05016552656888962,
+ "routers_loss": 0.048244867473840714,
"skip_count": 4.0,
"step": 1368,
"text_loss": 0.3072395324707031
@@ -13013,13 +13013,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009820225786592405,
- "loss": 0.038,
+ "loss": 0.0375,
"macro_f1": 0.3272727429866791,
"num_tokens": 2209903.0,
"repeat_count": 1.0,
- "routers_loss": 0.02483060024678707,
+ "routers_loss": 0.026068156585097313,
"skip_count": 0.0,
"step": 1370,
"text_loss": 0.5961400270462036
@@ -13032,13 +13032,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.109375,
"learning_rate": 0.0009819402358625634,
- "loss": 0.0373,
+ "loss": 0.0366,
"macro_f1": 0.3272727429866791,
"num_tokens": 2213439.0,
"repeat_count": 0.0,
- "routers_loss": 0.01982821337878704,
+ "routers_loss": 0.022615568712353706,
"skip_count": 1.0,
"step": 1372,
"text_loss": 0.19375644624233246
@@ -13051,13 +13051,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.000981857708385479,
- "loss": 0.0353,
+ "loss": 0.0346,
"macro_f1": 0.3333333432674408,
"num_tokens": 2216457.0,
"repeat_count": 0.0,
- "routers_loss": 0.004753436427563429,
+ "routers_loss": 0.005855285096913576,
"skip_count": 0.0,
"step": 1374,
"text_loss": 0.5123368501663208
@@ -13070,13 +13070,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09912109375,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009817749962596114,
- "loss": 0.0246,
+ "loss": 0.0249,
"macro_f1": 0.3272727429866791,
"num_tokens": 2219975.0,
"repeat_count": 1.0,
- "routers_loss": 0.06541594862937927,
+ "routers_loss": 0.0651634931564331,
"skip_count": 0.0,
"step": 1376,
"text_loss": 0.5999220609664917
@@ -13089,13 +13089,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009816920995166568,
- "loss": 0.0376,
+ "loss": 0.0371,
"macro_f1": 0.6666666865348816,
"num_tokens": 2222833.0,
"repeat_count": 1.0,
- "routers_loss": 0.01156456395983696,
+ "routers_loss": 0.011408994905650616,
"skip_count": 0.0,
"step": 1378,
"text_loss": 0.5323230624198914
@@ -13108,13 +13108,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2392578125,
+ "grad_norm": 0.205078125,
"learning_rate": 0.0009816090181883807,
- "loss": 0.033,
+ "loss": 0.0313,
"macro_f1": 0.32098764181137085,
"num_tokens": 2225842.0,
"repeat_count": 0.0,
- "routers_loss": 0.05175521597266197,
+ "routers_loss": 0.039720915257930756,
"skip_count": 2.0,
"step": 1380,
"text_loss": 0.23363439738750458
@@ -13127,13 +13127,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009815257523066204,
- "loss": 0.0251,
+ "loss": 0.0249,
"macro_f1": 0.3333333432674408,
"num_tokens": 2229430.0,
"repeat_count": 0.0,
- "routers_loss": 0.002684591803699732,
+ "routers_loss": 0.002765297656878829,
"skip_count": 0.0,
"step": 1382,
"text_loss": 0.718977689743042
@@ -13146,13 +13146,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.12890625,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009814423019032835,
- "loss": 0.0397,
+ "loss": 0.0396,
"macro_f1": 0.5492662787437439,
"num_tokens": 2232594.0,
"repeat_count": 2.0,
- "routers_loss": 0.054509978741407394,
+ "routers_loss": 0.05362323671579361,
"skip_count": 0.0,
"step": 1384,
"text_loss": 0.6392166614532471
@@ -13165,13 +13165,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009813586670103483,
"loss": 0.0426,
"macro_f1": 0.6603773832321167,
"num_tokens": 2236327.0,
"repeat_count": 1.0,
- "routers_loss": 0.04031623527407646,
+ "routers_loss": 0.031728316098451614,
"skip_count": 1.0,
"step": 1386,
"text_loss": 0.5951619148254395
@@ -13184,13 +13184,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.126953125,
"learning_rate": 0.0009812748476598638,
- "loss": 0.0308,
+ "loss": 0.031,
"macro_f1": 0.5492662787437439,
"num_tokens": 2239746.0,
"repeat_count": 0.0,
- "routers_loss": 0.039687711745500565,
+ "routers_loss": 0.03981253132224083,
"skip_count": 2.0,
"step": 1388,
"text_loss": 0.22756551206111908
@@ -13203,13 +13203,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.12451171875,
"learning_rate": 0.0009811908438839498,
- "loss": 0.0329,
+ "loss": 0.0331,
"macro_f1": 0.5492662787437439,
"num_tokens": 2242786.0,
"repeat_count": 0.0,
- "routers_loss": 0.04785723611712456,
+ "routers_loss": 0.04617162421345711,
"skip_count": 2.0,
"step": 1390,
"text_loss": 0.3233799934387207
@@ -13222,13 +13222,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.154296875,
"learning_rate": 0.000981106655714797,
- "loss": 0.0359,
+ "loss": 0.0358,
"macro_f1": 0.3272727429866791,
"num_tokens": 2245696.0,
"repeat_count": 0.0,
- "routers_loss": 0.046765491366386414,
+ "routers_loss": 0.046828847378492355,
"skip_count": 1.0,
"step": 1392,
"text_loss": 0.24273279309272766
@@ -13241,13 +13241,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009810222831846656,
- "loss": 0.0303,
+ "loss": 0.0307,
"macro_f1": 0.5492662787437439,
"num_tokens": 2249326.0,
"repeat_count": 0.0,
- "routers_loss": 0.015151665546000004,
+ "routers_loss": 0.010921589098870754,
"skip_count": 2.0,
"step": 1394,
"text_loss": 0.3921460807323456
@@ -13260,13 +13260,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009809377263258882,
- "loss": 0.0321,
+ "loss": 0.0315,
"macro_f1": 0.32098767161369324,
"num_tokens": 2253393.0,
"repeat_count": 0.0,
- "routers_loss": 0.04431106895208359,
+ "routers_loss": 0.04564022272825241,
"skip_count": 1.0,
"step": 1396,
"text_loss": 0.582602858543396
@@ -13279,13 +13279,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.103515625,
"learning_rate": 0.000980852985170867,
- "loss": 0.0317,
+ "loss": 0.0328,
"macro_f1": 0.3272727429866791,
"num_tokens": 2256626.0,
"repeat_count": 0.0,
- "routers_loss": 0.012700649909675121,
+ "routers_loss": 0.013289985246956348,
"skip_count": 0.0,
"step": 1398,
"text_loss": 0.41031694412231445
@@ -13298,13 +13298,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009807680597520745,
- "loss": 0.0256,
+ "loss": 0.0264,
"macro_f1": 0.3333333432674408,
"num_tokens": 2259326.0,
"repeat_count": 0.0,
- "routers_loss": 0.005919010378420353,
+ "routers_loss": 0.0065213534981012344,
"skip_count": 0.0,
"step": 1400,
"text_loss": 0.2888098657131195
@@ -13317,13 +13317,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.23046875,
"learning_rate": 0.0009806829501020546,
- "loss": 0.0372,
+ "loss": 0.0358,
"macro_f1": 0.3272727429866791,
"num_tokens": 2262344.0,
"repeat_count": 0.0,
- "routers_loss": 0.04717765748500824,
+ "routers_loss": 0.04199840500950813,
"skip_count": 1.0,
"step": 1402,
"text_loss": 0.31973034143447876
@@ -13336,13 +13336,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009805976562534215,
"loss": 0.0317,
"macro_f1": 0.6603773832321167,
"num_tokens": 2266354.0,
"repeat_count": 1.0,
- "routers_loss": 0.015415813773870468,
+ "routers_loss": 0.015434930101037025,
"skip_count": 1.0,
"step": 1404,
"text_loss": 0.508630633354187
@@ -13355,13 +13355,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009805121782388599,
"loss": 0.0339,
"macro_f1": 0.6533333659172058,
"num_tokens": 2269660.0,
"repeat_count": 2.0,
- "routers_loss": 0.06812979280948639,
+ "routers_loss": 0.0720924660563469,
"skip_count": 2.0,
"step": 1406,
"text_loss": 0.40927737951278687
@@ -13374,13 +13374,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05908203125,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009804265160911253,
- "loss": 0.0265,
+ "loss": 0.0266,
"macro_f1": 0.5492662787437439,
"num_tokens": 2273335.0,
"repeat_count": 0.0,
- "routers_loss": 0.025383235886693,
+ "routers_loss": 0.02400495670735836,
"skip_count": 2.0,
"step": 1408,
"text_loss": 0.1777762621641159
@@ -13393,13 +13393,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.2314453125,
"learning_rate": 0.0009803406698430433,
- "loss": 0.0367,
+ "loss": 0.0371,
"macro_f1": 0.3272727429866791,
"num_tokens": 2277107.0,
"repeat_count": 0.0,
- "routers_loss": 0.026493225246667862,
+ "routers_loss": 0.02560107782483101,
"skip_count": 1.0,
"step": 1410,
"text_loss": 0.17955881357192993
@@ -13412,13 +13412,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009802546395275104,
- "loss": 0.0342,
+ "loss": 0.0349,
"macro_f1": 0.3333333432674408,
"num_tokens": 2281638.0,
"repeat_count": 0.0,
- "routers_loss": 0.006616846192628145,
+ "routers_loss": 0.006655813194811344,
"skip_count": 0.0,
"step": 1412,
"text_loss": 0.20882295072078705
@@ -13431,32 +13431,32 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.000980168425177494,
- "loss": 0.0328,
+ "loss": 0.0342,
"macro_f1": 0.8200000524520874,
"num_tokens": 2284876.0,
"repeat_count": 1.0,
- "routers_loss": 0.060631848871707916,
+ "routers_loss": 0.06325097382068634,
"skip_count": 3.0,
"step": 1414,
"text_loss": 0.26035264134407043
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 6.648077487525683,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.138671875,
"learning_rate": 0.000980082026826031,
- "loss": 0.0317,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0315,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2288938.0,
"repeat_count": 1.0,
- "routers_loss": 0.011199389584362507,
+ "routers_loss": 0.013436575420200825,
"skip_count": 0.0,
"step": 1416,
"text_loss": 0.5502325892448425
@@ -13469,13 +13469,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0009799954445062296,
- "loss": 0.0192,
+ "loss": 0.0193,
"macro_f1": 0.6603773832321167,
"num_tokens": 2292317.0,
"repeat_count": 1.0,
- "routers_loss": 0.01120354700833559,
+ "routers_loss": 0.011264479719102383,
"skip_count": 1.0,
"step": 1418,
"text_loss": 0.48075684905052185
@@ -13488,13 +13488,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009799086782512686,
- "loss": 0.0294,
+ "loss": 0.0292,
"macro_f1": 0.5492662787437439,
"num_tokens": 2295935.0,
"repeat_count": 0.0,
- "routers_loss": 0.030204148963093758,
+ "routers_loss": 0.02833271212875843,
"skip_count": 2.0,
"step": 1420,
"text_loss": 0.18221206963062286
@@ -13507,13 +13507,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.09375,
"learning_rate": 0.0009798217280943967,
- "loss": 0.0348,
+ "loss": 0.0356,
"macro_f1": 0.6666666865348816,
"num_tokens": 2298927.0,
"repeat_count": 0.0,
- "routers_loss": 0.008244800381362438,
+ "routers_loss": 0.009208574891090393,
"skip_count": 1.0,
"step": 1422,
"text_loss": 0.48686322569847107
@@ -13526,32 +13526,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009797345940689335,
- "loss": 0.0269,
+ "loss": 0.0267,
"macro_f1": 0.3272727429866791,
"num_tokens": 2301541.0,
"repeat_count": 0.0,
- "routers_loss": 0.015340043231844902,
+ "routers_loss": 0.015011847950518131,
"skip_count": 0.0,
"step": 1424,
"text_loss": 0.49446266889572144
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6000000238418579,
- "avg_layers": 25.0,
+ "acc_skip": 0.4000000059604645,
+ "avg_layers": 26.0,
"epoch": 6.695039624302906,
- "f1_execute": 0.9583333134651184,
+ "f1_execute": 0.9387754797935486,
"f1_repeat": 0.0,
- "f1_skip": 0.75,
- "grad_norm": 0.1318359375,
+ "f1_skip": 0.5714285969734192,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.0009796472762082687,
- "loss": 0.0341,
- "macro_f1": 0.5694444179534912,
+ "loss": 0.0338,
+ "macro_f1": 0.5034013986587524,
"num_tokens": 2304589.0,
"repeat_count": 0.0,
- "routers_loss": 0.058681465685367584,
+ "routers_loss": 0.05912091210484505,
"skip_count": 5.0,
"step": 1426,
"text_loss": 0.23945684731006622
@@ -13564,32 +13564,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.09765625,
"learning_rate": 0.000979559774545863,
- "loss": 0.0423,
+ "loss": 0.0405,
"macro_f1": 0.3272727429866791,
"num_tokens": 2307860.0,
"repeat_count": 0.0,
- "routers_loss": 0.020810559391975403,
+ "routers_loss": 0.021242303773760796,
"skip_count": 1.0,
"step": 1428,
"text_loss": 0.531273365020752
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 6.713824479013795,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.09033203125,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.000979472089115247,
- "loss": 0.0268,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0276,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 2311581.0,
"repeat_count": 0.0,
- "routers_loss": 0.030001837760210037,
+ "routers_loss": 0.02768544852733612,
"skip_count": 2.0,
"step": 1430,
"text_loss": 0.2497459501028061
@@ -13602,13 +13602,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1318359375,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.000979384219950022,
- "loss": 0.034,
+ "loss": 0.0346,
"macro_f1": 0.3333333432674408,
"num_tokens": 2314639.0,
"repeat_count": 0.0,
- "routers_loss": 0.010381575673818588,
+ "routers_loss": 0.008678150363266468,
"skip_count": 0.0,
"step": 1432,
"text_loss": 0.6579355001449585
@@ -13621,32 +13621,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08056640625,
"learning_rate": 0.0009792961670838595,
- "loss": 0.0365,
+ "loss": 0.0362,
"macro_f1": 0.3272727429866791,
"num_tokens": 2317927.0,
"repeat_count": 1.0,
- "routers_loss": 0.03234704211354256,
+ "routers_loss": 0.03325597569346428,
"skip_count": 0.0,
"step": 1434,
"text_loss": 0.5209436416625977
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 6.742001761080129,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009792079305505016,
- "loss": 0.0303,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0306,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2321065.0,
"repeat_count": 1.0,
- "routers_loss": 0.015481291338801384,
+ "routers_loss": 0.019228918477892876,
"skip_count": 0.0,
"step": 1436,
"text_loss": 0.41087067127227783
@@ -13659,13 +13659,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.000979119510383761,
- "loss": 0.0366,
+ "loss": 0.0371,
"macro_f1": 0.3333333432674408,
"num_tokens": 2323714.0,
"repeat_count": 0.0,
- "routers_loss": 0.018170451745390892,
+ "routers_loss": 0.017071325331926346,
"skip_count": 0.0,
"step": 1438,
"text_loss": 0.21490029990673065
@@ -13678,13 +13678,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.2060546875,
"learning_rate": 0.00097903090661752,
- "loss": 0.0306,
+ "loss": 0.0309,
"macro_f1": 0.3333333432674408,
"num_tokens": 2326454.0,
"repeat_count": 0.0,
- "routers_loss": 0.010385681875050068,
+ "routers_loss": 0.00991755723953247,
"skip_count": 0.0,
"step": 1440,
"text_loss": 0.23847346007823944
@@ -13697,13 +13697,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.232421875,
"learning_rate": 0.000978942119285732,
- "loss": 0.0407,
+ "loss": 0.0404,
"macro_f1": 0.3272727429866791,
"num_tokens": 2329462.0,
"repeat_count": 0.0,
- "routers_loss": 0.04976538568735123,
+ "routers_loss": 0.04908733069896698,
"skip_count": 1.0,
"step": 1442,
"text_loss": 0.23343028128147125
@@ -13716,13 +13716,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009788531484224204,
- "loss": 0.0255,
+ "loss": 0.0264,
"macro_f1": 0.3333333432674408,
"num_tokens": 2332146.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030266831163316965,
+ "routers_loss": 0.0032628148328512907,
"skip_count": 0.0,
"step": 1444,
"text_loss": 0.47423800826072693
@@ -13730,18 +13730,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.3333333432674408,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 6.788963897857353,
- "f1_execute": 0.9600000381469727,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 0.5,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009787639940616788,
- "loss": 0.0411,
- "macro_f1": 0.8200000524520874,
+ "loss": 0.0405,
+ "macro_f1": 0.7018141150474548,
"num_tokens": 2335738.0,
"repeat_count": 1.0,
- "routers_loss": 0.13420957326889038,
+ "routers_loss": 0.14336998760700226,
"skip_count": 3.0,
"step": 1446,
"text_loss": 0.21837592124938965
@@ -13754,13 +13754,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.189453125,
"learning_rate": 0.0009786746562376717,
- "loss": 0.0251,
+ "loss": 0.0241,
"macro_f1": 0.6666666865348816,
"num_tokens": 2338488.0,
"repeat_count": 0.0,
- "routers_loss": 0.012779864482581615,
+ "routers_loss": 0.010542908683419228,
"skip_count": 1.0,
"step": 1448,
"text_loss": 1.0614757537841797
@@ -13773,13 +13773,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009785851349846334,
- "loss": 0.0266,
+ "loss": 0.0268,
"macro_f1": 0.3333333432674408,
"num_tokens": 2342074.0,
"repeat_count": 0.0,
- "routers_loss": 0.005545398220419884,
+ "routers_loss": 0.005998016335070133,
"skip_count": 0.0,
"step": 1450,
"text_loss": 0.4269719421863556
@@ -13792,13 +13792,13 @@
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009784954303368686,
- "loss": 0.0395,
+ "loss": 0.0384,
"macro_f1": 0.44705885648727417,
"num_tokens": 2345838.0,
"repeat_count": 0.0,
- "routers_loss": 0.0899835154414177,
+ "routers_loss": 0.0959126204252243,
"skip_count": 3.0,
"step": 1452,
"text_loss": 0.3315916955471039
@@ -13811,13 +13811,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.0009784055423287521,
"loss": 0.0218,
"macro_f1": 0.3333333432674408,
"num_tokens": 2348939.0,
"repeat_count": 0.0,
- "routers_loss": 0.002738836221396923,
+ "routers_loss": 0.0025467623490840197,
"skip_count": 0.0,
"step": 1454,
"text_loss": 0.6162732839584351
@@ -13830,13 +13830,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009783154709947293,
- "loss": 0.0266,
+ "loss": 0.0256,
"macro_f1": 0.3272727429866791,
"num_tokens": 2352232.0,
"repeat_count": 0.0,
- "routers_loss": 0.020522192120552063,
+ "routers_loss": 0.01860538125038147,
"skip_count": 1.0,
"step": 1456,
"text_loss": 0.23928768932819366
@@ -13844,18 +13844,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 6.84531846199002,
- "f1_execute": 0.9629629850387573,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009782252163693158,
- "loss": 0.0197,
- "macro_f1": 0.32098767161369324,
+ "loss": 0.0201,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2355159.0,
"repeat_count": 0.0,
- "routers_loss": 0.04245268926024437,
+ "routers_loss": 0.04412713274359703,
"skip_count": 1.0,
"step": 1458,
"text_loss": 0.3371323347091675
@@ -13868,13 +13868,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.224609375,
+ "grad_norm": 0.21484375,
"learning_rate": 0.0009781347784870973,
- "loss": 0.0376,
+ "loss": 0.0379,
"macro_f1": 0.3333333432674408,
"num_tokens": 2358175.0,
"repeat_count": 0.0,
- "routers_loss": 0.009142685681581497,
+ "routers_loss": 0.006809141952544451,
"skip_count": 0.0,
"step": 1460,
"text_loss": 0.547267735004425
@@ -13887,13 +13887,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009780441573827296,
- "loss": 0.0295,
+ "loss": 0.03,
"macro_f1": 0.3076923191547394,
"num_tokens": 2360991.0,
"repeat_count": 0.0,
- "routers_loss": 0.08038893342018127,
+ "routers_loss": 0.08924390375614166,
"skip_count": 4.0,
"step": 1462,
"text_loss": 0.7026563882827759
@@ -13906,13 +13906,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.1865234375,
"learning_rate": 0.000977953353090939,
- "loss": 0.027,
+ "loss": 0.0272,
"macro_f1": 0.3333333432674408,
"num_tokens": 2363894.0,
"repeat_count": 0.0,
- "routers_loss": 0.02107175625860691,
+ "routers_loss": 0.021858472377061844,
"skip_count": 0.0,
"step": 1464,
"text_loss": 0.2718065083026886
@@ -13925,13 +13925,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0009778623656465219,
- "loss": 0.0349,
+ "loss": 0.0338,
"macro_f1": 0.32098764181137085,
"num_tokens": 2367265.0,
"repeat_count": 0.0,
- "routers_loss": 0.042030055075883865,
+ "routers_loss": 0.044781096279621124,
"skip_count": 0.0,
"step": 1466,
"text_loss": 0.5008095502853394
@@ -13944,13 +13944,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009777711950843448,
- "loss": 0.022,
+ "loss": 0.0212,
"macro_f1": 0.3333333432674408,
"num_tokens": 2370186.0,
"repeat_count": 0.0,
- "routers_loss": 0.004230673424899578,
+ "routers_loss": 0.0040459707379341125,
"skip_count": 0.0,
"step": 1468,
"text_loss": 0.5242461562156677
@@ -13963,13 +13963,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009776798414393446,
- "loss": 0.0284,
+ "loss": 0.0279,
"macro_f1": 0.6598639488220215,
"num_tokens": 2373314.0,
"repeat_count": 1.0,
- "routers_loss": 0.06986775249242783,
+ "routers_loss": 0.0708528608083725,
"skip_count": 3.0,
"step": 1470,
"text_loss": 0.2821732461452484
@@ -13982,13 +13982,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.142578125,
+ "grad_norm": 0.1328125,
"learning_rate": 0.0009775883047465279,
- "loss": 0.0431,
+ "loss": 0.0414,
"macro_f1": 0.31446540355682373,
"num_tokens": 2376435.0,
"repeat_count": 1.0,
- "routers_loss": 0.0439564548432827,
+ "routers_loss": 0.0290578193962574,
"skip_count": 1.0,
"step": 1472,
"text_loss": 0.8438440561294556
@@ -14001,13 +14001,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.10546875,
"learning_rate": 0.000977496585040972,
- "loss": 0.0376,
+ "loss": 0.0373,
"macro_f1": 0.3333333432674408,
"num_tokens": 2380244.0,
"repeat_count": 0.0,
- "routers_loss": 0.011889892630279064,
+ "routers_loss": 0.010360375046730042,
"skip_count": 0.0,
"step": 1474,
"text_loss": 0.4356135427951813
@@ -14020,13 +14020,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.000977404682357824,
- "loss": 0.0295,
+ "loss": 0.0294,
"macro_f1": 0.3272727429866791,
"num_tokens": 2383498.0,
"repeat_count": 0.0,
- "routers_loss": 0.022536326199769974,
+ "routers_loss": 0.023518972098827362,
"skip_count": 0.0,
"step": 1476,
"text_loss": 0.25195425748825073
@@ -14039,13 +14039,13 @@
"f1_execute": 0.9743589162826538,
"f1_repeat": 0.888888955116272,
"f1_skip": 1.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.000977312596732301,
- "loss": 0.0388,
+ "loss": 0.0375,
"macro_f1": 0.9544159770011902,
"num_tokens": 2386414.0,
"repeat_count": 5.0,
- "routers_loss": 0.07959948480129242,
+ "routers_loss": 0.08190606534481049,
"skip_count": 4.0,
"step": 1478,
"text_loss": 0.6586798429489136
@@ -14058,13 +14058,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.10546875,
"learning_rate": 0.0009772203281996905,
- "loss": 0.0341,
+ "loss": 0.0336,
"macro_f1": 1.0,
"num_tokens": 2389399.0,
"repeat_count": 1.0,
- "routers_loss": 0.019112225621938705,
+ "routers_loss": 0.016441475600004196,
"skip_count": 2.0,
"step": 1480,
"text_loss": 0.3671986758708954
@@ -14077,13 +14077,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009771278767953502,
- "loss": 0.0345,
+ "loss": 0.0357,
"macro_f1": 0.3333333432674408,
"num_tokens": 2392400.0,
"repeat_count": 0.0,
- "routers_loss": 0.018750866875052452,
+ "routers_loss": 0.019211363047361374,
"skip_count": 0.0,
"step": 1482,
"text_loss": 0.27418580651283264
@@ -14096,32 +14096,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09228515625,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009770352425547072,
- "loss": 0.0291,
+ "loss": 0.0292,
"macro_f1": 0.3333333432674408,
"num_tokens": 2395123.0,
"repeat_count": 0.0,
- "routers_loss": 0.015407348051667213,
+ "routers_loss": 0.015800386667251587,
"skip_count": 0.0,
"step": 1484,
"text_loss": 0.19896622002124786
},
{
- "acc_repeat": 0.6666666865348816,
+ "acc_repeat": 0.3333333432674408,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 29.0,
"epoch": 6.976812444966246,
- "f1_execute": 0.9803921580314636,
- "f1_repeat": 0.800000011920929,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.5,
"f1_skip": 0.0,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009769424255132596,
- "loss": 0.0258,
- "macro_f1": 0.5934640765190125,
+ "loss": 0.0256,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 2397359.0,
"repeat_count": 3.0,
- "routers_loss": 0.06514479219913483,
+ "routers_loss": 0.06670158356428146,
"skip_count": 0.0,
"step": 1486,
"text_loss": 0.4229799509048462
@@ -14134,13 +14134,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.111328125,
+ "grad_norm": 0.1162109375,
"learning_rate": 0.0009768494257065747,
- "loss": 0.0217,
+ "loss": 0.0218,
"macro_f1": 0.3272727429866791,
"num_tokens": 2400387.0,
"repeat_count": 0.0,
- "routers_loss": 0.013567833229899406,
+ "routers_loss": 0.011144762858748436,
"skip_count": 1.0,
"step": 1488,
"text_loss": 0.4264226257801056
@@ -14153,13 +14153,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12255859375,
+ "grad_norm": 0.12353515625,
"learning_rate": 0.0009767562431702904,
- "loss": 0.0389,
+ "loss": 0.0387,
"macro_f1": 0.3006536364555359,
"num_tokens": 2403241.0,
"repeat_count": 2.0,
- "routers_loss": 0.13762018084526062,
+ "routers_loss": 0.12339717149734497,
"skip_count": 3.0,
"step": 1490,
"text_loss": 0.2850193977355957
@@ -14172,13 +14172,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0009766628779401142,
- "loss": 0.0214,
+ "loss": 0.0215,
"macro_f1": 0.6666666865348816,
"num_tokens": 2406087.0,
"repeat_count": 0.0,
- "routers_loss": 0.008640666492283344,
+ "routers_loss": 0.008174685761332512,
"skip_count": 1.0,
"step": 1492,
"text_loss": 0.6756544709205627
@@ -14191,13 +14191,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05712890625,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.000976569330051824,
- "loss": 0.0182,
+ "loss": 0.0186,
"macro_f1": 0.3333333432674408,
"num_tokens": 2409312.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018257038900628686,
+ "routers_loss": 0.0021256296895444393,
"skip_count": 0.0,
"step": 1494,
"text_loss": 0.4789894223213196
@@ -14210,13 +14210,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0009764755995412677,
"loss": 0.0193,
"macro_f1": 0.3333333432674408,
"num_tokens": 2412758.0,
"repeat_count": 0.0,
- "routers_loss": 0.003656312357634306,
+ "routers_loss": 0.003944927826523781,
"skip_count": 0.0,
"step": 1496,
"text_loss": 0.5157490968704224
@@ -14229,13 +14229,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009763816864443627,
- "loss": 0.0246,
+ "loss": 0.0239,
"macro_f1": 0.3272727429866791,
"num_tokens": 2416079.0,
"repeat_count": 1.0,
- "routers_loss": 0.044268425554037094,
+ "routers_loss": 0.03893325850367546,
"skip_count": 0.0,
"step": 1498,
"text_loss": 0.28045418858528137
@@ -14248,13 +14248,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.0009762875907970968,
- "loss": 0.0207,
+ "loss": 0.0199,
"macro_f1": 0.3333333432674408,
"num_tokens": 2420340.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018966116476804018,
+ "routers_loss": 0.0017725443467497826,
"skip_count": 0.0,
"step": 1500,
"text_loss": 0.35550856590270996
@@ -14267,32 +14267,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0009761933126355277,
- "loss": 0.0249,
+ "loss": 0.0245,
"macro_f1": 0.3272727429866791,
"num_tokens": 2424735.0,
"repeat_count": 0.0,
- "routers_loss": 0.01729201152920723,
+ "routers_loss": 0.01393749937415123,
"skip_count": 1.0,
"step": 1502,
"text_loss": 0.38840189576148987
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 7.06105077781039,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.11962890625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009760988519957828,
- "loss": 0.0248,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0249,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 2428132.0,
"repeat_count": 0.0,
- "routers_loss": 0.01693531684577465,
+ "routers_loss": 0.01687910407781601,
"skip_count": 2.0,
"step": 1504,
"text_loss": 0.3031681478023529
@@ -14305,13 +14305,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0009760042089140598,
- "loss": 0.0197,
+ "loss": 0.0193,
"macro_f1": 0.3144654333591461,
"num_tokens": 2431592.0,
"repeat_count": 1.0,
- "routers_loss": 0.04939094930887222,
+ "routers_loss": 0.04704280197620392,
"skip_count": 2.0,
"step": 1506,
"text_loss": 0.16355200111865997
@@ -14324,13 +14324,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009759093834266259,
- "loss": 0.0213,
+ "loss": 0.0206,
"macro_f1": 0.3333333432674408,
"num_tokens": 2434236.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016892930725589395,
+ "routers_loss": 0.0016075772000476718,
"skip_count": 0.0,
"step": 1508,
"text_loss": 0.6080073118209839
@@ -14343,13 +14343,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009758143755698186,
- "loss": 0.0147,
+ "loss": 0.015,
"macro_f1": 0.3333333432674408,
"num_tokens": 2437170.0,
"repeat_count": 0.0,
- "routers_loss": 0.008671467192471027,
+ "routers_loss": 0.008451299741864204,
"skip_count": 0.0,
"step": 1510,
"text_loss": 0.22100484371185303
@@ -14362,13 +14362,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009757191853800449,
- "loss": 0.0228,
+ "loss": 0.0227,
"macro_f1": 0.5866667032241821,
"num_tokens": 2441187.0,
"repeat_count": 1.0,
- "routers_loss": 0.042682576924562454,
+ "routers_loss": 0.046565692871809006,
"skip_count": 3.0,
"step": 1512,
"text_loss": 0.25098952651023865
@@ -14381,13 +14381,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.11279296875,
"learning_rate": 0.000975623812893782,
- "loss": 0.028,
+ "loss": 0.0276,
"macro_f1": 0.3272727429866791,
"num_tokens": 2444664.0,
"repeat_count": 0.0,
- "routers_loss": 0.02905822917819023,
+ "routers_loss": 0.02872578240931034,
"skip_count": 1.0,
"step": 1514,
"text_loss": 0.4952253997325897
@@ -14400,13 +14400,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09326171875,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.0009755282581475768,
- "loss": 0.0223,
+ "loss": 0.0233,
"macro_f1": 0.3333333432674408,
"num_tokens": 2447748.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018810008186846972,
+ "routers_loss": 0.002055214950814843,
"skip_count": 0.0,
"step": 1516,
"text_loss": 0.7465500831604004
@@ -14419,13 +14419,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.10302734375,
"learning_rate": 0.000975432521178046,
- "loss": 0.0219,
+ "loss": 0.0216,
"macro_f1": 0.3272727429866791,
"num_tokens": 2450834.0,
"repeat_count": 1.0,
- "routers_loss": 0.04308714717626572,
+ "routers_loss": 0.04498551785945892,
"skip_count": 0.0,
"step": 1518,
"text_loss": 0.28144413232803345
@@ -14438,13 +14438,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009753366020218763,
- "loss": 0.0232,
+ "loss": 0.0234,
"macro_f1": 0.3333333432674408,
"num_tokens": 2454233.0,
"repeat_count": 0.0,
- "routers_loss": 0.003754811594262719,
+ "routers_loss": 0.003669742727652192,
"skip_count": 0.0,
"step": 1520,
"text_loss": 0.5667551755905151
@@ -14457,32 +14457,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009752405007158238,
- "loss": 0.0246,
+ "loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 2457331.0,
"repeat_count": 0.0,
- "routers_loss": 0.010853761807084084,
+ "routers_loss": 0.010455607436597347,
"skip_count": 0.0,
"step": 1522,
"text_loss": 0.19575810432434082
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.5,
"acc_skip": 1.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 7.154975051364837,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0009751442172967151,
- "loss": 0.0196,
- "macro_f1": 1.0,
+ "loss": 0.0193,
+ "macro_f1": 0.8823530077934265,
"num_tokens": 2459935.0,
"repeat_count": 2.0,
- "routers_loss": 0.015100379474461079,
+ "routers_loss": 0.025189083069562912,
"skip_count": 1.0,
"step": 1524,
"text_loss": 0.45453405380249023
@@ -14495,13 +14495,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.000975047751801446,
- "loss": 0.0189,
+ "loss": 0.0187,
"macro_f1": 0.3272727429866791,
"num_tokens": 2463008.0,
"repeat_count": 0.0,
- "routers_loss": 0.011991916224360466,
+ "routers_loss": 0.012297490611672401,
"skip_count": 0.0,
"step": 1526,
"text_loss": 0.31437572836875916
@@ -14514,32 +14514,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009749511042669823,
- "loss": 0.0226,
+ "loss": 0.0233,
"macro_f1": 0.3333333432674408,
"num_tokens": 2466475.0,
"repeat_count": 0.0,
- "routers_loss": 0.008201062679290771,
+ "routers_loss": 0.011026266030967236,
"skip_count": 0.0,
"step": 1528,
"text_loss": 0.46604859828948975
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 7.183152333431171,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.1181640625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1376953125,
"learning_rate": 0.0009748542747303595,
- "loss": 0.0174,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0182,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2469320.0,
"repeat_count": 0.0,
- "routers_loss": 0.008513177745044231,
+ "routers_loss": 0.011934996582567692,
"skip_count": 1.0,
"step": 1530,
"text_loss": 0.7764923572540283
@@ -14552,13 +14552,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.0966796875,
"learning_rate": 0.0009747572632286827,
- "loss": 0.02,
+ "loss": 0.0203,
"macro_f1": 0.3333333432674408,
"num_tokens": 2472468.0,
"repeat_count": 0.0,
- "routers_loss": 0.004850955214351416,
+ "routers_loss": 0.005786920432001352,
"skip_count": 0.0,
"step": 1532,
"text_loss": 0.3555782437324524
@@ -14571,32 +14571,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.087890625,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009746600697991271,
- "loss": 0.0206,
+ "loss": 0.02,
"macro_f1": 0.6666666865348816,
"num_tokens": 2475736.0,
"repeat_count": 1.0,
- "routers_loss": 0.0027650354895740747,
+ "routers_loss": 0.0026990731712430716,
"skip_count": 0.0,
"step": 1534,
"text_loss": 0.49561792612075806
},
{
"acc_repeat": 1.0,
- "acc_skip": 0.0,
- "avg_layers": 29.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
"epoch": 7.2113296154975055,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
- "f1_skip": 0.0,
- "grad_norm": 0.0615234375,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0556640625,
"learning_rate": 0.0009745626944789375,
- "loss": 0.0209,
- "macro_f1": 0.6538461446762085,
+ "loss": 0.0204,
+ "macro_f1": 0.8823530077934265,
"num_tokens": 2478887.0,
"repeat_count": 1.0,
- "routers_loss": 0.023268593475222588,
+ "routers_loss": 0.020221207290887833,
"skip_count": 2.0,
"step": 1536,
"text_loss": 0.5375416278839111
@@ -14609,13 +14609,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11669921875,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009744651373054279,
"loss": 0.0286,
"macro_f1": 0.3272727429866791,
"num_tokens": 2481293.0,
"repeat_count": 0.0,
- "routers_loss": 0.031235001981258392,
+ "routers_loss": 0.03131086751818657,
"skip_count": 1.0,
"step": 1538,
"text_loss": 0.5241039395332336
@@ -14628,13 +14628,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009743673983159828,
- "loss": 0.023,
+ "loss": 0.0241,
"macro_f1": 0.6122449040412903,
"num_tokens": 2484403.0,
"repeat_count": 0.0,
- "routers_loss": 0.042398080229759216,
+ "routers_loss": 0.04448170214891434,
"skip_count": 4.0,
"step": 1540,
"text_loss": 0.7465724349021912
@@ -14647,13 +14647,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009742694775480557,
- "loss": 0.0268,
+ "loss": 0.0265,
"macro_f1": 0.6666666865348816,
"num_tokens": 2487952.0,
"repeat_count": 0.0,
- "routers_loss": 0.007361465133726597,
+ "routers_loss": 0.007171491626650095,
"skip_count": 1.0,
"step": 1542,
"text_loss": 0.2877117097377777
@@ -14666,13 +14666,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009741713750391703,
- "loss": 0.0166,
+ "loss": 0.0171,
"macro_f1": 0.6666666865348816,
"num_tokens": 2490815.0,
"repeat_count": 1.0,
- "routers_loss": 0.0052334014326334,
+ "routers_loss": 0.004559285007417202,
"skip_count": 0.0,
"step": 1544,
"text_loss": 0.6097800135612488
@@ -14685,13 +14685,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.06787109375,
"learning_rate": 0.0009740730908269193,
"loss": 0.0174,
"macro_f1": 0.3333333432674408,
"num_tokens": 2494727.0,
"repeat_count": 0.0,
- "routers_loss": 0.004993532784283161,
+ "routers_loss": 0.005271553061902523,
"skip_count": 0.0,
"step": 1546,
"text_loss": 0.5431114435195923
@@ -14704,13 +14704,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009739746249489658,
- "loss": 0.0248,
+ "loss": 0.0239,
"macro_f1": 0.3333333432674408,
"num_tokens": 2499266.0,
"repeat_count": 0.0,
- "routers_loss": 0.001611889572814107,
+ "routers_loss": 0.0015409323386847973,
"skip_count": 0.0,
"step": 1548,
"text_loss": 0.4702678322792053
@@ -14723,13 +14723,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.1171875,
"learning_rate": 0.0009738759774430417,
- "loss": 0.0209,
+ "loss": 0.0216,
"macro_f1": 0.32098764181137085,
"num_tokens": 2502273.0,
"repeat_count": 1.0,
- "routers_loss": 0.03059260919690132,
+ "routers_loss": 0.030183158814907074,
"skip_count": 1.0,
"step": 1550,
"text_loss": 0.3239189088344574
@@ -14742,32 +14742,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056396484375,
+ "grad_norm": 0.0498046875,
"learning_rate": 0.0009737771483469493,
- "loss": 0.0195,
+ "loss": 0.0196,
"macro_f1": 0.3333333432674408,
"num_tokens": 2507624.0,
"repeat_count": 0.0,
- "routers_loss": 0.00508903618901968,
+ "routers_loss": 0.005410848651081324,
"skip_count": 0.0,
"step": 1552,
"text_loss": 0.4014642834663391
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 7.295861461696507,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
+ "f1_skip": 1.0,
"grad_norm": 0.07763671875,
"learning_rate": 0.0009736781376985598,
- "loss": 0.0174,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0168,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 2510366.0,
"repeat_count": 0.0,
- "routers_loss": 0.007860450074076653,
+ "routers_loss": 0.0066976165398955345,
"skip_count": 1.0,
"step": 1554,
"text_loss": 0.5924848914146423
@@ -14780,13 +14780,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11669921875,
+ "grad_norm": 0.13671875,
"learning_rate": 0.0009735789455358144,
- "loss": 0.0217,
+ "loss": 0.022,
"macro_f1": 0.3333333432674408,
"num_tokens": 2513317.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027370608877390623,
+ "routers_loss": 0.002763477386906743,
"skip_count": 0.0,
"step": 1556,
"text_loss": 0.3222943842411041
@@ -14799,13 +14799,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.11767578125,
"learning_rate": 0.0009734795718967237,
- "loss": 0.0276,
+ "loss": 0.0283,
"macro_f1": 0.32098764181137085,
"num_tokens": 2516628.0,
"repeat_count": 0.0,
- "routers_loss": 0.061584725975990295,
+ "routers_loss": 0.061566028743982315,
"skip_count": 2.0,
"step": 1558,
"text_loss": 0.3249334692955017
@@ -14818,13 +14818,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009733800168193679,
"loss": 0.0228,
"macro_f1": 1.0,
"num_tokens": 2519424.0,
"repeat_count": 2.0,
- "routers_loss": 0.01694316789507866,
+ "routers_loss": 0.017976421862840652,
"skip_count": 4.0,
"step": 1560,
"text_loss": 0.3341919481754303
@@ -14837,13 +14837,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1826171875,
"learning_rate": 0.0009732802803418966,
- "loss": 0.0234,
+ "loss": 0.023,
"macro_f1": 0.3333333432674408,
"num_tokens": 2522922.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023331891279667616,
+ "routers_loss": 0.002525332849472761,
"skip_count": 0.0,
"step": 1562,
"text_loss": 0.3176332712173462
@@ -14856,13 +14856,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.07861328125,
"learning_rate": 0.0009731803625025292,
- "loss": 0.0203,
+ "loss": 0.0196,
"macro_f1": 0.3272727429866791,
"num_tokens": 2525811.0,
"repeat_count": 0.0,
- "routers_loss": 0.021300682798027992,
+ "routers_loss": 0.015524424612522125,
"skip_count": 1.0,
"step": 1564,
"text_loss": 0.532774031162262
@@ -14875,13 +14875,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.0009730802633395541,
- "loss": 0.026,
+ "loss": 0.0257,
"macro_f1": 0.6603773832321167,
"num_tokens": 2529157.0,
"repeat_count": 1.0,
- "routers_loss": 0.08335043489933014,
+ "routers_loss": 0.08138631284236908,
"skip_count": 1.0,
"step": 1566,
"text_loss": 0.529487133026123
@@ -14894,13 +14894,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009729799828913298,
- "loss": 0.0224,
+ "loss": 0.0223,
"macro_f1": 0.3333333432674408,
"num_tokens": 2532249.0,
"repeat_count": 0.0,
- "routers_loss": 0.003535634372383356,
+ "routers_loss": 0.0035867292899638414,
"skip_count": 0.0,
"step": 1568,
"text_loss": 0.503160297870636
@@ -14913,13 +14913,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009728795211962838,
"loss": 0.0259,
"macro_f1": 0.5492662787437439,
"num_tokens": 2535904.0,
"repeat_count": 0.0,
- "routers_loss": 0.025729363784193993,
+ "routers_loss": 0.02987455204129219,
"skip_count": 2.0,
"step": 1570,
"text_loss": 0.9170270562171936
@@ -14932,13 +14932,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.11865234375,
"learning_rate": 0.0009727788782929131,
- "loss": 0.0287,
+ "loss": 0.0273,
"macro_f1": 0.3272727429866791,
"num_tokens": 2538943.0,
"repeat_count": 1.0,
- "routers_loss": 0.059166863560676575,
+ "routers_loss": 0.04676021635532379,
"skip_count": 0.0,
"step": 1572,
"text_loss": 0.29146310687065125
@@ -14951,13 +14951,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0009726780542197844,
- "loss": 0.0173,
+ "loss": 0.0169,
"macro_f1": 0.3333333432674408,
"num_tokens": 2541805.0,
"repeat_count": 0.0,
- "routers_loss": 0.002580022206529975,
+ "routers_loss": 0.002127803163602948,
"skip_count": 0.0,
"step": 1574,
"text_loss": 1.0126502513885498
@@ -14970,13 +14970,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009725770490155338,
- "loss": 0.0257,
+ "loss": 0.0262,
"macro_f1": 0.3333333432674408,
"num_tokens": 2546213.0,
"repeat_count": 0.0,
- "routers_loss": 0.007746981456875801,
+ "routers_loss": 0.007609677035361528,
"skip_count": 0.0,
"step": 1576,
"text_loss": 0.190168559551239
@@ -14989,13 +14989,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.083984375,
"learning_rate": 0.0009724758627188665,
- "loss": 0.0344,
+ "loss": 0.0356,
"macro_f1": 0.3272727429866791,
"num_tokens": 2549554.0,
"repeat_count": 0.0,
- "routers_loss": 0.027308562770485878,
+ "routers_loss": 0.033554721623659134,
"skip_count": 1.0,
"step": 1578,
"text_loss": 0.2977406084537506
@@ -15008,13 +15008,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009723744953685572,
- "loss": 0.0277,
+ "loss": 0.028,
"macro_f1": 0.3272727429866791,
"num_tokens": 2552785.0,
"repeat_count": 1.0,
- "routers_loss": 0.029863199219107628,
+ "routers_loss": 0.027864238247275352,
"skip_count": 0.0,
"step": 1580,
"text_loss": 0.2700682580471039
@@ -15027,13 +15027,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.19921875,
"learning_rate": 0.0009722729470034503,
- "loss": 0.0218,
+ "loss": 0.0224,
"macro_f1": 0.3333333432674408,
"num_tokens": 2556550.0,
"repeat_count": 0.0,
- "routers_loss": 0.004019706044346094,
+ "routers_loss": 0.004798175301402807,
"skip_count": 0.0,
"step": 1582,
"text_loss": 0.6559903025627136
@@ -15046,32 +15046,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07177734375,
+ "grad_norm": 0.078125,
"learning_rate": 0.0009721712176624591,
- "loss": 0.0239,
+ "loss": 0.0242,
"macro_f1": 0.3333333432674408,
"num_tokens": 2559862.0,
"repeat_count": 0.0,
- "routers_loss": 0.014162382110953331,
+ "routers_loss": 0.013764148578047752,
"skip_count": 0.0,
"step": 1584,
"text_loss": 0.2257535308599472
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 7.446140299383622,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.0009720693073845667,
- "loss": 0.0338,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.032,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 2562766.0,
"repeat_count": 0.0,
- "routers_loss": 0.023485012352466583,
+ "routers_loss": 0.01937069371342659,
"skip_count": 2.0,
"step": 1586,
"text_loss": 0.178413525223732
@@ -15079,37 +15079,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 7.455532726739067,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009719672162088252,
- "loss": 0.0308,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0306,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 2566583.0,
"repeat_count": 1.0,
- "routers_loss": 0.05822715163230896,
+ "routers_loss": 0.06224144622683525,
"skip_count": 0.0,
"step": 1588,
"text_loss": 0.3992367684841156
},
{
- "acc_repeat": 0.5,
- "acc_skip": 0.5,
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
"avg_layers": 27.0,
"epoch": 7.464925154094511,
- "f1_execute": 0.936170220375061,
- "f1_repeat": 0.6666666865348816,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.189453125,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.185546875,
"learning_rate": 0.0009718649441743559,
- "loss": 0.0243,
- "macro_f1": 0.7565011978149414,
+ "loss": 0.0239,
+ "macro_f1": 0.9449735879898071,
"num_tokens": 2569516.0,
"repeat_count": 2.0,
- "routers_loss": 0.07448136061429977,
+ "routers_loss": 0.06937911361455917,
"skip_count": 4.0,
"step": 1590,
"text_loss": 0.1945122629404068
@@ -15122,13 +15122,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.00097176249132035,
- "loss": 0.0228,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2572418.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038424162194132805,
+ "routers_loss": 0.0034326619934290648,
"skip_count": 0.0,
"step": 1592,
"text_loss": 0.6259906888008118
@@ -15141,13 +15141,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.08642578125,
"learning_rate": 0.0009716598576860676,
- "loss": 0.0277,
+ "loss": 0.0278,
"macro_f1": 0.6666666865348816,
"num_tokens": 2575235.0,
"repeat_count": 1.0,
- "routers_loss": 0.005674343090504408,
+ "routers_loss": 0.004557516425848007,
"skip_count": 0.0,
"step": 1594,
"text_loss": 0.6638736724853516
@@ -15160,13 +15160,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.193359375,
"learning_rate": 0.0009715570433108378,
- "loss": 0.0209,
+ "loss": 0.0198,
"macro_f1": 1.0,
"num_tokens": 2578157.0,
"repeat_count": 1.0,
- "routers_loss": 0.015544800087809563,
+ "routers_loss": 0.015363055281341076,
"skip_count": 1.0,
"step": 1596,
"text_loss": 0.6530464887619019
@@ -15179,13 +15179,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009714540482340595,
- "loss": 0.0279,
+ "loss": 0.0268,
"macro_f1": 0.6666666865348816,
"num_tokens": 2581801.0,
"repeat_count": 1.0,
- "routers_loss": 0.013199405744671822,
+ "routers_loss": 0.01257144846022129,
"skip_count": 0.0,
"step": 1598,
"text_loss": 0.5916110277175903
@@ -15198,13 +15198,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059326171875,
+ "grad_norm": 0.058837890625,
"learning_rate": 0.0009713508724952006,
- "loss": 0.0178,
+ "loss": 0.0177,
"macro_f1": 0.3333333432674408,
"num_tokens": 2585204.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032487998250871897,
+ "routers_loss": 0.003175645601004362,
"skip_count": 0.0,
"step": 1600,
"text_loss": 0.27901601791381836
@@ -15217,13 +15217,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12255859375,
+ "grad_norm": 0.12353515625,
"learning_rate": 0.0009712475161337981,
- "loss": 0.0253,
+ "loss": 0.0261,
"macro_f1": 0.3333333432674408,
"num_tokens": 2588286.0,
"repeat_count": 0.0,
- "routers_loss": 0.0041928659193217754,
+ "routers_loss": 0.004122321493923664,
"skip_count": 0.0,
"step": 1602,
"text_loss": 0.42420244216918945
@@ -15236,13 +15236,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009711439791894585,
- "loss": 0.0343,
+ "loss": 0.0341,
"macro_f1": 0.6666666865348816,
"num_tokens": 2591476.0,
"repeat_count": 0.0,
- "routers_loss": 0.011576149612665176,
+ "routers_loss": 0.011215819045901299,
"skip_count": 1.0,
"step": 1604,
"text_loss": 0.5549933910369873
@@ -15255,13 +15255,13 @@
"f1_execute": 0.9599999785423279,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009710402617018574,
- "loss": 0.0179,
+ "loss": 0.0172,
"macro_f1": 0.8200000524520874,
"num_tokens": 2594336.0,
"repeat_count": 1.0,
- "routers_loss": 0.03026912547647953,
+ "routers_loss": 0.02916567400097847,
"skip_count": 2.0,
"step": 1606,
"text_loss": 0.3263779282569885
@@ -15276,11 +15276,11 @@
"f1_skip": 1.0,
"grad_norm": 0.068359375,
"learning_rate": 0.0009709363637107393,
- "loss": 0.021,
+ "loss": 0.0209,
"macro_f1": 0.6666666865348816,
"num_tokens": 2597462.0,
"repeat_count": 0.0,
- "routers_loss": 0.014957098290324211,
+ "routers_loss": 0.015897957608103752,
"skip_count": 1.0,
"step": 1608,
"text_loss": 0.20917139947414398
@@ -15293,13 +15293,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009708322852559184,
- "loss": 0.0226,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2601543.0,
"repeat_count": 0.0,
- "routers_loss": 0.00254683755338192,
+ "routers_loss": 0.002211357234045863,
"skip_count": 0.0,
"step": 1610,
"text_loss": 0.450550377368927
@@ -15312,13 +15312,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1748046875,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009707280263772776,
- "loss": 0.0286,
+ "loss": 0.0277,
"macro_f1": 0.6666666865348816,
"num_tokens": 2604462.0,
"repeat_count": 0.0,
- "routers_loss": 0.018759876489639282,
+ "routers_loss": 0.01615734025835991,
"skip_count": 2.0,
"step": 1612,
"text_loss": 0.6908381581306458
@@ -15337,7 +15337,7 @@
"macro_f1": 0.5492662787437439,
"num_tokens": 2607484.0,
"repeat_count": 0.0,
- "routers_loss": 0.022694367915391922,
+ "routers_loss": 0.022048067301511765,
"skip_count": 2.0,
"step": 1614,
"text_loss": 0.36691340804100037
@@ -15350,13 +15350,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.10546875,
"learning_rate": 0.0009705189675084138,
- "loss": 0.0181,
+ "loss": 0.0176,
"macro_f1": 0.6666666865348816,
"num_tokens": 2610204.0,
"repeat_count": 0.0,
- "routers_loss": 0.010102321393787861,
+ "routers_loss": 0.008503952994942665,
"skip_count": 1.0,
"step": 1616,
"text_loss": 0.5226598381996155
@@ -15369,13 +15369,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08984375,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009704141675983029,
- "loss": 0.0252,
+ "loss": 0.0248,
"macro_f1": 0.3333333432674408,
"num_tokens": 2613128.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020994991064071655,
+ "routers_loss": 0.0019020626787096262,
"skip_count": 0.0,
"step": 1618,
"text_loss": 0.6465088725090027
@@ -15388,13 +15388,13 @@
"f1_execute": 0.9333333373069763,
"f1_repeat": 0.0,
"f1_skip": 0.7272727489471436,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.107421875,
"learning_rate": 0.0009703091874245956,
- "loss": 0.0323,
+ "loss": 0.032,
"macro_f1": 0.5535354018211365,
"num_tokens": 2616360.0,
"repeat_count": 0.0,
- "routers_loss": 0.11748704314231873,
+ "routers_loss": 0.11837691068649292,
"skip_count": 7.0,
"step": 1620,
"text_loss": 0.2987039089202881
@@ -15407,32 +15407,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009702040270275204,
- "loss": 0.018,
+ "loss": 0.0181,
"macro_f1": 0.3333333432674408,
"num_tokens": 2619606.0,
"repeat_count": 0.0,
- "routers_loss": 0.007642311509698629,
+ "routers_loss": 0.0065958453342318535,
"skip_count": 0.0,
"step": 1622,
"text_loss": 0.6262096166610718
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 7.62459641913707,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.10595703125,
+ "f1_skip": 1.0,
+ "grad_norm": 0.103515625,
"learning_rate": 0.000970098686447375,
- "loss": 0.0258,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0257,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 2622499.0,
"repeat_count": 0.0,
- "routers_loss": 0.016890225932002068,
+ "routers_loss": 0.013632026500999928,
"skip_count": 1.0,
"step": 1624,
"text_loss": 0.2392602562904358
@@ -15445,13 +15445,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.125,
"learning_rate": 0.0009699931657245264,
- "loss": 0.0242,
+ "loss": 0.0245,
"macro_f1": 0.5492662787437439,
"num_tokens": 2626002.0,
"repeat_count": 0.0,
- "routers_loss": 0.010900186374783516,
+ "routers_loss": 0.012147823348641396,
"skip_count": 2.0,
"step": 1626,
"text_loss": 0.4742976129055023
@@ -15464,13 +15464,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0810546875,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009698874648994098,
- "loss": 0.0279,
+ "loss": 0.0285,
"macro_f1": 1.0,
"num_tokens": 2629847.0,
"repeat_count": 1.0,
- "routers_loss": 0.011229799129068851,
+ "routers_loss": 0.010692884214222431,
"skip_count": 3.0,
"step": 1628,
"text_loss": 0.5090685486793518
@@ -15483,13 +15483,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.0009697815840125304,
- "loss": 0.0275,
+ "loss": 0.0265,
"macro_f1": 0.3333333432674408,
"num_tokens": 2633529.0,
"repeat_count": 0.0,
- "routers_loss": 0.0105878422036767,
+ "routers_loss": 0.011442207731306553,
"skip_count": 0.0,
"step": 1630,
"text_loss": 0.1874329298734665
@@ -15502,13 +15502,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.2119140625,
"learning_rate": 0.0009696755231044618,
- "loss": 0.0209,
+ "loss": 0.0207,
"macro_f1": 0.3333333432674408,
"num_tokens": 2636321.0,
"repeat_count": 0.0,
- "routers_loss": 0.002953991526737809,
+ "routers_loss": 0.0026681360322982073,
"skip_count": 0.0,
"step": 1632,
"text_loss": 0.7650400400161743
@@ -15521,13 +15521,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10888671875,
+ "grad_norm": 0.10498046875,
"learning_rate": 0.0009695692822158466,
- "loss": 0.0241,
+ "loss": 0.0242,
"macro_f1": 0.3272727429866791,
"num_tokens": 2638840.0,
"repeat_count": 1.0,
- "routers_loss": 0.04717390984296799,
+ "routers_loss": 0.033965807408094406,
"skip_count": 0.0,
"step": 1634,
"text_loss": 0.6175784468650818
@@ -15540,13 +15540,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0009694628613873968,
- "loss": 0.0179,
+ "loss": 0.018,
"macro_f1": 0.3333333432674408,
"num_tokens": 2641886.0,
"repeat_count": 0.0,
- "routers_loss": 0.0073657832108438015,
+ "routers_loss": 0.007568214554339647,
"skip_count": 0.0,
"step": 1636,
"text_loss": 0.43139931559562683
@@ -15559,13 +15559,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.193359375,
"learning_rate": 0.0009693562606598929,
- "loss": 0.0259,
+ "loss": 0.025,
"macro_f1": 0.3333333432674408,
"num_tokens": 2645028.0,
"repeat_count": 0.0,
- "routers_loss": 0.005212752148509026,
+ "routers_loss": 0.004973865579813719,
"skip_count": 0.0,
"step": 1638,
"text_loss": 0.6430339217185974
@@ -15578,13 +15578,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0009692494800741844,
- "loss": 0.0304,
+ "loss": 0.0313,
"macro_f1": 0.3272727429866791,
"num_tokens": 2648209.0,
"repeat_count": 1.0,
- "routers_loss": 0.04311618581414223,
+ "routers_loss": 0.049863800406455994,
"skip_count": 0.0,
"step": 1640,
"text_loss": 0.28138160705566406
@@ -15597,13 +15597,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08251953125,
+ "grad_norm": 0.08544921875,
"learning_rate": 0.0009691425196711901,
- "loss": 0.039,
+ "loss": 0.0398,
"macro_f1": 0.3272727429866791,
"num_tokens": 2651171.0,
"repeat_count": 0.0,
- "routers_loss": 0.02027471922338009,
+ "routers_loss": 0.02112230286002159,
"skip_count": 0.0,
"step": 1642,
"text_loss": 0.3745322525501251
@@ -15616,13 +15616,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009690353794918971,
- "loss": 0.0279,
+ "loss": 0.0275,
"macro_f1": 0.3333333432674408,
"num_tokens": 2654093.0,
"repeat_count": 0.0,
- "routers_loss": 0.003074956126511097,
+ "routers_loss": 0.0024304776452481747,
"skip_count": 0.0,
"step": 1644,
"text_loss": 0.4275154173374176
@@ -15635,13 +15635,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.0771484375,
"learning_rate": 0.000968928059577362,
- "loss": 0.0241,
+ "loss": 0.0244,
"macro_f1": 0.6666666865348816,
"num_tokens": 2657079.0,
"repeat_count": 0.0,
- "routers_loss": 0.009374706074595451,
+ "routers_loss": 0.009320619516074657,
"skip_count": 1.0,
"step": 1646,
"text_loss": 0.46650025248527527
@@ -15654,13 +15654,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1162109375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009688205599687099,
- "loss": 0.0218,
+ "loss": 0.0209,
"macro_f1": 0.3272727429866791,
"num_tokens": 2660951.0,
"repeat_count": 0.0,
- "routers_loss": 0.01204691268503666,
+ "routers_loss": 0.011913162656128407,
"skip_count": 0.0,
"step": 1648,
"text_loss": 0.46644100546836853
@@ -15673,13 +15673,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009687128807071347,
"loss": 0.0284,
"macro_f1": 0.3333333432674408,
"num_tokens": 2663823.0,
"repeat_count": 0.0,
- "routers_loss": 0.01376053225249052,
+ "routers_loss": 0.013754756189882755,
"skip_count": 0.0,
"step": 1650,
"text_loss": 0.40808847546577454
@@ -15692,13 +15692,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.103515625,
"learning_rate": 0.0009686050218338996,
- "loss": 0.0285,
+ "loss": 0.0286,
"macro_f1": 0.3333333432674408,
"num_tokens": 2667079.0,
"repeat_count": 0.0,
- "routers_loss": 0.009346984326839447,
+ "routers_loss": 0.009099726565182209,
"skip_count": 0.0,
"step": 1652,
"text_loss": 0.2389989197254181
@@ -15711,13 +15711,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.08837890625,
"learning_rate": 0.0009684969833903359,
- "loss": 0.0291,
+ "loss": 0.0283,
"macro_f1": 0.6666666865348816,
"num_tokens": 2670162.0,
"repeat_count": 0.0,
- "routers_loss": 0.002724624238908291,
+ "routers_loss": 0.0034928603563457727,
"skip_count": 1.0,
"step": 1654,
"text_loss": 0.6930749416351318
@@ -15730,13 +15730,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009683887654178445,
- "loss": 0.0271,
+ "loss": 0.0261,
"macro_f1": 0.6666666865348816,
"num_tokens": 2673031.0,
"repeat_count": 0.0,
- "routers_loss": 0.00823777075856924,
+ "routers_loss": 0.008340462110936642,
"skip_count": 1.0,
"step": 1656,
"text_loss": 0.277752548456192
@@ -15749,32 +15749,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07373046875,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009682803679578947,
- "loss": 0.0262,
+ "loss": 0.0259,
"macro_f1": 0.3333333432674408,
"num_tokens": 2676092.0,
"repeat_count": 0.0,
- "routers_loss": 0.004393119364976883,
+ "routers_loss": 0.004337446764111519,
"skip_count": 0.0,
"step": 1658,
"text_loss": 0.5176776051521301
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 7.7936601115350745,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.1513671875,
+ "f1_skip": 0.0,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009681717910520244,
- "loss": 0.024,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0242,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 2679479.0,
"repeat_count": 0.0,
- "routers_loss": 0.031827569007873535,
+ "routers_loss": 0.034611742943525314,
"skip_count": 2.0,
"step": 1660,
"text_loss": 0.21485982835292816
@@ -15789,11 +15789,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.07958984375,
"learning_rate": 0.0009680630347418406,
- "loss": 0.0216,
+ "loss": 0.022,
"macro_f1": 0.5492662787437439,
"num_tokens": 2683289.0,
"repeat_count": 0.0,
- "routers_loss": 0.03329647704958916,
+ "routers_loss": 0.03297121450304985,
"skip_count": 2.0,
"step": 1662,
"text_loss": 0.33801013231277466
@@ -15806,13 +15806,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.000967954099069019,
- "loss": 0.0415,
+ "loss": 0.0411,
"macro_f1": 0.32098764181137085,
"num_tokens": 2685879.0,
"repeat_count": 1.0,
- "routers_loss": 0.047317031770944595,
+ "routers_loss": 0.04551183059811592,
"skip_count": 1.0,
"step": 1664,
"text_loss": 0.41123488545417786
@@ -15827,11 +15827,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1240234375,
"learning_rate": 0.0009678449840753038,
- "loss": 0.0325,
+ "loss": 0.0324,
"macro_f1": 0.32098764181137085,
"num_tokens": 2688910.0,
"repeat_count": 0.0,
- "routers_loss": 0.05649980902671814,
+ "routers_loss": 0.05866450071334839,
"skip_count": 2.0,
"step": 1666,
"text_loss": 0.1740892380475998
@@ -15844,13 +15844,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009677356898025082,
- "loss": 0.0229,
+ "loss": 0.023,
"macro_f1": 0.3333333432674408,
"num_tokens": 2691680.0,
"repeat_count": 0.0,
- "routers_loss": 0.01004624180495739,
+ "routers_loss": 0.009243223816156387,
"skip_count": 0.0,
"step": 1668,
"text_loss": 0.2512350380420685
@@ -15863,13 +15863,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.09619140625,
"learning_rate": 0.000967626216292514,
- "loss": 0.0194,
+ "loss": 0.0195,
"macro_f1": 0.3333333432674408,
"num_tokens": 2694895.0,
"repeat_count": 0.0,
- "routers_loss": 0.0054973396472632885,
+ "routers_loss": 0.005576452240347862,
"skip_count": 0.0,
"step": 1670,
"text_loss": 0.43294376134872437
@@ -15882,13 +15882,13 @@
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.09130859375,
"learning_rate": 0.0009675165635872715,
- "loss": 0.031,
+ "loss": 0.0306,
"macro_f1": 0.44705885648727417,
"num_tokens": 2697806.0,
"repeat_count": 0.0,
- "routers_loss": 0.05615650862455368,
+ "routers_loss": 0.05372785031795502,
"skip_count": 3.0,
"step": 1672,
"text_loss": 0.1614082306623459
@@ -15901,13 +15901,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009674067317288,
- "loss": 0.0301,
+ "loss": 0.0296,
"macro_f1": 0.6666666865348816,
"num_tokens": 2700529.0,
"repeat_count": 1.0,
- "routers_loss": 0.012819192372262478,
+ "routers_loss": 0.018131591379642487,
"skip_count": 0.0,
"step": 1674,
"text_loss": 0.2093173861503601
@@ -15920,13 +15920,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.08203125,
"learning_rate": 0.0009672967207591869,
- "loss": 0.0253,
+ "loss": 0.0257,
"macro_f1": 0.3272727429866791,
"num_tokens": 2703650.0,
"repeat_count": 0.0,
- "routers_loss": 0.07059332728385925,
+ "routers_loss": 0.0673515796661377,
"skip_count": 1.0,
"step": 1676,
"text_loss": 0.3029400110244751
@@ -15939,13 +15939,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009671865307205892,
- "loss": 0.0198,
+ "loss": 0.021,
"macro_f1": 0.32098767161369324,
"num_tokens": 2707615.0,
"repeat_count": 0.0,
- "routers_loss": 0.029778441414237022,
+ "routers_loss": 0.03821169584989548,
"skip_count": 1.0,
"step": 1678,
"text_loss": 0.2262786477804184
@@ -15958,13 +15958,13 @@
"f1_execute": 0.9756097793579102,
"f1_repeat": 1.0,
"f1_skip": 0.9090909361839294,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0009670761616552315,
- "loss": 0.0474,
+ "loss": 0.0465,
"macro_f1": 0.9615669250488281,
"num_tokens": 2710894.0,
"repeat_count": 2.0,
- "routers_loss": 0.04371272772550583,
+ "routers_loss": 0.042625464498996735,
"skip_count": 6.0,
"step": 1680,
"text_loss": 0.29623574018478394
@@ -15977,13 +15977,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009669656136054074,
- "loss": 0.0293,
+ "loss": 0.0289,
"macro_f1": 0.3333333432674408,
"num_tokens": 2714330.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033591394312679768,
+ "routers_loss": 0.0037571541033685207,
"skip_count": 0.0,
"step": 1682,
"text_loss": 0.7510389089584351
@@ -15996,13 +15996,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.07421875,
"learning_rate": 0.0009668548866134795,
- "loss": 0.0259,
+ "loss": 0.0256,
"macro_f1": 0.3333333432674408,
"num_tokens": 2717176.0,
"repeat_count": 0.0,
- "routers_loss": 0.005085585173219442,
+ "routers_loss": 0.004142968449741602,
"skip_count": 0.0,
"step": 1684,
"text_loss": 0.3273485600948334
@@ -16015,13 +16015,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0712890625,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009667439807218783,
- "loss": 0.0243,
+ "loss": 0.0233,
"macro_f1": 0.6666666865348816,
"num_tokens": 2720628.0,
"repeat_count": 0.0,
- "routers_loss": 0.008569681085646152,
+ "routers_loss": 0.008753842674195766,
"skip_count": 2.0,
"step": 1686,
"text_loss": 0.4314708709716797
@@ -16034,32 +16034,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009666328959731033,
- "loss": 0.022,
+ "loss": 0.0211,
"macro_f1": 0.6603773832321167,
"num_tokens": 2723739.0,
"repeat_count": 1.0,
- "routers_loss": 0.024587804451584816,
+ "routers_loss": 0.022674910724163055,
"skip_count": 1.0,
"step": 1688,
"text_loss": 0.25734150409698486
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.3333333432674408,
- "avg_layers": 27.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
"epoch": 7.934546521866745,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
- "f1_skip": 0.5,
- "grad_norm": 0.169921875,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009665216324097222,
- "loss": 0.0332,
- "macro_f1": 0.4871794879436493,
+ "loss": 0.0324,
+ "macro_f1": 0.5934640765190125,
"num_tokens": 2726644.0,
"repeat_count": 0.0,
- "routers_loss": 0.037516288459300995,
+ "routers_loss": 0.03932750225067139,
"skip_count": 3.0,
"step": 1690,
"text_loss": 0.24511034786701202
@@ -16072,13 +16072,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.09765625,
"learning_rate": 0.0009664101900743714,
- "loss": 0.0262,
+ "loss": 0.0255,
"macro_f1": 0.3272727429866791,
"num_tokens": 2729662.0,
"repeat_count": 0.0,
- "routers_loss": 0.01287431176751852,
+ "routers_loss": 0.012672754004597664,
"skip_count": 1.0,
"step": 1692,
"text_loss": 0.39431414008140564
@@ -16091,13 +16091,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.07763671875,
+ "grad_norm": 0.076171875,
"learning_rate": 0.000966298569009756,
- "loss": 0.0227,
+ "loss": 0.0231,
"macro_f1": 0.5492662787437439,
"num_tokens": 2732578.0,
"repeat_count": 0.0,
- "routers_loss": 0.015499880537390709,
+ "routers_loss": 0.01548632513731718,
"skip_count": 2.0,
"step": 1694,
"text_loss": 0.12439999729394913
@@ -16110,13 +16110,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009661867692586494,
- "loss": 0.0144,
+ "loss": 0.0153,
"macro_f1": 0.32098764181137085,
"num_tokens": 2735887.0,
"repeat_count": 0.0,
- "routers_loss": 0.049878787249326706,
+ "routers_loss": 0.05622401833534241,
"skip_count": 2.0,
"step": 1696,
"text_loss": 0.29024389386177063
@@ -16129,13 +16129,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.087890625,
"learning_rate": 0.0009660747908638933,
- "loss": 0.0206,
+ "loss": 0.0205,
"macro_f1": 0.3272727429866791,
"num_tokens": 2739293.0,
"repeat_count": 0.0,
- "routers_loss": 0.04108169302344322,
+ "routers_loss": 0.041060201823711395,
"skip_count": 1.0,
"step": 1698,
"text_loss": 0.39461007714271545
@@ -16148,13 +16148,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.1767578125,
"learning_rate": 0.0009659626338683981,
- "loss": 0.0367,
+ "loss": 0.0369,
"macro_f1": 0.3333333432674408,
"num_tokens": 2742468.0,
"repeat_count": 0.0,
- "routers_loss": 0.007651917636394501,
+ "routers_loss": 0.007251353468745947,
"skip_count": 0.0,
"step": 1700,
"text_loss": 0.2751767635345459
@@ -16167,13 +16167,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.07763671875,
"learning_rate": 0.0009658502983151427,
- "loss": 0.0182,
+ "loss": 0.0186,
"macro_f1": 0.3272727429866791,
"num_tokens": 2745123.0,
"repeat_count": 0.0,
- "routers_loss": 0.015448091551661491,
+ "routers_loss": 0.012847424484789371,
"skip_count": 1.0,
"step": 1702,
"text_loss": 0.4756404757499695
@@ -16186,13 +16186,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.11767578125,
"learning_rate": 0.0009657377842471742,
- "loss": 0.0324,
+ "loss": 0.0313,
"macro_f1": 0.6666666865348816,
"num_tokens": 2748016.0,
"repeat_count": 0.0,
- "routers_loss": 0.009139287285506725,
+ "routers_loss": 0.007060411386191845,
"skip_count": 1.0,
"step": 1704,
"text_loss": 0.9571210145950317
@@ -16205,13 +16205,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.10009765625,
"learning_rate": 0.0009656250917076081,
- "loss": 0.0191,
+ "loss": 0.0188,
"macro_f1": 0.5492662787437439,
"num_tokens": 2750717.0,
"repeat_count": 0.0,
- "routers_loss": 0.015412120148539543,
+ "routers_loss": 0.016748681664466858,
"skip_count": 2.0,
"step": 1706,
"text_loss": 0.14542843401432037
@@ -16224,13 +16224,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.060302734375,
"learning_rate": 0.0009655122207396285,
- "loss": 0.0175,
+ "loss": 0.017,
"macro_f1": 0.3333333432674408,
"num_tokens": 2753635.0,
"repeat_count": 0.0,
- "routers_loss": 0.012735052965581417,
+ "routers_loss": 0.013607042841613293,
"skip_count": 0.0,
"step": 1708,
"text_loss": 0.21836471557617188
@@ -16243,13 +16243,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07177734375,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009653991713864878,
- "loss": 0.0192,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2756643.0,
"repeat_count": 0.0,
- "routers_loss": 0.00114025070797652,
+ "routers_loss": 0.0012097888393327594,
"skip_count": 0.0,
"step": 1710,
"text_loss": 0.635187029838562
@@ -16262,13 +16262,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1171875,
"learning_rate": 0.0009652859436915066,
- "loss": 0.0243,
+ "loss": 0.0231,
"macro_f1": 0.3333333432674408,
"num_tokens": 2759432.0,
"repeat_count": 0.0,
- "routers_loss": 0.006401443853974342,
+ "routers_loss": 0.006196760106831789,
"skip_count": 0.0,
"step": 1712,
"text_loss": 0.5629420876502991
@@ -16281,13 +16281,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0009651725376980743,
- "loss": 0.0185,
+ "loss": 0.0177,
"macro_f1": 0.3333333432674408,
"num_tokens": 2762538.0,
"repeat_count": 0.0,
- "routers_loss": 0.004316259175539017,
+ "routers_loss": 0.0042513771913945675,
"skip_count": 0.0,
"step": 1714,
"text_loss": 0.39522525668144226
@@ -16300,13 +16300,13 @@
"f1_execute": 0.9583333134651184,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.125,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009650589534496479,
- "loss": 0.0201,
+ "loss": 0.0194,
"macro_f1": 0.8194444179534912,
"num_tokens": 2765571.0,
"repeat_count": 2.0,
- "routers_loss": 0.043461959809064865,
+ "routers_loss": 0.03596706688404083,
"skip_count": 3.0,
"step": 1716,
"text_loss": 0.6252416968345642
@@ -16319,13 +16319,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.04833984375,
"learning_rate": 0.0009649451909897532,
"loss": 0.0178,
"macro_f1": 0.3333333432674408,
"num_tokens": 2769206.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024530428927391768,
+ "routers_loss": 0.0025788163766264915,
"skip_count": 0.0,
"step": 1718,
"text_loss": 0.8851634860038757
@@ -16338,13 +16338,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.0009648312503619843,
- "loss": 0.026,
+ "loss": 0.0265,
"macro_f1": 0.3333333432674408,
"num_tokens": 2772488.0,
"repeat_count": 0.0,
- "routers_loss": 0.0046626063995063305,
+ "routers_loss": 0.004443451762199402,
"skip_count": 0.0,
"step": 1720,
"text_loss": 0.8568580746650696
@@ -16357,13 +16357,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009647171316100034,
- "loss": 0.0257,
+ "loss": 0.0265,
"macro_f1": 0.9265305995941162,
"num_tokens": 2776482.0,
"repeat_count": 1.0,
- "routers_loss": 0.02480102889239788,
+ "routers_loss": 0.022948263213038445,
"skip_count": 3.0,
"step": 1722,
"text_loss": 0.13431036472320557
@@ -16376,13 +16376,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.0009646028347775409,
- "loss": 0.02,
+ "loss": 0.0204,
"macro_f1": 0.6666666865348816,
"num_tokens": 2778966.0,
"repeat_count": 0.0,
- "routers_loss": 0.012629947625100613,
+ "routers_loss": 0.011328035034239292,
"skip_count": 1.0,
"step": 1724,
"text_loss": 0.2085491120815277
@@ -16395,13 +16395,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08447265625,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009644883599083958,
"loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 2781968.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024127380456775427,
+ "routers_loss": 0.002208018908277154,
"skip_count": 0.0,
"step": 1726,
"text_loss": 0.4948323965072632
@@ -16414,13 +16414,13 @@
"f1_execute": 0.9411764740943909,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.054443359375,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009643737070464349,
- "loss": 0.0162,
+ "loss": 0.0158,
"macro_f1": 0.6470588445663452,
"num_tokens": 2784666.0,
"repeat_count": 1.0,
- "routers_loss": 0.0415453165769577,
+ "routers_loss": 0.04391832649707794,
"skip_count": 2.0,
"step": 1728,
"text_loss": 0.39060094952583313
@@ -16433,13 +16433,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.046630859375,
"learning_rate": 0.0009642588762355935,
- "loss": 0.0211,
+ "loss": 0.0212,
"macro_f1": 0.6666666865348816,
"num_tokens": 2787558.0,
"repeat_count": 0.0,
- "routers_loss": 0.0056681083515286446,
+ "routers_loss": 0.004497280344367027,
"skip_count": 1.0,
"step": 1730,
"text_loss": 0.34908708930015564
@@ -16452,13 +16452,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009641438675198748,
- "loss": 0.0189,
+ "loss": 0.0175,
"macro_f1": 0.3333333432674408,
"num_tokens": 2790474.0,
"repeat_count": 0.0,
- "routers_loss": 0.006391602102667093,
+ "routers_loss": 0.00583475548774004,
"skip_count": 0.0,
"step": 1732,
"text_loss": 0.5720033049583435
@@ -16471,13 +16471,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0595703125,
+ "grad_norm": 0.08154296875,
"learning_rate": 0.0009640286809433508,
- "loss": 0.0229,
+ "loss": 0.0235,
"macro_f1": 0.3333333432674408,
"num_tokens": 2793272.0,
"repeat_count": 0.0,
- "routers_loss": 0.007466991897672415,
+ "routers_loss": 0.007826375775039196,
"skip_count": 0.0,
"step": 1734,
"text_loss": 0.32181721925735474
@@ -16490,13 +16490,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0009639133165501606,
- "loss": 0.0197,
+ "loss": 0.0192,
"macro_f1": 0.3333333432674408,
"num_tokens": 2797726.0,
"repeat_count": 0.0,
- "routers_loss": 0.001953453291207552,
+ "routers_loss": 0.0019055595621466637,
"skip_count": 0.0,
"step": 1736,
"text_loss": 0.620936393737793
@@ -16509,13 +16509,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009637977743845124,
- "loss": 0.0223,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2800706.0,
"repeat_count": 0.0,
- "routers_loss": 0.003612719476222992,
+ "routers_loss": 0.0028302327264100313,
"skip_count": 0.0,
"step": 1738,
"text_loss": 0.6473138332366943
@@ -16528,13 +16528,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.049072265625,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009636820544906823,
- "loss": 0.0145,
+ "loss": 0.0146,
"macro_f1": 1.0,
"num_tokens": 2803847.0,
"repeat_count": 1.0,
- "routers_loss": 0.009977150708436966,
+ "routers_loss": 0.01105099730193615,
"skip_count": 2.0,
"step": 1740,
"text_loss": 0.4401201903820038
@@ -16547,13 +16547,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009635661569130141,
"loss": 0.0195,
"macro_f1": 0.5934640765190125,
"num_tokens": 2807235.0,
"repeat_count": 0.0,
- "routers_loss": 0.026468059048056602,
+ "routers_loss": 0.02619045600295067,
"skip_count": 3.0,
"step": 1742,
"text_loss": 0.459264874458313
@@ -16566,13 +16566,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0009634500816959202,
- "loss": 0.0165,
+ "loss": 0.0162,
"macro_f1": 0.6666666865348816,
"num_tokens": 2810396.0,
"repeat_count": 0.0,
- "routers_loss": 0.00849854201078415,
+ "routers_loss": 0.007915694266557693,
"skip_count": 2.0,
"step": 1744,
"text_loss": 0.5084020495414734
@@ -16585,13 +16585,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009633338288838805,
- "loss": 0.0275,
+ "loss": 0.0271,
"macro_f1": 0.5492662787437439,
"num_tokens": 2813215.0,
"repeat_count": 2.0,
- "routers_loss": 0.08082596957683563,
+ "routers_loss": 0.08364596217870712,
"skip_count": 0.0,
"step": 1746,
"text_loss": 0.27681824564933777
@@ -16604,13 +16604,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.046142578125,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009632173985214438,
- "loss": 0.015,
+ "loss": 0.0156,
"macro_f1": 0.8817967176437378,
"num_tokens": 2816452.0,
"repeat_count": 3.0,
- "routers_loss": 0.029500717297196388,
+ "routers_loss": 0.028805451467633247,
"skip_count": 2.0,
"step": 1748,
"text_loss": 0.4678419530391693
@@ -16623,13 +16623,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.0625,
"learning_rate": 0.000963100790653226,
- "loss": 0.0183,
+ "loss": 0.0188,
"macro_f1": 0.3272727429866791,
"num_tokens": 2819364.0,
"repeat_count": 0.0,
- "routers_loss": 0.025238536298274994,
+ "routers_loss": 0.03056817688047886,
"skip_count": 1.0,
"step": 1750,
"text_loss": 0.3078109920024872
@@ -16642,13 +16642,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0703125,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009629840053239116,
- "loss": 0.0204,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2823469.0,
"repeat_count": 0.0,
- "routers_loss": 0.002069319598376751,
+ "routers_loss": 0.0019477814203128219,
"skip_count": 0.0,
"step": 1752,
"text_loss": 0.45501336455345154
@@ -16661,13 +16661,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.057373046875,
"learning_rate": 0.000962867042578253,
- "loss": 0.0169,
+ "loss": 0.0173,
"macro_f1": 0.3333333432674408,
"num_tokens": 2826716.0,
"repeat_count": 0.0,
- "routers_loss": 0.002853946527466178,
+ "routers_loss": 0.0032963966950774193,
"skip_count": 0.0,
"step": 1754,
"text_loss": 0.49234694242477417
@@ -16680,13 +16680,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009627499024610707,
- "loss": 0.0236,
+ "loss": 0.0239,
"macro_f1": 0.3272727429866791,
"num_tokens": 2829733.0,
"repeat_count": 0.0,
- "routers_loss": 0.0100983502343297,
+ "routers_loss": 0.010289114899933338,
"skip_count": 1.0,
"step": 1756,
"text_loss": 0.22335539758205414
@@ -16699,13 +16699,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09228515625,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009626325850172527,
- "loss": 0.0173,
+ "loss": 0.0174,
"macro_f1": 0.3272727429866791,
"num_tokens": 2833350.0,
"repeat_count": 0.0,
- "routers_loss": 0.031218983232975006,
+ "routers_loss": 0.03249066323041916,
"skip_count": 1.0,
"step": 1758,
"text_loss": 0.6581931114196777
@@ -16718,13 +16718,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009625150902917555,
- "loss": 0.019,
+ "loss": 0.0185,
"macro_f1": 0.3333333432674408,
"num_tokens": 2836558.0,
"repeat_count": 0.0,
- "routers_loss": 0.010347879491746426,
+ "routers_loss": 0.00870000571012497,
"skip_count": 0.0,
"step": 1760,
"text_loss": 0.22938725352287292
@@ -16737,13 +16737,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1455078125,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009623974183296031,
- "loss": 0.0193,
+ "loss": 0.0192,
"macro_f1": 0.3333333432674408,
"num_tokens": 2840560.0,
"repeat_count": 0.0,
- "routers_loss": 0.007768871728330851,
+ "routers_loss": 0.007767196744680405,
"skip_count": 0.0,
"step": 1762,
"text_loss": 0.24473799765110016
@@ -16756,13 +16756,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009622795691758876,
- "loss": 0.0253,
+ "loss": 0.0244,
"macro_f1": 0.3333333432674408,
"num_tokens": 2843548.0,
"repeat_count": 0.0,
- "routers_loss": 0.002887974726036191,
+ "routers_loss": 0.0021693643648177385,
"skip_count": 0.0,
"step": 1764,
"text_loss": 0.3084608018398285
@@ -16777,11 +16777,11 @@
"f1_skip": 0.0,
"grad_norm": 0.0498046875,
"learning_rate": 0.0009621615428757693,
- "loss": 0.0147,
+ "loss": 0.0149,
"macro_f1": 0.3333333432674408,
"num_tokens": 2847076.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027294005267322063,
+ "routers_loss": 0.0024727333802729845,
"skip_count": 0.0,
"step": 1766,
"text_loss": 0.5251734852790833
@@ -16794,13 +16794,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.000962043339474476,
- "loss": 0.0193,
+ "loss": 0.0194,
"macro_f1": 0.3333333432674408,
"num_tokens": 2849751.0,
"repeat_count": 0.0,
- "routers_loss": 0.00543541694059968,
+ "routers_loss": 0.005174890160560608,
"skip_count": 0.0,
"step": 1768,
"text_loss": 0.4410129189491272
@@ -16813,13 +16813,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.06103515625,
"learning_rate": 0.0009619249590173032,
- "loss": 0.0167,
+ "loss": 0.016,
"macro_f1": 0.6666666865348816,
"num_tokens": 2853916.0,
"repeat_count": 0.0,
- "routers_loss": 0.006514009553939104,
+ "routers_loss": 0.006785830482840538,
"skip_count": 2.0,
"step": 1770,
"text_loss": 0.550076425075531
@@ -16832,13 +16832,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.06591796875,
"learning_rate": 0.0009618064015496149,
- "loss": 0.019,
+ "loss": 0.0192,
"macro_f1": 0.5934640765190125,
"num_tokens": 2857372.0,
"repeat_count": 0.0,
- "routers_loss": 0.02333846502006054,
+ "routers_loss": 0.021370256319642067,
"skip_count": 3.0,
"step": 1772,
"text_loss": 0.1988629847764969
@@ -16851,13 +16851,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.072265625,
"learning_rate": 0.0009616876671168423,
- "loss": 0.0165,
+ "loss": 0.0162,
"macro_f1": 0.6666666865348816,
"num_tokens": 2861028.0,
"repeat_count": 0.0,
- "routers_loss": 0.004471905063837767,
+ "routers_loss": 0.004313841462135315,
"skip_count": 1.0,
"step": 1774,
"text_loss": 0.42581331729888916
@@ -16870,13 +16870,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009615687557644847,
- "loss": 0.0261,
+ "loss": 0.0268,
"macro_f1": 0.3333333432674408,
"num_tokens": 2864847.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024362702388316393,
+ "routers_loss": 0.0025742491707205772,
"skip_count": 0.0,
"step": 1776,
"text_loss": 0.46510905027389526
@@ -16889,13 +16889,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009614496675381093,
- "loss": 0.0116,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 2867392.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021166049409657717,
+ "routers_loss": 0.0016813480760902166,
"skip_count": 0.0,
"step": 1778,
"text_loss": 0.5922174453735352
@@ -16908,13 +16908,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0712890625,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.0009613304024833507,
"loss": 0.0166,
"macro_f1": 0.3333333432674408,
"num_tokens": 2871273.0,
"repeat_count": 0.0,
- "routers_loss": 0.004722296260297298,
+ "routers_loss": 0.004948933608829975,
"skip_count": 0.0,
"step": 1780,
"text_loss": 0.6776977777481079
@@ -16929,11 +16929,11 @@
"f1_skip": 1.0,
"grad_norm": 0.07470703125,
"learning_rate": 0.0009612109606459117,
- "loss": 0.0199,
+ "loss": 0.0186,
"macro_f1": 1.0,
"num_tokens": 2874172.0,
"repeat_count": 1.0,
- "routers_loss": 0.014188882894814014,
+ "routers_loss": 0.016950147226452827,
"skip_count": 2.0,
"step": 1782,
"text_loss": 0.48758944869041443
@@ -16946,13 +16946,13 @@
"f1_execute": 0.9599999785423279,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.076171875,
+ "grad_norm": 0.08251953125,
"learning_rate": 0.0009610913420715623,
- "loss": 0.0241,
+ "loss": 0.0237,
"macro_f1": 0.7644444704055786,
"num_tokens": 2877528.0,
"repeat_count": 2.0,
- "routers_loss": 0.04599560424685478,
+ "routers_loss": 0.04880943149328232,
"skip_count": 1.0,
"step": 1784,
"text_loss": 0.4404778480529785
@@ -16965,13 +16965,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.06201171875,
"learning_rate": 0.0009609715468061411,
- "loss": 0.0216,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2880627.0,
"repeat_count": 0.0,
- "routers_loss": 0.004942454397678375,
+ "routers_loss": 0.004678630735725164,
"skip_count": 0.0,
"step": 1786,
"text_loss": 0.7295402884483337
@@ -16984,13 +16984,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08349609375,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0009608515748955535,
- "loss": 0.021,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2883333.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020542226266115904,
+ "routers_loss": 0.0026695074047893286,
"skip_count": 0.0,
"step": 1788,
"text_loss": 0.9697831273078918
@@ -17003,13 +17003,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.1171875,
+ "grad_norm": 0.107421875,
"learning_rate": 0.000960731426385773,
- "loss": 0.0155,
+ "loss": 0.0157,
"macro_f1": 0.4871794879436493,
"num_tokens": 2887444.0,
"repeat_count": 0.0,
- "routers_loss": 0.0397041030228138,
+ "routers_loss": 0.029743613675236702,
"skip_count": 2.0,
"step": 1790,
"text_loss": 0.4737568199634552
@@ -17022,13 +17022,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.10107421875,
"learning_rate": 0.0009606111013228407,
- "loss": 0.0204,
+ "loss": 0.0207,
"macro_f1": 0.3333333432674408,
"num_tokens": 2890221.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017490010941401124,
+ "routers_loss": 0.0016153788892552257,
"skip_count": 0.0,
"step": 1792,
"text_loss": 0.6693558096885681
@@ -17041,13 +17041,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08251953125,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009604905997528655,
- "loss": 0.021,
+ "loss": 0.02,
"macro_f1": 0.3272727429866791,
"num_tokens": 2893262.0,
"repeat_count": 0.0,
- "routers_loss": 0.023590171709656715,
+ "routers_loss": 0.01965433731675148,
"skip_count": 1.0,
"step": 1794,
"text_loss": 0.45227760076522827
@@ -17060,13 +17060,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.08642578125,
"learning_rate": 0.0009603699217220239,
- "loss": 0.0125,
+ "loss": 0.0117,
"macro_f1": 0.6601307392120361,
"num_tokens": 2896823.0,
"repeat_count": 1.0,
- "routers_loss": 0.02458076737821102,
+ "routers_loss": 0.024017298594117165,
"skip_count": 2.0,
"step": 1796,
"text_loss": 0.48865509033203125
@@ -17079,13 +17079,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.08837890625,
"learning_rate": 0.0009602490672765597,
- "loss": 0.019,
+ "loss": 0.0182,
"macro_f1": 0.3333333432674408,
"num_tokens": 2899707.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014341498026624322,
+ "routers_loss": 0.0012420224957168102,
"skip_count": 0.0,
"step": 1798,
"text_loss": 0.43292415142059326
@@ -17098,13 +17098,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08056640625,
+ "grad_norm": 0.07861328125,
"learning_rate": 0.0009601280364627848,
- "loss": 0.02,
+ "loss": 0.0196,
"macro_f1": 0.3333333432674408,
"num_tokens": 2902795.0,
"repeat_count": 0.0,
- "routers_loss": 0.00213223067112267,
+ "routers_loss": 0.0020389219280332327,
"skip_count": 0.0,
"step": 1800,
"text_loss": 0.41021591424942017
@@ -17117,13 +17117,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07275390625,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009600068293270783,
- "loss": 0.0147,
+ "loss": 0.0142,
"macro_f1": 0.3333333432674408,
"num_tokens": 2905769.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027340995147824287,
+ "routers_loss": 0.002006303984671831,
"skip_count": 0.0,
"step": 1802,
"text_loss": 0.46892106533050537
@@ -17136,32 +17136,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.000959885445915887,
- "loss": 0.0172,
+ "loss": 0.017,
"macro_f1": 0.3333333432674408,
"num_tokens": 2909475.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035587961319833994,
+ "routers_loss": 0.003734810510650277,
"skip_count": 0.0,
"step": 1804,
"text_loss": 0.45364710688591003
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.5,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 8.479013795127678,
- "f1_execute": 0.9615384340286255,
- "f1_repeat": 0.0,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009597638862757254,
- "loss": 0.0187,
- "macro_f1": 0.5427350401878357,
+ "loss": 0.0182,
+ "macro_f1": 0.8823530077934265,
"num_tokens": 2914348.0,
"repeat_count": 1.0,
- "routers_loss": 0.04446055367588997,
+ "routers_loss": 0.038971323519945145,
"skip_count": 2.0,
"step": 1806,
"text_loss": 0.42913779616355896
@@ -17174,13 +17174,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08447265625,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009596421504531751,
- "loss": 0.0244,
+ "loss": 0.0249,
"macro_f1": 0.3272727429866791,
"num_tokens": 2917467.0,
"repeat_count": 1.0,
- "routers_loss": 0.05095123499631882,
+ "routers_loss": 0.04800829663872719,
"skip_count": 0.0,
"step": 1808,
"text_loss": 0.17332297563552856
@@ -17193,13 +17193,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009595202384948858,
- "loss": 0.0232,
+ "loss": 0.0227,
"macro_f1": 0.6666666865348816,
"num_tokens": 2920223.0,
"repeat_count": 1.0,
- "routers_loss": 0.008440068922936916,
+ "routers_loss": 0.009164143353700638,
"skip_count": 0.0,
"step": 1810,
"text_loss": 0.33740702271461487
@@ -17212,13 +17212,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0927734375,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009593981504475742,
- "loss": 0.0273,
+ "loss": 0.0275,
"macro_f1": 0.6666666865348816,
"num_tokens": 2923780.0,
"repeat_count": 0.0,
- "routers_loss": 0.012230116873979568,
+ "routers_loss": 0.011236993595957756,
"skip_count": 2.0,
"step": 1812,
"text_loss": 0.1609916388988495
@@ -17231,13 +17231,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009592758863580248,
- "loss": 0.026,
+ "loss": 0.0259,
"macro_f1": 0.5492662787437439,
"num_tokens": 2926259.0,
"repeat_count": 0.0,
- "routers_loss": 0.017307188361883163,
+ "routers_loss": 0.019026532769203186,
"skip_count": 2.0,
"step": 1814,
"text_loss": 0.6460903882980347
@@ -17250,13 +17250,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009591534462730894,
- "loss": 0.0215,
+ "loss": 0.0206,
"macro_f1": 0.5492662787437439,
"num_tokens": 2929173.0,
"repeat_count": 2.0,
- "routers_loss": 0.07191162556409836,
+ "routers_loss": 0.0608333982527256,
"skip_count": 0.0,
"step": 1816,
"text_loss": 0.476126492023468
@@ -17269,13 +17269,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.06640625,
"learning_rate": 0.000959030830239687,
- "loss": 0.0182,
+ "loss": 0.0175,
"macro_f1": 0.3333333432674408,
"num_tokens": 2932703.0,
"repeat_count": 0.0,
- "routers_loss": 0.008753604255616665,
+ "routers_loss": 0.0093300249427557,
"skip_count": 0.0,
"step": 1818,
"text_loss": 0.5471875667572021
@@ -17288,13 +17288,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19921875,
+ "grad_norm": 0.2001953125,
"learning_rate": 0.0009589080383048048,
- "loss": 0.0233,
+ "loss": 0.0235,
"macro_f1": 0.3333333432674408,
"num_tokens": 2936195.0,
"repeat_count": 0.0,
- "routers_loss": 0.008390828967094421,
+ "routers_loss": 0.010434109717607498,
"skip_count": 0.0,
"step": 1820,
"text_loss": 0.5068115592002869
@@ -17307,13 +17307,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009587850705154964,
"loss": 0.0291,
"macro_f1": 0.3333333432674408,
"num_tokens": 2939412.0,
"repeat_count": 0.0,
- "routers_loss": 0.005617359187453985,
+ "routers_loss": 0.004347751382738352,
"skip_count": 0.0,
"step": 1822,
"text_loss": 0.4241984784603119
@@ -17326,13 +17326,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.0859375,
"learning_rate": 0.0009586619269188836,
- "loss": 0.0227,
+ "loss": 0.0224,
"macro_f1": 0.32098767161369324,
"num_tokens": 2942318.0,
"repeat_count": 0.0,
- "routers_loss": 0.0346846878528595,
+ "routers_loss": 0.034238871186971664,
"skip_count": 1.0,
"step": 1824,
"text_loss": 0.2328975349664688
@@ -17345,32 +17345,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009585386075621553,
"loss": 0.027,
"macro_f1": 0.3333333432674408,
"num_tokens": 2945731.0,
"repeat_count": 0.0,
- "routers_loss": 0.006601692643016577,
+ "routers_loss": 0.006097695790231228,
"skip_count": 0.0,
"step": 1826,
"text_loss": 0.22816994786262512
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 8.582330496037569,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.08837890625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009584151124925676,
- "loss": 0.0207,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0208,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2948944.0,
"repeat_count": 0.0,
- "routers_loss": 0.0065619745291769505,
+ "routers_loss": 0.007790776435285807,
"skip_count": 1.0,
"step": 1828,
"text_loss": 0.5009413361549377
@@ -17383,13 +17383,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009582914417574438,
- "loss": 0.0149,
+ "loss": 0.0145,
"macro_f1": 0.6666666865348816,
"num_tokens": 2951723.0,
"repeat_count": 0.0,
- "routers_loss": 0.011109639890491962,
+ "routers_loss": 0.009144559502601624,
"skip_count": 2.0,
"step": 1830,
"text_loss": 0.1402502954006195
@@ -17404,11 +17404,11 @@
"f1_skip": 0.0,
"grad_norm": 0.06201171875,
"learning_rate": 0.0009581675954041751,
- "loss": 0.0167,
+ "loss": 0.0166,
"macro_f1": 0.6666666865348816,
"num_tokens": 2954726.0,
"repeat_count": 1.0,
- "routers_loss": 0.008432094007730484,
+ "routers_loss": 0.006593191530555487,
"skip_count": 0.0,
"step": 1832,
"text_loss": 0.4871736466884613
@@ -17421,13 +17421,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0859375,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0009580435734802196,
- "loss": 0.0208,
+ "loss": 0.0206,
"macro_f1": 0.3333333432674408,
"num_tokens": 2957853.0,
"repeat_count": 0.0,
- "routers_loss": 0.011518111452460289,
+ "routers_loss": 0.01241068821400404,
"skip_count": 0.0,
"step": 1834,
"text_loss": 0.30100154876708984
@@ -17440,13 +17440,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009579193760331027,
- "loss": 0.0211,
+ "loss": 0.022,
"macro_f1": 0.3333333432674408,
"num_tokens": 2960783.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026744187343865633,
+ "routers_loss": 0.002219218760728836,
"skip_count": 0.0,
"step": 1836,
"text_loss": 0.4961516559123993
@@ -17459,13 +17459,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009577950031104169,
- "loss": 0.0165,
+ "loss": 0.0166,
"macro_f1": 0.6601307392120361,
"num_tokens": 2963328.0,
"repeat_count": 1.0,
- "routers_loss": 0.028107430785894394,
+ "routers_loss": 0.029363535344600677,
"skip_count": 2.0,
"step": 1838,
"text_loss": 0.42814353108406067
@@ -17478,13 +17478,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 1.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009576704547598226,
- "loss": 0.0263,
+ "loss": 0.0257,
"macro_f1": 0.7795917987823486,
"num_tokens": 2966108.0,
"repeat_count": 1.0,
- "routers_loss": 0.060007549822330475,
+ "routers_loss": 0.0579402856528759,
"skip_count": 4.0,
"step": 1840,
"text_loss": 0.20523512363433838
@@ -17497,13 +17497,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009575457310290463,
"loss": 0.0121,
"macro_f1": 0.3272727429866791,
"num_tokens": 2969137.0,
"repeat_count": 0.0,
- "routers_loss": 0.01074182614684105,
+ "routers_loss": 0.008810589089989662,
"skip_count": 0.0,
"step": 1842,
"text_loss": 0.6199528574943542
@@ -17516,13 +17516,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009574208319658831,
- "loss": 0.0213,
+ "loss": 0.0208,
"macro_f1": 0.6666666865348816,
"num_tokens": 2972407.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019638657104223967,
+ "routers_loss": 0.0012295129708945751,
"skip_count": 1.0,
"step": 1844,
"text_loss": 0.66938316822052
@@ -17535,13 +17535,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.000957295757618194,
- "loss": 0.0156,
+ "loss": 0.0152,
"macro_f1": 0.4871794879436493,
"num_tokens": 2976045.0,
"repeat_count": 0.0,
- "routers_loss": 0.06953249871730804,
+ "routers_loss": 0.06162935495376587,
"skip_count": 2.0,
"step": 1846,
"text_loss": 0.5381782650947571
@@ -17554,13 +17554,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009571705080339079,
- "loss": 0.0154,
+ "loss": 0.0144,
"macro_f1": 0.3333333432674408,
"num_tokens": 2979025.0,
"repeat_count": 0.0,
- "routers_loss": 0.003563052974641323,
+ "routers_loss": 0.003950524143874645,
"skip_count": 0.0,
"step": 1848,
"text_loss": 0.5831671357154846
@@ -17573,13 +17573,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.11376953125,
"learning_rate": 0.0009570450832610208,
- "loss": 0.0216,
+ "loss": 0.0209,
"macro_f1": 0.3333333432674408,
"num_tokens": 2982276.0,
"repeat_count": 0.0,
- "routers_loss": 0.010409255512058735,
+ "routers_loss": 0.010354886762797832,
"skip_count": 0.0,
"step": 1850,
"text_loss": 0.27448201179504395
@@ -17592,13 +17592,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0009569194833475956,
- "loss": 0.0195,
+ "loss": 0.0199,
"macro_f1": 0.3272727429866791,
"num_tokens": 2985691.0,
"repeat_count": 0.0,
- "routers_loss": 0.009769548662006855,
+ "routers_loss": 0.010167439468204975,
"skip_count": 0.0,
"step": 1852,
"text_loss": 0.5264663696289062
@@ -17611,13 +17611,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1181640625,
+ "grad_norm": 0.1328125,
"learning_rate": 0.0009567937083417624,
- "loss": 0.0184,
+ "loss": 0.0194,
"macro_f1": 0.3272727429866791,
"num_tokens": 2989126.0,
"repeat_count": 0.0,
- "routers_loss": 0.036616452038288116,
+ "routers_loss": 0.0371871180832386,
"skip_count": 1.0,
"step": 1854,
"text_loss": 0.2008018046617508
@@ -17630,13 +17630,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.0009566677582917185,
- "loss": 0.0192,
+ "loss": 0.0184,
"macro_f1": 0.3333333432674408,
"num_tokens": 2992814.0,
"repeat_count": 0.0,
- "routers_loss": 0.009581349790096283,
+ "routers_loss": 0.010190588422119617,
"skip_count": 0.0,
"step": 1856,
"text_loss": 0.749717116355896
@@ -17649,13 +17649,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009565416332457282,
- "loss": 0.0138,
+ "loss": 0.0132,
"macro_f1": 0.6538461446762085,
"num_tokens": 2995729.0,
"repeat_count": 1.0,
- "routers_loss": 0.02330300398170948,
+ "routers_loss": 0.022285036742687225,
"skip_count": 1.0,
"step": 1858,
"text_loss": 0.5870219469070435
@@ -17668,13 +17668,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009564153332521228,
- "loss": 0.0226,
+ "loss": 0.0224,
"macro_f1": 0.3272727429866791,
"num_tokens": 2998812.0,
"repeat_count": 0.0,
- "routers_loss": 0.011985735036432743,
+ "routers_loss": 0.011050296947360039,
"skip_count": 1.0,
"step": 1860,
"text_loss": 0.8444408774375916
@@ -17687,13 +17687,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.06005859375,
"learning_rate": 0.0009562888583593005,
- "loss": 0.0162,
+ "loss": 0.0163,
"macro_f1": 0.3333333432674408,
"num_tokens": 3001799.0,
"repeat_count": 0.0,
- "routers_loss": 0.005997250322252512,
+ "routers_loss": 0.007125461008399725,
"skip_count": 0.0,
"step": 1862,
"text_loss": 0.41510361433029175
@@ -17706,13 +17706,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009561622086157272,
- "loss": 0.0243,
+ "loss": 0.0236,
"macro_f1": 0.3333333432674408,
"num_tokens": 3005088.0,
"repeat_count": 0.0,
- "routers_loss": 0.004814761225134134,
+ "routers_loss": 0.0049054501578211784,
"skip_count": 0.0,
"step": 1864,
"text_loss": 0.3801248073577881
@@ -17725,13 +17725,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.056884765625,
+ "grad_norm": 0.054443359375,
"learning_rate": 0.000956035384069935,
- "loss": 0.0242,
+ "loss": 0.0238,
"macro_f1": 1.0,
"num_tokens": 3008178.0,
"repeat_count": 1.0,
- "routers_loss": 0.004750931169837713,
+ "routers_loss": 0.005162427201867104,
"skip_count": 1.0,
"step": 1866,
"text_loss": 0.2687684893608093
@@ -17744,13 +17744,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.10400390625,
"learning_rate": 0.0009559083847705233,
- "loss": 0.0216,
+ "loss": 0.0214,
"macro_f1": 0.3272727429866791,
"num_tokens": 3010923.0,
"repeat_count": 0.0,
- "routers_loss": 0.038251202553510666,
+ "routers_loss": 0.028984658420085907,
"skip_count": 1.0,
"step": 1868,
"text_loss": 0.6277349591255188
@@ -17763,13 +17763,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009557812107661584,
- "loss": 0.0204,
+ "loss": 0.0208,
"macro_f1": 1.0,
"num_tokens": 3015030.0,
"repeat_count": 1.0,
- "routers_loss": 0.010951942764222622,
+ "routers_loss": 0.012200530618429184,
"skip_count": 1.0,
"step": 1870,
"text_loss": 0.6293368339538574
@@ -17782,13 +17782,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.11962890625,
"learning_rate": 0.0009556538621055739,
- "loss": 0.0265,
+ "loss": 0.0268,
"macro_f1": 0.3272727429866791,
"num_tokens": 3019067.0,
"repeat_count": 0.0,
- "routers_loss": 0.06582094728946686,
+ "routers_loss": 0.06365182995796204,
"skip_count": 1.0,
"step": 1872,
"text_loss": 0.39046618342399597
@@ -17796,18 +17796,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 8.798356325212797,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.12353515625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009555263388375699,
- "loss": 0.0143,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.014,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3022166.0,
"repeat_count": 0.0,
- "routers_loss": 0.008920271880924702,
+ "routers_loss": 0.0041703456081449986,
"skip_count": 1.0,
"step": 1874,
"text_loss": 0.42232340574264526
@@ -17820,13 +17820,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1220703125,
+ "grad_norm": 0.11572265625,
"learning_rate": 0.0009553986410110134,
"loss": 0.016,
"macro_f1": 0.3333333432674408,
"num_tokens": 3025865.0,
"repeat_count": 0.0,
- "routers_loss": 0.006444344762712717,
+ "routers_loss": 0.005841755773872137,
"skip_count": 0.0,
"step": 1876,
"text_loss": 0.37600573897361755
@@ -17839,13 +17839,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009552707686748388,
- "loss": 0.022,
+ "loss": 0.0219,
"macro_f1": 0.3272727429866791,
"num_tokens": 3029950.0,
"repeat_count": 0.0,
- "routers_loss": 0.05197767913341522,
+ "routers_loss": 0.05165952071547508,
"skip_count": 1.0,
"step": 1878,
"text_loss": 0.33717799186706543
@@ -17858,13 +17858,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009551427218780467,
- "loss": 0.0224,
+ "loss": 0.0219,
"macro_f1": 0.6666666865348816,
"num_tokens": 3033649.0,
"repeat_count": 0.0,
- "routers_loss": 0.017570581287145615,
+ "routers_loss": 0.020680008456110954,
"skip_count": 2.0,
"step": 1880,
"text_loss": 0.5011783838272095
@@ -17877,13 +17877,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.173828125,
+ "grad_norm": 0.15625,
"learning_rate": 0.0009550145006697048,
- "loss": 0.0225,
+ "loss": 0.0217,
"macro_f1": 0.32098764181137085,
"num_tokens": 3036847.0,
"repeat_count": 0.0,
- "routers_loss": 0.07106777280569077,
+ "routers_loss": 0.07626450061798096,
"skip_count": 2.0,
"step": 1882,
"text_loss": 0.3066408336162567
@@ -17896,13 +17896,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0009548861050989482,
- "loss": 0.0139,
+ "loss": 0.0136,
"macro_f1": 1.0,
"num_tokens": 3040353.0,
"repeat_count": 1.0,
- "routers_loss": 0.009862381964921951,
+ "routers_loss": 0.010884666815400124,
"skip_count": 1.0,
"step": 1884,
"text_loss": 0.49779415130615234
@@ -17915,13 +17915,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009547575352149778,
- "loss": 0.0209,
+ "loss": 0.0213,
"macro_f1": 0.6666666865348816,
"num_tokens": 3043504.0,
"repeat_count": 0.0,
- "routers_loss": 0.006928981747478247,
+ "routers_loss": 0.006704333238303661,
"skip_count": 2.0,
"step": 1886,
"text_loss": 0.12284614145755768
@@ -17934,13 +17934,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09423828125,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0009546287910670621,
"loss": 0.0211,
"macro_f1": 0.5427350401878357,
"num_tokens": 3046422.0,
"repeat_count": 1.0,
- "routers_loss": 0.04788029566407204,
+ "routers_loss": 0.04799000173807144,
"skip_count": 2.0,
"step": 1888,
"text_loss": 0.1824081838130951
@@ -17953,13 +17953,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009544998727045361,
- "loss": 0.0299,
+ "loss": 0.0306,
"macro_f1": 0.3333333432674408,
"num_tokens": 3049819.0,
"repeat_count": 0.0,
- "routers_loss": 0.008282946422696114,
+ "routers_loss": 0.008139612153172493,
"skip_count": 0.0,
"step": 1890,
"text_loss": 0.18929053843021393
@@ -17972,32 +17972,32 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.09375,
"learning_rate": 0.0009543707801768015,
- "loss": 0.0181,
+ "loss": 0.0175,
"macro_f1": 0.5934640765190125,
"num_tokens": 3052766.0,
"repeat_count": 0.0,
- "routers_loss": 0.03251546248793602,
+ "routers_loss": 0.02966771461069584,
"skip_count": 3.0,
"step": 1892,
"text_loss": 0.247748002409935
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 24.0,
+ "acc_skip": 0.5,
+ "avg_layers": 25.0,
"epoch": 8.892280598767243,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.06640625,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009542415135333267,
- "loss": 0.0195,
- "macro_f1": 0.542222261428833,
+ "loss": 0.0193,
+ "macro_f1": 0.44705885648727417,
"num_tokens": 3056427.0,
"repeat_count": 0.0,
- "routers_loss": 0.03368280455470085,
+ "routers_loss": 0.03637036308646202,
"skip_count": 2.0,
"step": 1894,
"text_loss": 0.2583999037742615
@@ -18010,13 +18010,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.0009541120728236472,
- "loss": 0.0133,
+ "loss": 0.0136,
"macro_f1": 0.3333333432674408,
"num_tokens": 3059497.0,
"repeat_count": 0.0,
- "routers_loss": 0.0069940583780407906,
+ "routers_loss": 0.007026574574410915,
"skip_count": 0.0,
"step": 1896,
"text_loss": 0.5222375988960266
@@ -18029,13 +18029,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0810546875,
+ "grad_norm": 0.076171875,
"learning_rate": 0.0009539824580973646,
- "loss": 0.0221,
+ "loss": 0.0219,
"macro_f1": 0.3333333432674408,
"num_tokens": 3062187.0,
"repeat_count": 0.0,
- "routers_loss": 0.004268508404493332,
+ "routers_loss": 0.003449335927143693,
"skip_count": 0.0,
"step": 1898,
"text_loss": 0.5736427307128906
@@ -18048,13 +18048,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0009538526694041477,
- "loss": 0.0159,
+ "loss": 0.0163,
"macro_f1": 0.3333333432674408,
"num_tokens": 3066100.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032616283278912306,
+ "routers_loss": 0.0035463871899992228,
"skip_count": 0.0,
"step": 1900,
"text_loss": 0.5471583604812622
@@ -18067,13 +18067,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.08056640625,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009537227067937318,
- "loss": 0.023,
+ "loss": 0.0233,
"macro_f1": 1.0,
"num_tokens": 3068737.0,
"repeat_count": 3.0,
- "routers_loss": 0.005389219615608454,
+ "routers_loss": 0.00597514258697629,
"skip_count": 3.0,
"step": 1902,
"text_loss": 0.36644190549850464
@@ -18086,13 +18086,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.166015625,
"learning_rate": 0.0009535925703159186,
- "loss": 0.0311,
+ "loss": 0.0301,
"macro_f1": 0.32098764181137085,
"num_tokens": 3071686.0,
"repeat_count": 0.0,
- "routers_loss": 0.024814991280436516,
+ "routers_loss": 0.025420479476451874,
"skip_count": 2.0,
"step": 1904,
"text_loss": 0.535789966583252
@@ -18107,11 +18107,11 @@
"f1_skip": 0.0,
"grad_norm": 0.07568359375,
"learning_rate": 0.0009534622600205769,
- "loss": 0.0151,
+ "loss": 0.0145,
"macro_f1": 0.3333333432674408,
"num_tokens": 3074954.0,
"repeat_count": 0.0,
- "routers_loss": 0.013415839523077011,
+ "routers_loss": 0.014377486892044544,
"skip_count": 0.0,
"step": 1906,
"text_loss": 0.19009549915790558
@@ -18124,13 +18124,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009533317759576416,
- "loss": 0.019,
+ "loss": 0.0197,
"macro_f1": 0.3333333432674408,
"num_tokens": 3077540.0,
"repeat_count": 0.0,
- "routers_loss": 0.005814475007355213,
+ "routers_loss": 0.004848944488912821,
"skip_count": 0.0,
"step": 1908,
"text_loss": 0.5022001266479492
@@ -18143,13 +18143,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009532011181771148,
- "loss": 0.0218,
+ "loss": 0.0217,
"macro_f1": 0.6666666865348816,
"num_tokens": 3080445.0,
"repeat_count": 0.0,
- "routers_loss": 0.007621586322784424,
+ "routers_loss": 0.009480170905590057,
"skip_count": 2.0,
"step": 1910,
"text_loss": 0.35135936737060547
@@ -18162,13 +18162,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.10400390625,
"learning_rate": 0.0009530702867290644,
- "loss": 0.0178,
+ "loss": 0.0185,
"macro_f1": 0.3333333432674408,
"num_tokens": 3083657.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020917020738124847,
+ "routers_loss": 0.0019353039097040892,
"skip_count": 0.0,
"step": 1912,
"text_loss": 0.5123994946479797
@@ -18181,13 +18181,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009529392816636256,
- "loss": 0.025,
+ "loss": 0.0249,
"macro_f1": 0.3333333432674408,
"num_tokens": 3086837.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010824954370036721,
+ "routers_loss": 0.0010921972570940852,
"skip_count": 0.0,
"step": 1914,
"text_loss": 0.44477662444114685
@@ -18200,13 +18200,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.19140625,
"learning_rate": 0.0009528081030309995,
- "loss": 0.0353,
+ "loss": 0.0351,
"macro_f1": 0.3333333432674408,
"num_tokens": 3089892.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018075350672006607,
+ "routers_loss": 0.0018027103506028652,
"skip_count": 0.0,
"step": 1916,
"text_loss": 0.7356183528900146
@@ -18219,13 +18219,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07958984375,
+ "grad_norm": 0.07568359375,
"learning_rate": 0.0009526767508814542,
- "loss": 0.0235,
+ "loss": 0.0236,
"macro_f1": 0.3333333432674408,
"num_tokens": 3093058.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032930250745266676,
+ "routers_loss": 0.003243023296818137,
"skip_count": 0.0,
"step": 1918,
"text_loss": 0.48823556303977966
@@ -18238,13 +18238,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009525452252653239,
- "loss": 0.0184,
+ "loss": 0.0175,
"macro_f1": 0.3333333432674408,
"num_tokens": 3096404.0,
"repeat_count": 0.0,
- "routers_loss": 0.009042349644005299,
+ "routers_loss": 0.009360014460980892,
"skip_count": 0.0,
"step": 1920,
"text_loss": 0.21498437225818634
@@ -18257,13 +18257,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009524135262330098,
- "loss": 0.022,
+ "loss": 0.0224,
"macro_f1": 0.9265305995941162,
"num_tokens": 3099520.0,
"repeat_count": 1.0,
- "routers_loss": 0.016776500269770622,
+ "routers_loss": 0.017444295808672905,
"skip_count": 3.0,
"step": 1922,
"text_loss": 0.27608850598335266
@@ -18276,13 +18276,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.0009522816538349789,
- "loss": 0.016,
+ "loss": 0.0162,
"macro_f1": 0.5492662787437439,
"num_tokens": 3102956.0,
"repeat_count": 0.0,
- "routers_loss": 0.06579705327749252,
+ "routers_loss": 0.06424452364444733,
"skip_count": 2.0,
"step": 1924,
"text_loss": 0.21558666229248047
@@ -18295,13 +18295,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.058349609375,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0009521496081217651,
- "loss": 0.0113,
+ "loss": 0.0112,
"macro_f1": 0.6666666865348816,
"num_tokens": 3106565.0,
"repeat_count": 1.0,
- "routers_loss": 0.0022786022163927555,
+ "routers_loss": 0.002270506462082267,
"skip_count": 0.0,
"step": 1926,
"text_loss": 0.5641813278198242
@@ -18314,13 +18314,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.09033203125,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009520173891439684,
"loss": 0.0216,
"macro_f1": 0.6666666865348816,
"num_tokens": 3109314.0,
"repeat_count": 0.0,
- "routers_loss": 0.01074281521141529,
+ "routers_loss": 0.011512448079884052,
"skip_count": 1.0,
"step": 1928,
"text_loss": 0.6351624727249146
@@ -18333,13 +18333,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009518849969522556,
- "loss": 0.0201,
+ "loss": 0.0198,
"macro_f1": 0.3333333432674408,
"num_tokens": 3112956.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032052614260464907,
+ "routers_loss": 0.003883908037096262,
"skip_count": 0.0,
"step": 1930,
"text_loss": 0.35160085558891296
@@ -18352,32 +18352,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009517524315973595,
- "loss": 0.0186,
+ "loss": 0.019,
"macro_f1": 1.0,
"num_tokens": 3115593.0,
"repeat_count": 1.0,
- "routers_loss": 0.008593574166297913,
+ "routers_loss": 0.009479222819209099,
"skip_count": 3.0,
"step": 1932,
"text_loss": 0.2900560200214386
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 9.079835632521279,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.07373046875,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0771484375,
"learning_rate": 0.0009516196931300794,
- "loss": 0.0152,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3118516.0,
"repeat_count": 0.0,
- "routers_loss": 0.0201246440410614,
+ "routers_loss": 0.017834696918725967,
"skip_count": 2.0,
"step": 1934,
"text_loss": 0.20094378292560577
@@ -18390,13 +18390,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009514867816012809,
- "loss": 0.0199,
+ "loss": 0.02,
"macro_f1": 0.3333333432674408,
"num_tokens": 3122242.0,
"repeat_count": 0.0,
- "routers_loss": 0.001721356064081192,
+ "routers_loss": 0.0017964740982279181,
"skip_count": 0.0,
"step": 1936,
"text_loss": 0.6498590707778931
@@ -18409,13 +18409,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.049072265625,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0009513536970618961,
- "loss": 0.0135,
+ "loss": 0.013,
"macro_f1": 0.6666666865348816,
"num_tokens": 3125645.0,
"repeat_count": 0.0,
- "routers_loss": 0.010442634113132954,
+ "routers_loss": 0.007437168620526791,
"skip_count": 2.0,
"step": 1938,
"text_loss": 0.25863033533096313
@@ -18428,13 +18428,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.058349609375,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009512204395629232,
- "loss": 0.019,
+ "loss": 0.0184,
"macro_f1": 0.6666666865348816,
"num_tokens": 3128740.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009493798715993762,
+ "routers_loss": 0.0008759932243265212,
"skip_count": 1.0,
"step": 1940,
"text_loss": 0.5638351440429688
@@ -18447,13 +18447,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05517578125,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009510870091554264,
- "loss": 0.0149,
+ "loss": 0.0153,
"macro_f1": 0.3272727429866791,
"num_tokens": 3131742.0,
"repeat_count": 1.0,
- "routers_loss": 0.022104881703853607,
+ "routers_loss": 0.019906625151634216,
"skip_count": 0.0,
"step": 1942,
"text_loss": 0.8410717844963074
@@ -18466,13 +18466,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009509534058905369,
- "loss": 0.0164,
+ "loss": 0.016,
"macro_f1": 0.3333333432674408,
"num_tokens": 3134407.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009013625676743686,
+ "routers_loss": 0.0009229081333614886,
"skip_count": 0.0,
"step": 1944,
"text_loss": 0.47506049275398254
@@ -18485,13 +18485,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06103515625,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0009508196298194517,
- "loss": 0.0121,
+ "loss": 0.0123,
"macro_f1": 0.3333333432674408,
"num_tokens": 3137053.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028069843538105488,
+ "routers_loss": 0.003630586201325059,
"skip_count": 0.0,
"step": 1946,
"text_loss": 0.32225799560546875
@@ -18504,13 +18504,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009506856809934338,
- "loss": 0.0116,
+ "loss": 0.0119,
"macro_f1": 0.3333333432674408,
"num_tokens": 3140943.0,
"repeat_count": 0.0,
- "routers_loss": 0.006877045147120953,
+ "routers_loss": 0.007580445148050785,
"skip_count": 0.0,
"step": 1948,
"text_loss": 0.3120577931404114
@@ -18523,13 +18523,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0009505515594638127,
- "loss": 0.0127,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 3144298.0,
"repeat_count": 0.0,
- "routers_loss": 0.004543667659163475,
+ "routers_loss": 0.004471861757338047,
"skip_count": 0.0,
"step": 1950,
"text_loss": 0.22052447497844696
@@ -18542,13 +18542,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.09130859375,
"learning_rate": 0.0009504172652819843,
- "loss": 0.0232,
+ "loss": 0.023,
"macro_f1": 1.0,
"num_tokens": 3147069.0,
"repeat_count": 1.0,
- "routers_loss": 0.007053609937429428,
+ "routers_loss": 0.009606664068996906,
"skip_count": 1.0,
"step": 1952,
"text_loss": 0.34773921966552734
@@ -18561,13 +18561,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0537109375,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009502827984994099,
- "loss": 0.0146,
+ "loss": 0.0148,
"macro_f1": 0.6666666865348816,
"num_tokens": 3149992.0,
"repeat_count": 0.0,
- "routers_loss": 0.006783280987292528,
+ "routers_loss": 0.006443799939006567,
"skip_count": 1.0,
"step": 1954,
"text_loss": 0.6442171335220337
@@ -18580,13 +18580,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.0009501481591676177,
- "loss": 0.0181,
+ "loss": 0.0188,
"macro_f1": 0.3333333432674408,
"num_tokens": 3153167.0,
"repeat_count": 0.0,
- "routers_loss": 0.002531677018851042,
+ "routers_loss": 0.003219039412215352,
"skip_count": 0.0,
"step": 1956,
"text_loss": 0.43369221687316895
@@ -18599,32 +18599,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.000950013347338202,
- "loss": 0.0154,
+ "loss": 0.0152,
"macro_f1": 0.3272727429866791,
"num_tokens": 3156590.0,
"repeat_count": 0.0,
- "routers_loss": 0.027040868997573853,
+ "routers_loss": 0.025551019236445427,
"skip_count": 1.0,
"step": 1958,
"text_loss": 0.294479101896286
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 1.0,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 9.201937188142061,
- "f1_execute": 0.9803921580314636,
- "f1_repeat": 0.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009498783630628225,
- "loss": 0.0154,
- "macro_f1": 0.6601307392120361,
+ "loss": 0.0158,
+ "macro_f1": 1.0,
"num_tokens": 3159451.0,
"repeat_count": 1.0,
- "routers_loss": 0.01573321223258972,
+ "routers_loss": 0.013802438974380493,
"skip_count": 2.0,
"step": 1960,
"text_loss": 0.20888492465019226
@@ -18637,13 +18637,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06689453125,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009497432063932057,
- "loss": 0.0135,
+ "loss": 0.0137,
"macro_f1": 0.6601307392120361,
"num_tokens": 3162889.0,
"repeat_count": 1.0,
- "routers_loss": 0.02442278526723385,
+ "routers_loss": 0.02852988988161087,
"skip_count": 2.0,
"step": 1962,
"text_loss": 0.5027125477790833
@@ -18656,13 +18656,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.046630859375,
+ "grad_norm": 0.045166015625,
"learning_rate": 0.0009496078773811437,
- "loss": 0.0142,
+ "loss": 0.0136,
"macro_f1": 0.6666666865348816,
"num_tokens": 3165979.0,
"repeat_count": 0.0,
- "routers_loss": 0.018267054110765457,
+ "routers_loss": 0.01784522272646427,
"skip_count": 2.0,
"step": 1964,
"text_loss": 0.1696339100599289
@@ -18675,13 +18675,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.060302734375,
"learning_rate": 0.000949472376078495,
- "loss": 0.0162,
+ "loss": 0.016,
"macro_f1": 0.3333333432674408,
"num_tokens": 3168683.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016024474753066897,
+ "routers_loss": 0.0017019887454807758,
"skip_count": 0.0,
"step": 1966,
"text_loss": 0.48905447125434875
@@ -18694,13 +18694,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.052978515625,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.000949336702537184,
- "loss": 0.011,
+ "loss": 0.0108,
"macro_f1": 0.6666666865348816,
"num_tokens": 3171968.0,
"repeat_count": 0.0,
- "routers_loss": 0.004668849054723978,
+ "routers_loss": 0.004817947279661894,
"skip_count": 2.0,
"step": 1968,
"text_loss": 0.20984773337841034
@@ -18713,13 +18713,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0009492008568092007,
- "loss": 0.0098,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 3175947.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011657609138637781,
+ "routers_loss": 0.0012963006738573313,
"skip_count": 0.0,
"step": 1970,
"text_loss": 0.5215106010437012
@@ -18732,13 +18732,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.04248046875,
+ "grad_norm": 0.044921875,
"learning_rate": 0.0009490648389466019,
- "loss": 0.0133,
+ "loss": 0.0135,
"macro_f1": 0.4871794879436493,
"num_tokens": 3179348.0,
"repeat_count": 0.0,
- "routers_loss": 0.03806794434785843,
+ "routers_loss": 0.03950481489300728,
"skip_count": 2.0,
"step": 1972,
"text_loss": 0.24640929698944092
@@ -18751,13 +18751,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.09326171875,
"learning_rate": 0.0009489286490015097,
- "loss": 0.0189,
+ "loss": 0.0183,
"macro_f1": 0.6666666865348816,
"num_tokens": 3182640.0,
"repeat_count": 0.0,
- "routers_loss": 0.005107097327709198,
+ "routers_loss": 0.0043345349840819836,
"skip_count": 2.0,
"step": 1974,
"text_loss": 0.6362852454185486
@@ -18770,13 +18770,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0009487922870261122,
- "loss": 0.0156,
+ "loss": 0.0155,
"macro_f1": 0.3333333432674408,
"num_tokens": 3185657.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013696947135031223,
+ "routers_loss": 0.0015687479171901941,
"skip_count": 0.0,
"step": 1976,
"text_loss": 0.8977144360542297
@@ -18789,13 +18789,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0009486557530726638,
- "loss": 0.0136,
+ "loss": 0.0139,
"macro_f1": 0.3333333432674408,
"num_tokens": 3188772.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012224154779687524,
+ "routers_loss": 0.0010977238416671753,
"skip_count": 0.0,
"step": 1978,
"text_loss": 0.38512736558914185
@@ -18808,13 +18808,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09423828125,
+ "grad_norm": 0.11279296875,
"learning_rate": 0.0009485190471934844,
"loss": 0.0196,
"macro_f1": 0.6666666865348816,
"num_tokens": 3193131.0,
"repeat_count": 2.0,
- "routers_loss": 0.0030119111761450768,
+ "routers_loss": 0.002264744369313121,
"skip_count": 0.0,
"step": 1980,
"text_loss": 0.4171289801597595
@@ -18827,13 +18827,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.00094838216944096,
- "loss": 0.0222,
+ "loss": 0.0219,
"macro_f1": 0.3272727429866791,
"num_tokens": 3196668.0,
"repeat_count": 0.0,
- "routers_loss": 0.04286033287644386,
+ "routers_loss": 0.042320676147937775,
"skip_count": 1.0,
"step": 1982,
"text_loss": 0.19008000195026398
@@ -18846,32 +18846,32 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.053466796875,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0009482451198675424,
- "loss": 0.0158,
+ "loss": 0.0151,
"macro_f1": 0.32098767161369324,
"num_tokens": 3200282.0,
"repeat_count": 0.0,
- "routers_loss": 0.019988590851426125,
+ "routers_loss": 0.01796630397439003,
"skip_count": 1.0,
"step": 1984,
"text_loss": 0.5009249448776245
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 9.324038743762841,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.0634765625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061767578125,
"learning_rate": 0.0009481078985257494,
- "loss": 0.0154,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0147,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3204439.0,
"repeat_count": 0.0,
- "routers_loss": 0.012215938419103622,
+ "routers_loss": 0.01052347756922245,
"skip_count": 1.0,
"step": 1986,
"text_loss": 0.15319275856018066
@@ -18884,13 +18884,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009479705054681644,
- "loss": 0.0149,
+ "loss": 0.015,
"macro_f1": 0.3076923191547394,
"num_tokens": 3207590.0,
"repeat_count": 1.0,
- "routers_loss": 0.10747655481100082,
+ "routers_loss": 0.09640293568372726,
"skip_count": 3.0,
"step": 1988,
"text_loss": 0.3654652535915375
@@ -18903,13 +18903,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009478329407474366,
- "loss": 0.0186,
+ "loss": 0.0183,
"macro_f1": 0.5492662787437439,
"num_tokens": 3211172.0,
"repeat_count": 0.0,
- "routers_loss": 0.016109853982925415,
+ "routers_loss": 0.012670112773776054,
"skip_count": 1.0,
"step": 1990,
"text_loss": 0.5817596316337585
@@ -18922,13 +18922,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.05859375,
"learning_rate": 0.000947695204416281,
- "loss": 0.0116,
+ "loss": 0.0121,
"macro_f1": 0.6666666865348816,
"num_tokens": 3214050.0,
"repeat_count": 1.0,
- "routers_loss": 0.006929324474185705,
+ "routers_loss": 0.005263707600533962,
"skip_count": 0.0,
"step": 1992,
"text_loss": 0.5985888242721558
@@ -18941,13 +18941,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009475572965274787,
- "loss": 0.0147,
+ "loss": 0.0144,
"macro_f1": 0.3272727429866791,
"num_tokens": 3217318.0,
"repeat_count": 1.0,
- "routers_loss": 0.0715102106332779,
+ "routers_loss": 0.0682850033044815,
"skip_count": 0.0,
"step": 1994,
"text_loss": 0.316506564617157
@@ -18960,13 +18960,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.052490234375,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.000947419217133876,
- "loss": 0.0187,
+ "loss": 0.019,
"macro_f1": 0.6666666865348816,
"num_tokens": 3220012.0,
"repeat_count": 0.0,
- "routers_loss": 0.008499355986714363,
+ "routers_loss": 0.008508823812007904,
"skip_count": 2.0,
"step": 1996,
"text_loss": 0.09665893763303757
@@ -18979,13 +18979,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.048583984375,
+ "grad_norm": 0.053466796875,
"learning_rate": 0.0009472809662883852,
- "loss": 0.0162,
+ "loss": 0.0155,
"macro_f1": 1.0,
"num_tokens": 3223019.0,
"repeat_count": 1.0,
- "routers_loss": 0.012003371492028236,
+ "routers_loss": 0.01100847590714693,
"skip_count": 2.0,
"step": 1998,
"text_loss": 0.4938808083534241
@@ -18998,13 +18998,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0009471425440439844,
- "loss": 0.0137,
+ "loss": 0.0135,
"macro_f1": 0.8817967176437378,
"num_tokens": 3226013.0,
"repeat_count": 2.0,
- "routers_loss": 0.0529167577624321,
+ "routers_loss": 0.04953207075595856,
"skip_count": 3.0,
"step": 2000,
"text_loss": 0.22258254885673523
@@ -19017,13 +19017,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.076171875,
+ "grad_norm": 0.07568359375,
"learning_rate": 0.0009470039504537173,
- "loss": 0.0185,
+ "loss": 0.0186,
"macro_f1": 0.31446540355682373,
"num_tokens": 3230031.0,
"repeat_count": 0.0,
- "routers_loss": 0.05719539523124695,
+ "routers_loss": 0.052884332835674286,
"skip_count": 2.0,
"step": 2002,
"text_loss": 0.1741616576910019
@@ -19038,11 +19038,11 @@
"f1_skip": 0.0,
"grad_norm": 0.0869140625,
"learning_rate": 0.0009468651855706931,
- "loss": 0.0205,
+ "loss": 0.0204,
"macro_f1": 0.6666666865348816,
"num_tokens": 3232991.0,
"repeat_count": 1.0,
- "routers_loss": 0.007613501511514187,
+ "routers_loss": 0.008056716993451118,
"skip_count": 0.0,
"step": 2004,
"text_loss": 0.3173636198043823
@@ -19055,13 +19055,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0009467262494480868,
- "loss": 0.014,
+ "loss": 0.0136,
"macro_f1": 0.3333333432674408,
"num_tokens": 3236390.0,
"repeat_count": 0.0,
- "routers_loss": 0.005654903594404459,
+ "routers_loss": 0.0053409393876791,
"skip_count": 0.0,
"step": 2006,
"text_loss": 0.5806330442428589
@@ -19074,13 +19074,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07958984375,
+ "grad_norm": 0.068359375,
"learning_rate": 0.000946587142139139,
- "loss": 0.0152,
+ "loss": 0.0147,
"macro_f1": 0.3333333432674408,
"num_tokens": 3239267.0,
"repeat_count": 0.0,
- "routers_loss": 0.001680699409916997,
+ "routers_loss": 0.0015652200672775507,
"skip_count": 0.0,
"step": 2008,
"text_loss": 0.6214317679405212
@@ -19093,13 +19093,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.11376953125,
"learning_rate": 0.000946447863697156,
- "loss": 0.0171,
+ "loss": 0.0151,
"macro_f1": 0.6601307392120361,
"num_tokens": 3242569.0,
"repeat_count": 1.0,
- "routers_loss": 0.014179535210132599,
+ "routers_loss": 0.011673987843096256,
"skip_count": 2.0,
"step": 2010,
"text_loss": 0.532565712928772
@@ -19112,13 +19112,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.04345703125,
"learning_rate": 0.0009463084141755093,
- "loss": 0.0157,
+ "loss": 0.0159,
"macro_f1": 0.3272727429866791,
"num_tokens": 3245669.0,
"repeat_count": 0.0,
- "routers_loss": 0.026209332048892975,
+ "routers_loss": 0.028480790555477142,
"skip_count": 1.0,
"step": 2012,
"text_loss": 0.25210800766944885
@@ -19131,13 +19131,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08349609375,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0009461687936276364,
- "loss": 0.0134,
+ "loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 3248751.0,
"repeat_count": 0.0,
- "routers_loss": 0.008315940387547016,
+ "routers_loss": 0.007234727032482624,
"skip_count": 0.0,
"step": 2014,
"text_loss": 0.35922971367836
@@ -19150,13 +19150,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.06689453125,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0009460290021070402,
- "loss": 0.0197,
+ "loss": 0.0195,
"macro_f1": 0.6666666865348816,
"num_tokens": 3252614.0,
"repeat_count": 1.0,
- "routers_loss": 0.01872348040342331,
+ "routers_loss": 0.014691276475787163,
"skip_count": 0.0,
"step": 2016,
"text_loss": 0.2747853398323059
@@ -19169,13 +19169,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05126953125,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0009458890396672888,
"loss": 0.0186,
"macro_f1": 0.3333333432674408,
"num_tokens": 3256374.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024314222391694784,
+ "routers_loss": 0.002385235857218504,
"skip_count": 0.0,
"step": 2018,
"text_loss": 0.5268719792366028
@@ -19188,13 +19188,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.052978515625,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0009457489063620164,
- "loss": 0.0137,
+ "loss": 0.0133,
"macro_f1": 0.8823530077934265,
"num_tokens": 3259792.0,
"repeat_count": 1.0,
- "routers_loss": 0.04815426841378212,
+ "routers_loss": 0.047268565744161606,
"skip_count": 2.0,
"step": 2020,
"text_loss": 0.7785539627075195
@@ -19207,13 +19207,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.13671875,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009456086022449221,
- "loss": 0.0209,
+ "loss": 0.0218,
"macro_f1": 0.3272727429866791,
"num_tokens": 3262833.0,
"repeat_count": 0.0,
- "routers_loss": 0.015121756121516228,
+ "routers_loss": 0.015878718346357346,
"skip_count": 1.0,
"step": 2022,
"text_loss": 0.42270028591156006
@@ -19226,32 +19226,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009454681273697711,
- "loss": 0.0122,
+ "loss": 0.0117,
"macro_f1": 0.3272727429866791,
"num_tokens": 3265718.0,
"repeat_count": 1.0,
- "routers_loss": 0.030219297856092453,
+ "routers_loss": 0.030749641358852386,
"skip_count": 0.0,
"step": 2024,
"text_loss": 0.18668225407600403
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 9.511887290871735,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.05419921875,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0009453274817903931,
- "loss": 0.0132,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.012,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3268158.0,
"repeat_count": 0.0,
- "routers_loss": 0.013256299309432507,
+ "routers_loss": 0.011538166552782059,
"skip_count": 1.0,
"step": 2026,
"text_loss": 0.34090787172317505
@@ -19264,13 +19264,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11572265625,
+ "grad_norm": 0.099609375,
"learning_rate": 0.000945186665560684,
- "loss": 0.0232,
+ "loss": 0.0218,
"macro_f1": 0.3333333432674408,
"num_tokens": 3271082.0,
"repeat_count": 0.0,
- "routers_loss": 0.009389489889144897,
+ "routers_loss": 0.009527760557830334,
"skip_count": 0.0,
"step": 2028,
"text_loss": 0.2110334187746048
@@ -19283,13 +19283,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.119140625,
"learning_rate": 0.000945045678734605,
- "loss": 0.0178,
+ "loss": 0.0175,
"macro_f1": 0.3144654333591461,
"num_tokens": 3273488.0,
"repeat_count": 0.0,
- "routers_loss": 0.03916877508163452,
+ "routers_loss": 0.03317151218652725,
"skip_count": 3.0,
"step": 2030,
"text_loss": 0.2233227640390396
@@ -19302,13 +19302,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.12451171875,
"learning_rate": 0.0009449045213661822,
- "loss": 0.0215,
+ "loss": 0.0201,
"macro_f1": 0.3272727429866791,
"num_tokens": 3276646.0,
"repeat_count": 0.0,
- "routers_loss": 0.019781047478318214,
+ "routers_loss": 0.018510591238737106,
"skip_count": 1.0,
"step": 2032,
"text_loss": 0.16100332140922546
@@ -19321,13 +19321,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.0009447631935095077,
- "loss": 0.0193,
+ "loss": 0.0185,
"macro_f1": 0.9452888369560242,
"num_tokens": 3279441.0,
"repeat_count": 1.0,
- "routers_loss": 0.02645993046462536,
+ "routers_loss": 0.028113311156630516,
"skip_count": 4.0,
"step": 2034,
"text_loss": 0.29208317399024963
@@ -19340,13 +19340,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009446216952187384,
- "loss": 0.0168,
+ "loss": 0.0164,
"macro_f1": 0.3333333432674408,
"num_tokens": 3282697.0,
"repeat_count": 0.0,
- "routers_loss": 0.008575125597417355,
+ "routers_loss": 0.008379172533750534,
"skip_count": 0.0,
"step": 2036,
"text_loss": 0.16026398539543152
@@ -19359,13 +19359,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0009444800265480967,
- "loss": 0.0184,
+ "loss": 0.0178,
"macro_f1": 0.3333333432674408,
"num_tokens": 3285574.0,
"repeat_count": 0.0,
- "routers_loss": 0.01042154710739851,
+ "routers_loss": 0.00941354501992464,
"skip_count": 0.0,
"step": 2038,
"text_loss": 0.29523080587387085
@@ -19378,13 +19378,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.8571428656578064,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.076171875,
"learning_rate": 0.0009443381875518703,
- "loss": 0.0206,
+ "loss": 0.0197,
"macro_f1": 0.8600732684135437,
"num_tokens": 3289159.0,
"repeat_count": 4.0,
- "routers_loss": 0.05496715381741524,
+ "routers_loss": 0.04974055662751198,
"skip_count": 6.0,
"step": 2040,
"text_loss": 0.23033179342746735
@@ -19397,13 +19397,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.0537109375,
"learning_rate": 0.0009441961782844123,
- "loss": 0.0149,
+ "loss": 0.0146,
"macro_f1": 0.3272727429866791,
"num_tokens": 3293598.0,
"repeat_count": 0.0,
- "routers_loss": 0.021722445264458656,
+ "routers_loss": 0.022241825237870216,
"skip_count": 1.0,
"step": 2042,
"text_loss": 0.8299165368080139
@@ -19416,13 +19416,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.0009440539988001408,
- "loss": 0.0161,
+ "loss": 0.0159,
"macro_f1": 0.3333333432674408,
"num_tokens": 3296648.0,
"repeat_count": 0.0,
- "routers_loss": 0.011090370826423168,
+ "routers_loss": 0.011019332334399223,
"skip_count": 0.0,
"step": 2044,
"text_loss": 0.18207129836082458
@@ -19435,13 +19435,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.041259765625,
"learning_rate": 0.0009439116491535394,
- "loss": 0.0123,
+ "loss": 0.0118,
"macro_f1": 0.3333333432674408,
"num_tokens": 3300058.0,
"repeat_count": 0.0,
- "routers_loss": 0.00327755743637681,
+ "routers_loss": 0.002889640862122178,
"skip_count": 0.0,
"step": 2046,
"text_loss": 0.7051978707313538
@@ -19454,13 +19454,13 @@
"f1_execute": 0.9333333373069763,
"f1_repeat": 0.5,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.078125,
"learning_rate": 0.0009437691293991563,
- "loss": 0.0198,
+ "loss": 0.0192,
"macro_f1": 0.7634921073913574,
"num_tokens": 3303296.0,
"repeat_count": 3.0,
- "routers_loss": 0.0807223841547966,
+ "routers_loss": 0.07741832733154297,
"skip_count": 4.0,
"step": 2048,
"text_loss": 0.15563532710075378
@@ -19473,13 +19473,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.09521484375,
"learning_rate": 0.0009436264395916061,
- "loss": 0.0218,
+ "loss": 0.0209,
"macro_f1": 0.6666666865348816,
"num_tokens": 3306204.0,
"repeat_count": 0.0,
- "routers_loss": 0.014681774191558361,
+ "routers_loss": 0.014225383289158344,
"skip_count": 2.0,
"step": 2050,
"text_loss": 0.18117287755012512
@@ -19492,13 +19492,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09326171875,
+ "grad_norm": 0.1416015625,
"learning_rate": 0.0009434835797855672,
- "loss": 0.0166,
+ "loss": 0.0165,
"macro_f1": 0.3333333432674408,
"num_tokens": 3309444.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025602662935853004,
+ "routers_loss": 0.0023932650219649076,
"skip_count": 0.0,
"step": 2052,
"text_loss": 0.4645874798297882
@@ -19511,13 +19511,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05810546875,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0009433405500357839,
- "loss": 0.0148,
+ "loss": 0.0153,
"macro_f1": 0.3272727429866791,
"num_tokens": 3312488.0,
"repeat_count": 0.0,
- "routers_loss": 0.03283753618597984,
+ "routers_loss": 0.03193361684679985,
"skip_count": 1.0,
"step": 2054,
"text_loss": 0.5291082859039307
@@ -19530,13 +19530,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.062255859375,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0009431973503970655,
- "loss": 0.0138,
+ "loss": 0.0134,
"macro_f1": 0.3333333432674408,
"num_tokens": 3315765.0,
"repeat_count": 0.0,
- "routers_loss": 0.002137230010703206,
+ "routers_loss": 0.0020529816392809153,
"skip_count": 0.0,
"step": 2056,
"text_loss": 0.5877931118011475
@@ -19549,13 +19549,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08251953125,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009430539809242864,
- "loss": 0.0199,
+ "loss": 0.0185,
"macro_f1": 0.32098764181137085,
"num_tokens": 3318877.0,
"repeat_count": 2.0,
- "routers_loss": 0.07938452064990997,
+ "routers_loss": 0.07907948642969131,
"skip_count": 0.0,
"step": 2058,
"text_loss": 0.3836737871170044
@@ -19568,13 +19568,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009429104416723862,
- "loss": 0.0164,
+ "loss": 0.0163,
"macro_f1": 0.6666666865348816,
"num_tokens": 3322576.0,
"repeat_count": 2.0,
- "routers_loss": 0.003832251997664571,
+ "routers_loss": 0.003006070153787732,
"skip_count": 0.0,
"step": 2060,
"text_loss": 0.3480920195579529
@@ -19587,13 +19587,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.045166015625,
"learning_rate": 0.0009427667326963689,
- "loss": 0.0131,
+ "loss": 0.0127,
"macro_f1": 0.3333333432674408,
"num_tokens": 3325974.0,
"repeat_count": 0.0,
- "routers_loss": 0.006192604545503855,
+ "routers_loss": 0.005013179033994675,
"skip_count": 0.0,
"step": 2062,
"text_loss": 0.931358814239502
@@ -19606,13 +19606,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09375,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009426228540513047,
"loss": 0.0206,
"macro_f1": 0.3333333432674408,
"num_tokens": 3329398.0,
"repeat_count": 0.0,
- "routers_loss": 0.008115313947200775,
+ "routers_loss": 0.0059848143719136715,
"skip_count": 0.0,
"step": 2064,
"text_loss": 0.47568953037261963
@@ -19625,13 +19625,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009424788057923277,
- "loss": 0.0127,
+ "loss": 0.0131,
"macro_f1": 0.3333333432674408,
"num_tokens": 3332029.0,
"repeat_count": 0.0,
- "routers_loss": 0.007599714212119579,
+ "routers_loss": 0.00783882662653923,
"skip_count": 0.0,
"step": 2066,
"text_loss": 0.22887596487998962
@@ -19644,13 +19644,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0009423345879746376,
- "loss": 0.0126,
+ "loss": 0.0128,
"macro_f1": 0.5492662787437439,
"num_tokens": 3334858.0,
"repeat_count": 0.0,
- "routers_loss": 0.016804348677396774,
+ "routers_loss": 0.01866884157061577,
"skip_count": 2.0,
"step": 2068,
"text_loss": 0.17724967002868652
@@ -19663,13 +19663,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.06591796875,
"learning_rate": 0.000942190200653499,
- "loss": 0.0164,
+ "loss": 0.0162,
"macro_f1": 0.32098764181137085,
"num_tokens": 3338094.0,
"repeat_count": 0.0,
- "routers_loss": 0.02686731517314911,
+ "routers_loss": 0.028636593371629715,
"skip_count": 2.0,
"step": 2070,
"text_loss": 0.34344956278800964
@@ -19682,13 +19682,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.07568359375,
"learning_rate": 0.0009420456438842413,
- "loss": 0.0172,
+ "loss": 0.0165,
"macro_f1": 0.5492662787437439,
"num_tokens": 3340526.0,
"repeat_count": 0.0,
- "routers_loss": 0.025320913642644882,
+ "routers_loss": 0.023245645686984062,
"skip_count": 2.0,
"step": 2072,
"text_loss": 0.7276164293289185
@@ -19701,13 +19701,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.11328125,
"learning_rate": 0.000941900917722259,
- "loss": 0.0145,
+ "loss": 0.0143,
"macro_f1": 0.3272727429866791,
"num_tokens": 3343303.0,
"repeat_count": 1.0,
- "routers_loss": 0.014900023117661476,
+ "routers_loss": 0.01565689593553543,
"skip_count": 0.0,
"step": 2074,
"text_loss": 0.5665070414543152
@@ -19720,13 +19720,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009417560222230115,
- "loss": 0.0244,
+ "loss": 0.0245,
"macro_f1": 0.3333333432674408,
"num_tokens": 3346409.0,
"repeat_count": 0.0,
- "routers_loss": 0.003426895011216402,
+ "routers_loss": 0.0035056080669164658,
"skip_count": 0.0,
"step": 2076,
"text_loss": 0.5112795233726501
@@ -19739,13 +19739,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0712890625,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0009416109574420229,
- "loss": 0.0136,
+ "loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 3349220.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031935563310980797,
+ "routers_loss": 0.0027565446216613054,
"skip_count": 0.0,
"step": 2078,
"text_loss": 0.5240910053253174
@@ -19758,13 +19758,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.08203125,
"learning_rate": 0.0009414657234348823,
- "loss": 0.0183,
+ "loss": 0.0186,
"macro_f1": 1.0,
"num_tokens": 3352627.0,
"repeat_count": 3.0,
- "routers_loss": 0.016454946249723434,
+ "routers_loss": 0.01652451977133751,
"skip_count": 2.0,
"step": 2080,
"text_loss": 1.0217112302780151
@@ -19777,13 +19777,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009413203202572438,
- "loss": 0.0174,
+ "loss": 0.0179,
"macro_f1": 0.32098764181137085,
"num_tokens": 3355392.0,
"repeat_count": 0.0,
- "routers_loss": 0.1056143268942833,
+ "routers_loss": 0.1012420505285263,
"skip_count": 2.0,
"step": 2082,
"text_loss": 0.4085482358932495
@@ -19796,13 +19796,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07373046875,
+ "grad_norm": 0.08251953125,
"learning_rate": 0.000941174747964826,
- "loss": 0.016,
+ "loss": 0.0154,
"macro_f1": 0.3333333432674408,
"num_tokens": 3358425.0,
"repeat_count": 0.0,
- "routers_loss": 0.003626141929998994,
+ "routers_loss": 0.004962718114256859,
"skip_count": 0.0,
"step": 2084,
"text_loss": 0.5833504796028137
@@ -19810,18 +19810,18 @@
{
"acc_repeat": 0.5,
"acc_skip": 0.6666666865348816,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 9.793660111535075,
- "f1_execute": 0.936170220375061,
+ "f1_execute": 0.9583333134651184,
"f1_repeat": 0.6666666865348816,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.107421875,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.11376953125,
"learning_rate": 0.0009410290066134124,
- "loss": 0.0216,
- "macro_f1": 0.7565011978149414,
+ "loss": 0.0211,
+ "macro_f1": 0.8083333373069763,
"num_tokens": 3361925.0,
"repeat_count": 2.0,
- "routers_loss": 0.08091846853494644,
+ "routers_loss": 0.07889176905155182,
"skip_count": 3.0,
"step": 2086,
"text_loss": 0.38126569986343384
@@ -19834,13 +19834,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.056884765625,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0009408830962588517,
- "loss": 0.0197,
+ "loss": 0.0195,
"macro_f1": 0.6601307392120361,
"num_tokens": 3365963.0,
"repeat_count": 1.0,
- "routers_loss": 0.035208042711019516,
+ "routers_loss": 0.033715736120939255,
"skip_count": 2.0,
"step": 2088,
"text_loss": 0.23213914036750793
@@ -19853,13 +19853,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07958984375,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009407370169570567,
- "loss": 0.0167,
+ "loss": 0.0169,
"macro_f1": 0.3333333432674408,
"num_tokens": 3369422.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018934847321361303,
+ "routers_loss": 0.0014188943896442652,
"skip_count": 0.0,
"step": 2090,
"text_loss": 0.4648318886756897
@@ -19872,13 +19872,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0009405907687640054,
- "loss": 0.0132,
+ "loss": 0.013,
"macro_f1": 0.3272727429866791,
"num_tokens": 3372506.0,
"repeat_count": 0.0,
- "routers_loss": 0.016075141727924347,
+ "routers_loss": 0.015339684672653675,
"skip_count": 1.0,
"step": 2092,
"text_loss": 0.2563800811767578
@@ -19891,13 +19891,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.054443359375,
"learning_rate": 0.0009404443517357404,
"loss": 0.0146,
"macro_f1": 0.542222261428833,
"num_tokens": 3375653.0,
"repeat_count": 4.0,
- "routers_loss": 0.06333976984024048,
+ "routers_loss": 0.06562861055135727,
"skip_count": 0.0,
"step": 2094,
"text_loss": 0.797835111618042
@@ -19910,13 +19910,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.060546875,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.000940297765928369,
- "loss": 0.0133,
+ "loss": 0.0136,
"macro_f1": 0.3333333432674408,
"num_tokens": 3379018.0,
"repeat_count": 0.0,
- "routers_loss": 0.005521406419575214,
+ "routers_loss": 0.005745889153331518,
"skip_count": 0.0,
"step": 2096,
"text_loss": 0.4238114655017853
@@ -19929,13 +19929,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06103515625,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0009401510113980631,
- "loss": 0.0205,
+ "loss": 0.0207,
"macro_f1": 0.3333333432674408,
"num_tokens": 3382855.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025159218348562717,
+ "routers_loss": 0.0026634482201188803,
"skip_count": 0.0,
"step": 2098,
"text_loss": 0.4967166483402252
@@ -19948,13 +19948,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009400040882010592,
- "loss": 0.0172,
+ "loss": 0.0166,
"macro_f1": 0.3333333432674408,
"num_tokens": 3386386.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025535966269671917,
+ "routers_loss": 0.0020642587915062904,
"skip_count": 0.0,
"step": 2100,
"text_loss": 0.44390562176704407
@@ -19967,13 +19967,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.056640625,
"learning_rate": 0.0009398569963936589,
- "loss": 0.0178,
+ "loss": 0.017,
"macro_f1": 0.3272727429866791,
"num_tokens": 3389958.0,
"repeat_count": 0.0,
- "routers_loss": 0.013569516129791737,
+ "routers_loss": 0.013722737319767475,
"skip_count": 1.0,
"step": 2102,
"text_loss": 0.7207565903663635
@@ -19986,13 +19986,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.08837890625,
"learning_rate": 0.0009397097360322276,
- "loss": 0.0175,
+ "loss": 0.017,
"macro_f1": 0.3333333432674408,
"num_tokens": 3392892.0,
"repeat_count": 0.0,
- "routers_loss": 0.0044935219921171665,
+ "routers_loss": 0.002051608171314001,
"skip_count": 0.0,
"step": 2104,
"text_loss": 0.3196398913860321
@@ -20005,13 +20005,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.000939562307173196,
- "loss": 0.0223,
+ "loss": 0.022,
"macro_f1": 0.3333333432674408,
"num_tokens": 3396636.0,
"repeat_count": 0.0,
- "routers_loss": 0.007407462690025568,
+ "routers_loss": 0.007085663266479969,
"skip_count": 0.0,
"step": 2106,
"text_loss": 0.5663776397705078
@@ -20024,13 +20024,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.13671875,
+ "grad_norm": 0.11328125,
"learning_rate": 0.0009394147098730592,
- "loss": 0.0205,
+ "loss": 0.02,
"macro_f1": 0.5492662787437439,
"num_tokens": 3399475.0,
"repeat_count": 0.0,
- "routers_loss": 0.024386432021856308,
+ "routers_loss": 0.019473131746053696,
"skip_count": 2.0,
"step": 2108,
"text_loss": 0.7708223462104797
@@ -20043,32 +20043,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.0009392669441883767,
- "loss": 0.0135,
+ "loss": 0.0134,
"macro_f1": 0.3333333432674408,
"num_tokens": 3402350.0,
"repeat_count": 0.0,
- "routers_loss": 0.002929724520072341,
+ "routers_loss": 0.0028328890912234783,
"skip_count": 0.0,
"step": 2110,
"text_loss": 0.5888006091117859
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 9.915761667155856,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.1201171875,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009391190101757724,
- "loss": 0.0168,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0166,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3405561.0,
"repeat_count": 0.0,
- "routers_loss": 0.026861928403377533,
+ "routers_loss": 0.023098422214388847,
"skip_count": 2.0,
"step": 2112,
"text_loss": 0.09865197539329529
@@ -20081,13 +20081,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.10107421875,
"learning_rate": 0.000938970907891935,
- "loss": 0.0251,
+ "loss": 0.0247,
"macro_f1": 0.3333333432674408,
"num_tokens": 3408513.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025369988288730383,
+ "routers_loss": 0.002896632067859173,
"skip_count": 0.0,
"step": 2114,
"text_loss": 0.6613234281539917
@@ -20100,51 +20100,51 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09423828125,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009388226373936179,
- "loss": 0.0209,
+ "loss": 0.0211,
"macro_f1": 0.3333333432674408,
"num_tokens": 3411195.0,
"repeat_count": 0.0,
- "routers_loss": 0.014292459934949875,
+ "routers_loss": 0.015814457088708878,
"skip_count": 0.0,
"step": 2116,
"text_loss": 0.17363053560256958
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 9.94393894922219,
- "f1_execute": 0.9629629850387573,
- "f1_repeat": 0.0,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1181640625,
+ "grad_norm": 0.12451171875,
"learning_rate": 0.0009386741987376381,
- "loss": 0.0151,
- "macro_f1": 0.32098767161369324,
+ "loss": 0.015,
+ "macro_f1": 0.6603773832321167,
"num_tokens": 3414875.0,
"repeat_count": 1.0,
- "routers_loss": 0.027571436017751694,
+ "routers_loss": 0.02676783688366413,
"skip_count": 0.0,
"step": 2118,
"text_loss": 0.674056887626648
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 9.953331376577633,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.08349609375,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0009385255919808778,
- "loss": 0.0205,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0203,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3418410.0,
"repeat_count": 0.0,
- "routers_loss": 0.011719600297510624,
+ "routers_loss": 0.01022857241332531,
"skip_count": 1.0,
"step": 2120,
"text_loss": 0.235092431306839
@@ -20157,13 +20157,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09375,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009383768171802836,
- "loss": 0.0249,
+ "loss": 0.0244,
"macro_f1": 0.5492662787437439,
"num_tokens": 3421289.0,
"repeat_count": 0.0,
- "routers_loss": 0.01207603607326746,
+ "routers_loss": 0.013572212308645248,
"skip_count": 2.0,
"step": 2122,
"text_loss": 0.5992844104766846
@@ -20176,13 +20176,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.060791015625,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0009382278743928659,
- "loss": 0.0206,
+ "loss": 0.0201,
"macro_f1": 0.6666666865348816,
"num_tokens": 3424781.0,
"repeat_count": 0.0,
- "routers_loss": 0.008004254661500454,
+ "routers_loss": 0.0051873656921088696,
"skip_count": 2.0,
"step": 2124,
"text_loss": 0.29915499687194824
@@ -20195,13 +20195,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.07421875,
"learning_rate": 0.0009380787636757001,
- "loss": 0.0156,
+ "loss": 0.0155,
"macro_f1": 0.6122449040412903,
"num_tokens": 3427942.0,
"repeat_count": 0.0,
- "routers_loss": 0.030767880380153656,
+ "routers_loss": 0.030079292133450508,
"skip_count": 4.0,
"step": 2126,
"text_loss": 0.24181491136550903
@@ -20214,13 +20214,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06201171875,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0009379294850859256,
"loss": 0.0141,
"macro_f1": 0.3333333432674408,
"num_tokens": 3431314.0,
"repeat_count": 0.0,
- "routers_loss": 0.002620625076815486,
+ "routers_loss": 0.002675612922757864,
"skip_count": 0.0,
"step": 2128,
"text_loss": 0.4669873118400574
@@ -20233,13 +20233,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09033203125,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009377800386807465,
- "loss": 0.0175,
+ "loss": 0.0177,
"macro_f1": 0.3333333432674408,
"num_tokens": 3435020.0,
"repeat_count": 0.0,
- "routers_loss": 0.009095560759305954,
+ "routers_loss": 0.009334275498986244,
"skip_count": 0.0,
"step": 2130,
"text_loss": 0.6478219628334045
@@ -20252,13 +20252,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009376304245174306,
- "loss": 0.0143,
+ "loss": 0.0137,
"macro_f1": 0.6000000238418579,
"num_tokens": 3438276.0,
"repeat_count": 1.0,
- "routers_loss": 0.058448426425457,
+ "routers_loss": 0.038227908313274384,
"skip_count": 2.0,
"step": 2132,
"text_loss": 0.4401201903820038
@@ -20271,13 +20271,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.046875,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0009374806426533104,
- "loss": 0.0116,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 3440938.0,
"repeat_count": 0.0,
- "routers_loss": 0.007323687430471182,
+ "routers_loss": 0.006901399698108435,
"skip_count": 0.0,
"step": 2134,
"text_loss": 0.5948942303657532
@@ -20290,13 +20290,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051513671875,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009373306931457827,
- "loss": 0.0122,
+ "loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 3444028.0,
"repeat_count": 0.0,
- "routers_loss": 0.003302243771031499,
+ "routers_loss": 0.0037061909679323435,
"skip_count": 0.0,
"step": 2136,
"text_loss": 0.5349751114845276
@@ -20309,13 +20309,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047607421875,
+ "grad_norm": 0.056884765625,
"learning_rate": 0.0009371805760523086,
- "loss": 0.0113,
+ "loss": 0.0111,
"macro_f1": 0.3333333432674408,
"num_tokens": 3448331.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027974818367511034,
+ "routers_loss": 0.0025877030566334724,
"skip_count": 0.0,
"step": 2138,
"text_loss": 0.4591051936149597
@@ -20328,13 +20328,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009370302914304129,
- "loss": 0.0145,
+ "loss": 0.0144,
"macro_f1": 0.5934640765190125,
"num_tokens": 3451434.0,
"repeat_count": 0.0,
- "routers_loss": 0.01572767272591591,
+ "routers_loss": 0.018742674961686134,
"skip_count": 3.0,
"step": 2140,
"text_loss": 0.23470863699913025
@@ -20347,13 +20347,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06201171875,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009368798393376851,
- "loss": 0.0119,
+ "loss": 0.0122,
"macro_f1": 0.3272727429866791,
"num_tokens": 3454375.0,
"repeat_count": 0.0,
- "routers_loss": 0.020721890032291412,
+ "routers_loss": 0.02382594160735607,
"skip_count": 1.0,
"step": 2142,
"text_loss": 0.6077954769134521
@@ -20366,13 +20366,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.05517578125,
"learning_rate": 0.0009367292198317787,
- "loss": 0.0161,
+ "loss": 0.0164,
"macro_f1": 0.5492662787437439,
"num_tokens": 3457591.0,
"repeat_count": 0.0,
- "routers_loss": 0.03272393345832825,
+ "routers_loss": 0.03331060707569122,
"skip_count": 2.0,
"step": 2144,
"text_loss": 0.3691073954105377
@@ -20385,13 +20385,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052490234375,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0009365784329704115,
- "loss": 0.0191,
+ "loss": 0.0186,
"macro_f1": 0.3333333432674408,
"num_tokens": 3460895.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017473002662882209,
+ "routers_loss": 0.0016955457394942641,
"skip_count": 0.0,
"step": 2146,
"text_loss": 0.3947436511516571
@@ -20404,13 +20404,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.0009364274788113651,
- "loss": 0.0094,
+ "loss": 0.0096,
"macro_f1": 0.6666666865348816,
"num_tokens": 3464101.0,
"repeat_count": 1.0,
- "routers_loss": 0.008070237934589386,
+ "routers_loss": 0.006169239990413189,
"skip_count": 0.0,
"step": 2148,
"text_loss": 0.3348555266857147
@@ -20423,13 +20423,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0009362763574124858,
- "loss": 0.0191,
+ "loss": 0.019,
"macro_f1": 0.9265305995941162,
"num_tokens": 3467417.0,
"repeat_count": 3.0,
- "routers_loss": 0.021709222346544266,
+ "routers_loss": 0.024033790454268456,
"skip_count": 1.0,
"step": 2150,
"text_loss": 0.496633380651474
@@ -20442,13 +20442,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.046630859375,
+ "grad_norm": 0.042724609375,
"learning_rate": 0.0009361250688316829,
- "loss": 0.014,
+ "loss": 0.0142,
"macro_f1": 0.3333333432674408,
"num_tokens": 3470917.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022237664088606834,
+ "routers_loss": 0.0024986129719763994,
"skip_count": 0.0,
"step": 2152,
"text_loss": 0.6857671737670898
@@ -20461,13 +20461,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.0546875,
"learning_rate": 0.0009359736131269312,
"loss": 0.0153,
"macro_f1": 0.6666666865348816,
"num_tokens": 3473624.0,
"repeat_count": 0.0,
- "routers_loss": 0.00838750321418047,
+ "routers_loss": 0.008183322846889496,
"skip_count": 1.0,
"step": 2154,
"text_loss": 0.13883116841316223
@@ -20480,13 +20480,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0576171875,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009358219903562684,
- "loss": 0.01,
+ "loss": 0.0106,
"macro_f1": 0.6666666865348816,
"num_tokens": 3476472.0,
"repeat_count": 0.0,
- "routers_loss": 0.010190514847636223,
+ "routers_loss": 0.011198793537914753,
"skip_count": 3.0,
"step": 2156,
"text_loss": 0.24243666231632233
@@ -20499,13 +20499,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.04296875,
"learning_rate": 0.0009356702005777969,
- "loss": 0.0124,
+ "loss": 0.0125,
"macro_f1": 0.3333333432674408,
"num_tokens": 3479688.0,
"repeat_count": 0.0,
- "routers_loss": 0.002411153633147478,
+ "routers_loss": 0.002520184963941574,
"skip_count": 0.0,
"step": 2158,
"text_loss": 0.6407818794250488
@@ -20518,13 +20518,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009355182438496825,
- "loss": 0.0141,
+ "loss": 0.0142,
"macro_f1": 0.3333333432674408,
"num_tokens": 3482598.0,
"repeat_count": 0.0,
- "routers_loss": 0.001032356172800064,
+ "routers_loss": 0.0011065017897635698,
"skip_count": 0.0,
"step": 2160,
"text_loss": 0.7214245796203613
@@ -20537,13 +20537,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05908203125,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0009353661202301557,
- "loss": 0.0147,
+ "loss": 0.0144,
"macro_f1": 0.3333333432674408,
"num_tokens": 3486271.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022046815138310194,
+ "routers_loss": 0.0017824085662141442,
"skip_count": 0.0,
"step": 2162,
"text_loss": 0.5140969157218933
@@ -20556,32 +20556,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051513671875,
+ "grad_norm": 0.053466796875,
"learning_rate": 0.0009352138297775101,
"loss": 0.0145,
"macro_f1": 0.3333333432674408,
"num_tokens": 3489206.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014977266546338797,
+ "routers_loss": 0.001542879967018962,
"skip_count": 0.0,
"step": 2164,
"text_loss": 0.7956416606903076
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6666666865348816,
- "avg_layers": 26.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
"epoch": 10.169063692398003,
- "f1_execute": 0.9803921580314636,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.800000011920929,
+ "f1_skip": 1.0,
"grad_norm": 0.0771484375,
"learning_rate": 0.000935061372550104,
- "loss": 0.0132,
- "macro_f1": 0.5934640765190125,
+ "loss": 0.0134,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3492003.0,
"repeat_count": 0.0,
- "routers_loss": 0.016847684979438782,
+ "routers_loss": 0.01420794241130352,
"skip_count": 3.0,
"step": 2166,
"text_loss": 0.27489882707595825
@@ -20594,13 +20594,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0009349087486063594,
- "loss": 0.0168,
+ "loss": 0.0166,
"macro_f1": 0.6666666865348816,
"num_tokens": 3494784.0,
"repeat_count": 0.0,
- "routers_loss": 0.0036806222051382065,
+ "routers_loss": 0.003614309709519148,
"skip_count": 1.0,
"step": 2168,
"text_loss": 0.2962227761745453
@@ -20613,13 +20613,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009347559580047618,
- "loss": 0.0174,
+ "loss": 0.0175,
"macro_f1": 0.8814815282821655,
"num_tokens": 3497886.0,
"repeat_count": 2.0,
- "routers_loss": 0.021412594243884087,
+ "routers_loss": 0.02122853323817253,
"skip_count": 4.0,
"step": 2170,
"text_loss": 0.5919580459594727
@@ -20627,18 +20627,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 10.197240974464338,
- "f1_execute": 1.0,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.06591796875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.000934603000803861,
- "loss": 0.0134,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0135,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 3500939.0,
"repeat_count": 0.0,
- "routers_loss": 0.0201424453407526,
+ "routers_loss": 0.02042219042778015,
"skip_count": 1.0,
"step": 2172,
"text_loss": 0.28722381591796875
@@ -20651,13 +20651,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05419921875,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009344498770622704,
- "loss": 0.0131,
+ "loss": 0.013,
"macro_f1": 0.3333333432674408,
"num_tokens": 3504852.0,
"repeat_count": 0.0,
- "routers_loss": 0.005059401970356703,
+ "routers_loss": 0.004345106892287731,
"skip_count": 0.0,
"step": 2174,
"text_loss": 0.603236734867096
@@ -20670,13 +20670,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.1064453125,
"learning_rate": 0.0009342965868386673,
"loss": 0.0101,
"macro_f1": 0.3333333432674408,
"num_tokens": 3508320.0,
"repeat_count": 0.0,
- "routers_loss": 0.004006600938737392,
+ "routers_loss": 0.00368050136603415,
"skip_count": 0.0,
"step": 2176,
"text_loss": 0.6020491719245911
@@ -20691,11 +20691,11 @@
"f1_skip": 0.0,
"grad_norm": 0.060302734375,
"learning_rate": 0.000934143130191793,
- "loss": 0.0109,
+ "loss": 0.0108,
"macro_f1": 0.3333333432674408,
"num_tokens": 3511278.0,
"repeat_count": 0.0,
- "routers_loss": 0.013246738351881504,
+ "routers_loss": 0.013425769284367561,
"skip_count": 0.0,
"step": 2178,
"text_loss": 0.5954724550247192
@@ -20708,13 +20708,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06005859375,
+ "grad_norm": 0.060546875,
"learning_rate": 0.000933989507180452,
- "loss": 0.0151,
+ "loss": 0.0149,
"macro_f1": 0.3333333432674408,
"num_tokens": 3514361.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031937146559357643,
+ "routers_loss": 0.002896249992772937,
"skip_count": 0.0,
"step": 2180,
"text_loss": 0.39175131916999817
@@ -20727,13 +20727,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0556640625,
+ "grad_norm": 0.052978515625,
"learning_rate": 0.0009338357178635135,
- "loss": 0.0151,
+ "loss": 0.0147,
"macro_f1": 0.6603773832321167,
"num_tokens": 3517962.0,
"repeat_count": 1.0,
- "routers_loss": 0.014782631769776344,
+ "routers_loss": 0.011538350023329258,
"skip_count": 1.0,
"step": 2182,
"text_loss": 0.4482830762863159
@@ -20746,13 +20746,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0009336817622999093,
- "loss": 0.0112,
+ "loss": 0.011,
"macro_f1": 0.3272727429866791,
"num_tokens": 3521299.0,
"repeat_count": 1.0,
- "routers_loss": 0.02318345196545124,
+ "routers_loss": 0.022787930443882942,
"skip_count": 0.0,
"step": 2184,
"text_loss": 0.35177817940711975
@@ -20765,13 +20765,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.055419921875,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009335276405486357,
- "loss": 0.0134,
+ "loss": 0.0139,
"macro_f1": 0.3272727429866791,
"num_tokens": 3524611.0,
"repeat_count": 0.0,
- "routers_loss": 0.011735675856471062,
+ "routers_loss": 0.011597735807299614,
"skip_count": 1.0,
"step": 2186,
"text_loss": 0.24868851900100708
@@ -20784,13 +20784,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009333733526687524,
- "loss": 0.0198,
+ "loss": 0.0196,
"macro_f1": 0.3333333432674408,
"num_tokens": 3528012.0,
"repeat_count": 0.0,
- "routers_loss": 0.01558679062873125,
+ "routers_loss": 0.014253967441618443,
"skip_count": 0.0,
"step": 2188,
"text_loss": 0.3970910310745239
@@ -20803,13 +20803,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056396484375,
+ "grad_norm": 0.054931640625,
"learning_rate": 0.000933218898719383,
- "loss": 0.0163,
+ "loss": 0.0162,
"macro_f1": 0.3333333432674408,
"num_tokens": 3530908.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019149131840094924,
+ "routers_loss": 0.001659149187617004,
"skip_count": 0.0,
"step": 2190,
"text_loss": 0.7618573307991028
@@ -20822,13 +20822,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07958984375,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009330642787597141,
- "loss": 0.0161,
+ "loss": 0.0159,
"macro_f1": 0.3333333432674408,
"num_tokens": 3533993.0,
"repeat_count": 0.0,
- "routers_loss": 0.0056966920383274555,
+ "routers_loss": 0.005574346985667944,
"skip_count": 0.0,
"step": 2192,
"text_loss": 0.16470147669315338
@@ -20841,13 +20841,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07080078125,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009329094928489969,
"loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 3537310.0,
"repeat_count": 0.0,
- "routers_loss": 0.002511024009436369,
+ "routers_loss": 0.0026400673668831587,
"skip_count": 0.0,
"step": 2194,
"text_loss": 0.3400416374206543
@@ -20860,13 +20860,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08935546875,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009327545410465452,
- "loss": 0.0126,
+ "loss": 0.0124,
"macro_f1": 0.6666666865348816,
"num_tokens": 3540045.0,
"repeat_count": 0.0,
- "routers_loss": 0.008584192954003811,
+ "routers_loss": 0.008448398672044277,
"skip_count": 3.0,
"step": 2196,
"text_loss": 0.3110542297363281
@@ -20879,13 +20879,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.0009325994234117372,
- "loss": 0.0129,
+ "loss": 0.0122,
"macro_f1": 0.32098764181137085,
"num_tokens": 3544097.0,
"repeat_count": 0.0,
- "routers_loss": 0.03748156875371933,
+ "routers_loss": 0.037553198635578156,
"skip_count": 2.0,
"step": 2198,
"text_loss": 0.36126700043678284
@@ -20898,13 +20898,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.09716796875,
"learning_rate": 0.000932444140004014,
- "loss": 0.0129,
+ "loss": 0.0124,
"macro_f1": 0.6666666865348816,
"num_tokens": 3547054.0,
"repeat_count": 1.0,
- "routers_loss": 0.006402099970728159,
+ "routers_loss": 0.006464479025453329,
"skip_count": 0.0,
"step": 2200,
"text_loss": 0.4947047233581543
@@ -20917,13 +20917,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.158203125,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009322886908828805,
- "loss": 0.015,
+ "loss": 0.0138,
"macro_f1": 0.6666666865348816,
"num_tokens": 3549903.0,
"repeat_count": 1.0,
- "routers_loss": 0.0055928584188222885,
+ "routers_loss": 0.005384812597185373,
"skip_count": 0.0,
"step": 2202,
"text_loss": 0.5923738479614258
@@ -20936,13 +20936,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009321330761079052,
"loss": 0.0149,
"macro_f1": 0.6666666865348816,
"num_tokens": 3553745.0,
"repeat_count": 0.0,
- "routers_loss": 0.013155708089470863,
+ "routers_loss": 0.015346619300544262,
"skip_count": 2.0,
"step": 2204,
"text_loss": 0.1904175877571106
@@ -20955,13 +20955,13 @@
"f1_execute": 0.9268292784690857,
"f1_repeat": 0.800000011920929,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.06494140625,
"learning_rate": 0.00093197729573872,
- "loss": 0.0206,
+ "loss": 0.0203,
"macro_f1": 0.8422764539718628,
"num_tokens": 3557235.0,
"repeat_count": 3.0,
- "routers_loss": 0.12029488384723663,
+ "routers_loss": 0.1207597479224205,
"skip_count": 6.0,
"step": 2206,
"text_loss": 0.3904837667942047
@@ -20974,13 +20974,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0771484375,
"learning_rate": 0.0009318213498350202,
- "loss": 0.011,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3560795.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037007431965321302,
+ "routers_loss": 0.003334777895361185,
"skip_count": 0.0,
"step": 2208,
"text_loss": 0.4268290102481842
@@ -20993,13 +20993,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.0537109375,
"learning_rate": 0.0009316652384565645,
- "loss": 0.0124,
+ "loss": 0.0123,
"macro_f1": 0.3333333432674408,
"num_tokens": 3563754.0,
"repeat_count": 0.0,
- "routers_loss": 0.004071404226124287,
+ "routers_loss": 0.004230072256177664,
"skip_count": 0.0,
"step": 2210,
"text_loss": 0.40049710869789124
@@ -21012,13 +21012,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.046875,
"learning_rate": 0.0009315089616631751,
- "loss": 0.0103,
+ "loss": 0.0106,
"macro_f1": 0.3333333432674408,
"num_tokens": 3567173.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006955390563234687,
+ "routers_loss": 0.0006645230459980667,
"skip_count": 0.0,
"step": 2212,
"text_loss": 0.42568323016166687
@@ -21031,32 +21031,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0849609375,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009313525195147376,
- "loss": 0.0128,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 3570831.0,
"repeat_count": 0.0,
- "routers_loss": 0.010293997824192047,
+ "routers_loss": 0.0097877848893404,
"skip_count": 0.0,
"step": 2214,
"text_loss": 0.45808279514312744
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.5,
"acc_skip": 0.3333333432674408,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 10.40387437628412,
- "f1_execute": 0.9583333134651184,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 0.5,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.076171875,
"learning_rate": 0.000931195912071201,
- "loss": 0.0185,
- "macro_f1": 0.8194444179534912,
+ "loss": 0.0187,
+ "macro_f1": 0.7018141150474548,
"num_tokens": 3573745.0,
"repeat_count": 2.0,
- "routers_loss": 0.06593773514032364,
+ "routers_loss": 0.07351134717464447,
"skip_count": 3.0,
"step": 2216,
"text_loss": 0.285696804523468
@@ -21069,13 +21069,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009310391393925775,
- "loss": 0.013,
+ "loss": 0.0125,
"macro_f1": 0.3333333432674408,
"num_tokens": 3576785.0,
"repeat_count": 0.0,
- "routers_loss": 0.00347105972468853,
+ "routers_loss": 0.0033160944003611803,
"skip_count": 0.0,
"step": 2218,
"text_loss": 0.17516443133354187
@@ -21088,32 +21088,32 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.04736328125,
+ "grad_norm": 0.047119140625,
"learning_rate": 0.0009308822015389424,
- "loss": 0.0244,
+ "loss": 0.0241,
"macro_f1": 0.5427350401878357,
"num_tokens": 3580695.0,
"repeat_count": 1.0,
- "routers_loss": 0.04871147498488426,
+ "routers_loss": 0.052930232137441635,
"skip_count": 1.0,
"step": 2220,
"text_loss": 0.5918155908584595
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 26.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
"epoch": 10.432051658350455,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.05517578125,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.072265625,
"learning_rate": 0.0009307250985704352,
- "loss": 0.012,
- "macro_f1": 0.542222261428833,
+ "loss": 0.0128,
+ "macro_f1": 0.6122449040412903,
"num_tokens": 3583729.0,
"repeat_count": 0.0,
- "routers_loss": 0.024859672412276268,
+ "routers_loss": 0.025454653427004814,
"skip_count": 4.0,
"step": 2222,
"text_loss": 0.2652169466018677
@@ -21126,13 +21126,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.052001953125,
"learning_rate": 0.0009305678305472575,
- "loss": 0.016,
+ "loss": 0.0158,
"macro_f1": 0.3333333432674408,
"num_tokens": 3586775.0,
"repeat_count": 0.0,
- "routers_loss": 0.010990055277943611,
+ "routers_loss": 0.011279845610260963,
"skip_count": 0.0,
"step": 2224,
"text_loss": 0.3511691987514496
@@ -21145,13 +21145,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.000930410397529675,
- "loss": 0.0171,
+ "loss": 0.017,
"macro_f1": 0.3333333432674408,
"num_tokens": 3589676.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025031559634953737,
+ "routers_loss": 0.002700264798477292,
"skip_count": 0.0,
"step": 2226,
"text_loss": 0.24045433104038239
@@ -21164,13 +21164,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.048095703125,
"learning_rate": 0.000930252799578016,
- "loss": 0.0147,
+ "loss": 0.0146,
"macro_f1": 1.0,
"num_tokens": 3593242.0,
"repeat_count": 1.0,
- "routers_loss": 0.008100497536361217,
+ "routers_loss": 0.00826631672680378,
"skip_count": 2.0,
"step": 2228,
"text_loss": 0.3777645528316498
@@ -21183,13 +21183,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0009300950367526728,
- "loss": 0.0128,
+ "loss": 0.0131,
"macro_f1": 0.8820862174034119,
"num_tokens": 3596807.0,
"repeat_count": 2.0,
- "routers_loss": 0.03150207921862602,
+ "routers_loss": 0.036221496760845184,
"skip_count": 2.0,
"step": 2230,
"text_loss": 0.502962589263916
@@ -21202,13 +21202,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009299371091141001,
- "loss": 0.0132,
+ "loss": 0.0131,
"macro_f1": 0.3333333432674408,
"num_tokens": 3600150.0,
"repeat_count": 0.0,
- "routers_loss": 0.006253884173929691,
+ "routers_loss": 0.006449893582612276,
"skip_count": 0.0,
"step": 2232,
"text_loss": 0.20256924629211426
@@ -21221,13 +21221,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.046142578125,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.0009297790167228161,
- "loss": 0.0119,
+ "loss": 0.012,
"macro_f1": 0.6666666865348816,
"num_tokens": 3602988.0,
"repeat_count": 0.0,
- "routers_loss": 0.007228068076074123,
+ "routers_loss": 0.007872486487030983,
"skip_count": 2.0,
"step": 2234,
"text_loss": 0.42476826906204224
@@ -21240,13 +21240,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0009296207596394022,
- "loss": 0.0103,
+ "loss": 0.0101,
"macro_f1": 0.32098764181137085,
"num_tokens": 3606071.0,
"repeat_count": 0.0,
- "routers_loss": 0.02524643763899803,
+ "routers_loss": 0.027397040277719498,
"skip_count": 2.0,
"step": 2236,
"text_loss": 0.23432791233062744
@@ -21259,13 +21259,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.0009294623379245028,
- "loss": 0.0119,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 3609389.0,
"repeat_count": 0.0,
- "routers_loss": 0.009672109968960285,
+ "routers_loss": 0.01042645052075386,
"skip_count": 0.0,
"step": 2238,
"text_loss": 0.16665785014629364
@@ -21278,13 +21278,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0009293037516388252,
- "loss": 0.0155,
+ "loss": 0.0161,
"macro_f1": 0.3333333432674408,
"num_tokens": 3612105.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010066524846479297,
+ "routers_loss": 0.0012458425480872393,
"skip_count": 0.0,
"step": 2240,
"text_loss": 0.59421306848526
@@ -21297,13 +21297,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0009291450008431404,
- "loss": 0.0184,
+ "loss": 0.0185,
"macro_f1": 1.0,
"num_tokens": 3615439.0,
"repeat_count": 1.0,
- "routers_loss": 0.005509128328412771,
+ "routers_loss": 0.005781981628388166,
"skip_count": 1.0,
"step": 2242,
"text_loss": 0.510798454284668
@@ -21316,13 +21316,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.09423828125,
+ "grad_norm": 0.0966796875,
"learning_rate": 0.0009289860855982814,
- "loss": 0.0172,
+ "loss": 0.0166,
"macro_f1": 0.4871794879436493,
"num_tokens": 3618842.0,
"repeat_count": 0.0,
- "routers_loss": 0.030802007764577866,
+ "routers_loss": 0.031195320188999176,
"skip_count": 3.0,
"step": 2244,
"text_loss": 0.7574363350868225
@@ -21335,13 +21335,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.0009288270059651454,
"loss": 0.0133,
"macro_f1": 0.3333333432674408,
"num_tokens": 3621823.0,
"repeat_count": 0.0,
- "routers_loss": 0.001686889911070466,
+ "routers_loss": 0.001746491645462811,
"skip_count": 0.0,
"step": 2246,
"text_loss": 0.5125683546066284
@@ -21354,13 +21354,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.220703125,
"learning_rate": 0.0009286677620046918,
- "loss": 0.0163,
+ "loss": 0.0159,
"macro_f1": 0.5492662787437439,
"num_tokens": 3624502.0,
"repeat_count": 0.0,
- "routers_loss": 0.03299177065491676,
+ "routers_loss": 0.03792348504066467,
"skip_count": 2.0,
"step": 2248,
"text_loss": 0.7533677220344543
@@ -21375,11 +21375,11 @@
"f1_skip": 0.0,
"grad_norm": 0.07763671875,
"learning_rate": 0.0009285083537779429,
- "loss": 0.0119,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 3627057.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010354233672842383,
+ "routers_loss": 0.0009684451506473124,
"skip_count": 0.0,
"step": 2250,
"text_loss": 0.2219279706478119
@@ -21392,13 +21392,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.11767578125,
"learning_rate": 0.0009283487813459845,
- "loss": 0.0145,
+ "loss": 0.0148,
"macro_f1": 0.5492662787437439,
"num_tokens": 3629720.0,
"repeat_count": 0.0,
- "routers_loss": 0.02196674607694149,
+ "routers_loss": 0.022757573053240776,
"skip_count": 2.0,
"step": 2252,
"text_loss": 0.6903313994407654
@@ -21411,13 +21411,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1376953125,
"learning_rate": 0.0009281890447699652,
"loss": 0.015,
"macro_f1": 0.6666666865348816,
"num_tokens": 3633234.0,
"repeat_count": 1.0,
- "routers_loss": 0.002239946974441409,
+ "routers_loss": 0.003613058477640152,
"skip_count": 0.0,
"step": 2254,
"text_loss": 0.6278893351554871
@@ -21430,13 +21430,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.046142578125,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0009280291441110961,
- "loss": 0.0117,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 3636289.0,
"repeat_count": 0.0,
- "routers_loss": 0.0063575254753232,
+ "routers_loss": 0.006214062683284283,
"skip_count": 0.0,
"step": 2256,
"text_loss": 0.3011114001274109
@@ -21449,13 +21449,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.040283203125,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0009278690794306517,
- "loss": 0.0143,
+ "loss": 0.014,
"macro_f1": 0.5492662787437439,
"num_tokens": 3640251.0,
"repeat_count": 0.0,
- "routers_loss": 0.0524379126727581,
+ "routers_loss": 0.052556321024894714,
"skip_count": 2.0,
"step": 2258,
"text_loss": 0.19894185662269592
@@ -21468,13 +21468,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.8571428656578064,
"f1_skip": 1.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.08251953125,
"learning_rate": 0.0009277088507899689,
- "loss": 0.0156,
+ "loss": 0.0163,
"macro_f1": 0.9452888369560242,
"num_tokens": 3643527.0,
"repeat_count": 4.0,
- "routers_loss": 0.052486274391412735,
+ "routers_loss": 0.0572301521897316,
"skip_count": 1.0,
"step": 2260,
"text_loss": 0.5593410134315491
@@ -21487,13 +21487,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041748046875,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.0009275484582504475,
"loss": 0.0104,
"macro_f1": 0.3333333432674408,
"num_tokens": 3646959.0,
"repeat_count": 0.0,
- "routers_loss": 0.006877690553665161,
+ "routers_loss": 0.008010074496269226,
"skip_count": 0.0,
"step": 2262,
"text_loss": 0.2128177285194397
@@ -21506,13 +21506,13 @@
"f1_execute": 0.95652174949646,
"f1_repeat": 0.800000011920929,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.05322265625,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0009273879018735505,
- "loss": 0.0136,
+ "loss": 0.0138,
"macro_f1": 0.8521739840507507,
"num_tokens": 3651298.0,
"repeat_count": 3.0,
- "routers_loss": 0.03128742054104805,
+ "routers_loss": 0.035729870200157166,
"skip_count": 3.0,
"step": 2264,
"text_loss": 0.2987811267375946
@@ -21525,13 +21525,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009272271817208031,
- "loss": 0.0188,
+ "loss": 0.0182,
"macro_f1": 0.3333333432674408,
"num_tokens": 3655609.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028425443451851606,
+ "routers_loss": 0.002379779238253832,
"skip_count": 0.0,
"step": 2266,
"text_loss": 0.6024088263511658
@@ -21544,13 +21544,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06689453125,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009270662978537939,
- "loss": 0.0101,
+ "loss": 0.0098,
"macro_f1": 0.3333333432674408,
"num_tokens": 3658444.0,
"repeat_count": 0.0,
- "routers_loss": 0.009712206199765205,
+ "routers_loss": 0.008943650871515274,
"skip_count": 0.0,
"step": 2268,
"text_loss": 0.1741207242012024
@@ -21563,13 +21563,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0009269052503341736,
- "loss": 0.0162,
+ "loss": 0.0161,
"macro_f1": 0.6595745086669922,
"num_tokens": 3662282.0,
"repeat_count": 1.0,
- "routers_loss": 0.03980376198887825,
+ "routers_loss": 0.030201267451047897,
"skip_count": 4.0,
"step": 2270,
"text_loss": 0.7300035953521729
@@ -21582,13 +21582,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0009267440392236562,
- "loss": 0.0098,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 3665531.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030603872146457434,
+ "routers_loss": 0.0026635683607310057,
"skip_count": 0.0,
"step": 2272,
"text_loss": 0.31535038352012634
@@ -21601,13 +21601,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0009265826645840178,
"loss": 0.0151,
"macro_f1": 0.3333333432674408,
"num_tokens": 3668407.0,
"repeat_count": 0.0,
- "routers_loss": 0.004795679822564125,
+ "routers_loss": 0.004258926957845688,
"skip_count": 0.0,
"step": 2274,
"text_loss": 0.7272579073905945
@@ -21620,13 +21620,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.1435546875,
+ "grad_norm": 0.125,
"learning_rate": 0.0009264211264770976,
- "loss": 0.0155,
+ "loss": 0.0154,
"macro_f1": 0.6122449040412903,
"num_tokens": 3671503.0,
"repeat_count": 0.0,
- "routers_loss": 0.0340447798371315,
+ "routers_loss": 0.038987524807453156,
"skip_count": 4.0,
"step": 2276,
"text_loss": 0.7488982677459717
@@ -21639,13 +21639,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0009262594249647975,
- "loss": 0.016,
+ "loss": 0.0164,
"macro_f1": 0.6666666865348816,
"num_tokens": 3674107.0,
"repeat_count": 0.0,
- "routers_loss": 0.007436402142047882,
+ "routers_loss": 0.007211760152131319,
"skip_count": 1.0,
"step": 2278,
"text_loss": 0.1992369294166565
@@ -21658,13 +21658,13 @@
"f1_execute": 0.9767441749572754,
"f1_repeat": 0.8571428656578064,
"f1_skip": 1.0,
- "grad_norm": 0.056396484375,
+ "grad_norm": 0.0546875,
"learning_rate": 0.0009260975601090815,
- "loss": 0.0113,
+ "loss": 0.0112,
"macro_f1": 0.9446290731430054,
"num_tokens": 3677184.0,
"repeat_count": 4.0,
- "routers_loss": 0.02465176396071911,
+ "routers_loss": 0.02538592554628849,
"skip_count": 3.0,
"step": 2280,
"text_loss": 0.46402135491371155
@@ -21677,13 +21677,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0009259355319719768,
- "loss": 0.0167,
+ "loss": 0.0162,
"macro_f1": 0.3333333432674408,
"num_tokens": 3680683.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037910486571490765,
+ "routers_loss": 0.0038464947137981653,
"skip_count": 0.0,
"step": 2282,
"text_loss": 0.5804527401924133
@@ -21696,13 +21696,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009257733406155726,
- "loss": 0.0161,
+ "loss": 0.0169,
"macro_f1": 0.3333333432674408,
"num_tokens": 3683928.0,
"repeat_count": 0.0,
- "routers_loss": 0.003716849023476243,
+ "routers_loss": 0.004841136280447245,
"skip_count": 0.0,
"step": 2284,
"text_loss": 0.4834538400173187
@@ -21715,13 +21715,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0009256109861020212,
- "loss": 0.0118,
+ "loss": 0.0115,
"macro_f1": 0.3333333432674408,
"num_tokens": 3687101.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021690395660698414,
+ "routers_loss": 0.002191900508478284,
"skip_count": 0.0,
"step": 2286,
"text_loss": 0.8199604749679565
@@ -21734,13 +21734,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.000925448468493537,
"loss": 0.0162,
"macro_f1": 0.5427350401878357,
"num_tokens": 3690490.0,
"repeat_count": 1.0,
- "routers_loss": 0.034040264785289764,
+ "routers_loss": 0.03488675877451897,
"skip_count": 2.0,
"step": 2288,
"text_loss": 0.33263635635375977
@@ -21753,32 +21753,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009252857878523971,
- "loss": 0.0133,
+ "loss": 0.0134,
"macro_f1": 0.6666666865348816,
"num_tokens": 3694109.0,
"repeat_count": 1.0,
- "routers_loss": 0.0027822356205433607,
+ "routers_loss": 0.002897309372201562,
"skip_count": 0.0,
"step": 2290,
"text_loss": 0.47494807839393616
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 10.760786615791018,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.0634765625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05810546875,
"learning_rate": 0.000925122944240941,
- "loss": 0.0156,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3697233.0,
"repeat_count": 0.0,
- "routers_loss": 0.020813947543501854,
+ "routers_loss": 0.01842675730586052,
"skip_count": 2.0,
"step": 2292,
"text_loss": 0.14693495631217957
@@ -21791,13 +21791,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.042236328125,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0009249599377215707,
- "loss": 0.0145,
+ "loss": 0.0146,
"macro_f1": 0.5866667032241821,
"num_tokens": 3700376.0,
"repeat_count": 1.0,
- "routers_loss": 0.038725610822439194,
+ "routers_loss": 0.04169808700680733,
"skip_count": 3.0,
"step": 2294,
"text_loss": 0.38051268458366394
@@ -21810,13 +21810,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059326171875,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0009247967683567507,
- "loss": 0.0117,
+ "loss": 0.0112,
"macro_f1": 0.3272727429866791,
"num_tokens": 3703212.0,
"repeat_count": 0.0,
- "routers_loss": 0.01360203418880701,
+ "routers_loss": 0.012183113023638725,
"skip_count": 1.0,
"step": 2296,
"text_loss": 0.23789077997207642
@@ -21829,13 +21829,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.05712890625,
"learning_rate": 0.0009246334362090077,
- "loss": 0.0135,
+ "loss": 0.0137,
"macro_f1": 0.8823530077934265,
"num_tokens": 3706490.0,
"repeat_count": 1.0,
- "routers_loss": 0.021909991279244423,
+ "routers_loss": 0.01880069635808468,
"skip_count": 2.0,
"step": 2298,
"text_loss": 0.29067978262901306
@@ -21848,13 +21848,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.08203125,
"learning_rate": 0.000924469941340931,
- "loss": 0.0175,
+ "loss": 0.0173,
"macro_f1": 0.3272727429866791,
"num_tokens": 3709804.0,
"repeat_count": 1.0,
- "routers_loss": 0.03153124824166298,
+ "routers_loss": 0.027359159663319588,
"skip_count": 0.0,
"step": 2300,
"text_loss": 0.67828369140625
@@ -21867,13 +21867,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.000924306283815172,
- "loss": 0.0154,
+ "loss": 0.0153,
"macro_f1": 0.3333333432674408,
"num_tokens": 3712824.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034419491421431303,
+ "routers_loss": 0.003152279881760478,
"skip_count": 0.0,
"step": 2302,
"text_loss": 0.8333184719085693
@@ -21886,13 +21886,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009241424636944445,
- "loss": 0.0163,
+ "loss": 0.0159,
"macro_f1": 0.5492662787437439,
"num_tokens": 3715385.0,
"repeat_count": 0.0,
- "routers_loss": 0.03655214607715607,
+ "routers_loss": 0.0442950464785099,
"skip_count": 2.0,
"step": 2304,
"text_loss": 0.41893699765205383
@@ -21905,13 +21905,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0576171875,
+ "grad_norm": 0.058837890625,
"learning_rate": 0.0009239784810415249,
- "loss": 0.014,
+ "loss": 0.0137,
"macro_f1": 0.8823530077934265,
"num_tokens": 3719080.0,
"repeat_count": 1.0,
- "routers_loss": 0.015360959805548191,
+ "routers_loss": 0.015729321166872978,
"skip_count": 2.0,
"step": 2306,
"text_loss": 0.13360483944416046
@@ -21924,13 +21924,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.0537109375,
+ "grad_norm": 0.06787109375,
"learning_rate": 0.0009238143359192514,
"loss": 0.0136,
"macro_f1": 0.5934640765190125,
"num_tokens": 3722439.0,
"repeat_count": 0.0,
- "routers_loss": 0.027275927364826202,
+ "routers_loss": 0.028816604986786842,
"skip_count": 3.0,
"step": 2308,
"text_loss": 0.39594101905822754
@@ -21943,13 +21943,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0546875,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.000923650028390525,
- "loss": 0.0163,
+ "loss": 0.0166,
"macro_f1": 0.6666666865348816,
"num_tokens": 3725092.0,
"repeat_count": 0.0,
- "routers_loss": 0.003742894157767296,
+ "routers_loss": 0.0036455015651881695,
"skip_count": 2.0,
"step": 2310,
"text_loss": 0.6169708371162415
@@ -21962,13 +21962,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0927734375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009234855585183086,
- "loss": 0.0135,
+ "loss": 0.014,
"macro_f1": 0.6666666865348816,
"num_tokens": 3728412.0,
"repeat_count": 0.0,
- "routers_loss": 0.009356650523841381,
+ "routers_loss": 0.007565604057163,
"skip_count": 1.0,
"step": 2312,
"text_loss": 0.21257059276103973
@@ -21983,11 +21983,11 @@
"f1_skip": 0.800000011920929,
"grad_norm": 0.0517578125,
"learning_rate": 0.0009233209263656273,
- "loss": 0.0189,
+ "loss": 0.0184,
"macro_f1": 0.9262410998344421,
"num_tokens": 3731467.0,
"repeat_count": 2.0,
- "routers_loss": 0.02852487564086914,
+ "routers_loss": 0.02510629966855049,
"skip_count": 3.0,
"step": 2314,
"text_loss": 0.21639840304851532
@@ -22000,13 +22000,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.057861328125,
"learning_rate": 0.0009231561319955684,
- "loss": 0.0151,
+ "loss": 0.0154,
"macro_f1": 0.3333333432674408,
"num_tokens": 3734906.0,
"repeat_count": 0.0,
- "routers_loss": 0.007533316500484943,
+ "routers_loss": 0.00872227642685175,
"skip_count": 0.0,
"step": 2316,
"text_loss": 0.35639774799346924
@@ -22019,13 +22019,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009229911754712815,
"loss": 0.0176,
"macro_f1": 0.3333333432674408,
"num_tokens": 3737943.0,
"repeat_count": 0.0,
- "routers_loss": 0.004666361026465893,
+ "routers_loss": 0.004695790819823742,
"skip_count": 0.0,
"step": 2318,
"text_loss": 0.5269573330879211
@@ -22038,32 +22038,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.0009228260568559781,
- "loss": 0.0117,
+ "loss": 0.0115,
"macro_f1": 0.3272727429866791,
"num_tokens": 3741833.0,
"repeat_count": 1.0,
- "routers_loss": 0.020992714911699295,
+ "routers_loss": 0.0217357836663723,
"skip_count": 0.0,
"step": 2320,
"text_loss": 0.5110208988189697
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 10.901673026122689,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.1416015625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1953125,
"learning_rate": 0.0009226607762129322,
- "loss": 0.0204,
- "macro_f1": 0.6603773832321167,
+ "loss": 0.0201,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 3744642.0,
"repeat_count": 1.0,
- "routers_loss": 0.047016773372888565,
+ "routers_loss": 0.05595960095524788,
"skip_count": 1.0,
"step": 2322,
"text_loss": 0.6291998624801636
@@ -22078,11 +22078,11 @@
"f1_skip": 0.0,
"grad_norm": 0.056884765625,
"learning_rate": 0.0009224953336054796,
- "loss": 0.0156,
+ "loss": 0.0161,
"macro_f1": 0.3333333432674408,
"num_tokens": 3748127.0,
"repeat_count": 0.0,
- "routers_loss": 0.006612313445657492,
+ "routers_loss": 0.0071634589694440365,
"skip_count": 0.0,
"step": 2324,
"text_loss": 0.7404762506484985
@@ -22095,13 +22095,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.000922329729097018,
- "loss": 0.0164,
+ "loss": 0.0169,
"macro_f1": 0.3333333432674408,
"num_tokens": 3751373.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012452995870262384,
+ "routers_loss": 0.0011676300782710314,
"skip_count": 0.0,
"step": 2326,
"text_loss": 0.2915459871292114
@@ -22114,13 +22114,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.055908203125,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0009221639627510075,
- "loss": 0.0128,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 3754518.0,
"repeat_count": 0.0,
- "routers_loss": 0.011379311792552471,
+ "routers_loss": 0.01039792038500309,
"skip_count": 0.0,
"step": 2328,
"text_loss": 0.22066321969032288
@@ -22133,13 +22133,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0009219980346309702,
- "loss": 0.0127,
+ "loss": 0.0128,
"macro_f1": 0.3333333432674408,
"num_tokens": 3757621.0,
"repeat_count": 0.0,
- "routers_loss": 0.002973968628793955,
+ "routers_loss": 0.0032070958986878395,
"skip_count": 0.0,
"step": 2330,
"text_loss": 0.5558560490608215
@@ -22152,13 +22152,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.076171875,
"learning_rate": 0.0009218319448004899,
- "loss": 0.012,
+ "loss": 0.0118,
"macro_f1": 0.3333333432674408,
"num_tokens": 3760885.0,
"repeat_count": 0.0,
- "routers_loss": 0.00768645154312253,
+ "routers_loss": 0.007085457909852266,
"skip_count": 0.0,
"step": 2332,
"text_loss": 0.4348253607749939
@@ -22171,13 +22171,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009216656933232129,
- "loss": 0.0167,
+ "loss": 0.016,
"macro_f1": 0.6666666865348816,
"num_tokens": 3764462.0,
"repeat_count": 0.0,
- "routers_loss": 0.006761785596609116,
+ "routers_loss": 0.005504854489117861,
"skip_count": 1.0,
"step": 2334,
"text_loss": 0.35828644037246704
@@ -22190,13 +22190,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0576171875,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0009214992802628463,
- "loss": 0.0129,
+ "loss": 0.0131,
"macro_f1": 0.3333333432674408,
"num_tokens": 3767159.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013711688807234168,
+ "routers_loss": 0.0013970810687169433,
"skip_count": 0.0,
"step": 2336,
"text_loss": 0.2956557869911194
@@ -22209,13 +22209,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.08203125,
"learning_rate": 0.0009213327056831607,
- "loss": 0.0174,
+ "loss": 0.0181,
"macro_f1": 0.3272727429866791,
"num_tokens": 3770408.0,
"repeat_count": 0.0,
- "routers_loss": 0.04009406641125679,
+ "routers_loss": 0.0427570566534996,
"skip_count": 1.0,
"step": 2338,
"text_loss": 0.14883014559745789
@@ -22228,13 +22228,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0009211659696479875,
- "loss": 0.0095,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 3773474.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013272224459797144,
+ "routers_loss": 0.0011273405980318785,
"skip_count": 0.0,
"step": 2340,
"text_loss": 0.26011669635772705
@@ -22249,11 +22249,11 @@
"f1_skip": 0.0,
"grad_norm": 0.059814453125,
"learning_rate": 0.00092099907222122,
- "loss": 0.0145,
+ "loss": 0.0148,
"macro_f1": 0.3333333432674408,
"num_tokens": 3776909.0,
"repeat_count": 0.0,
- "routers_loss": 0.001724833040498197,
+ "routers_loss": 0.0016178421210497618,
"skip_count": 0.0,
"step": 2342,
"text_loss": 0.49078530073165894
@@ -22266,13 +22266,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05908203125,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.000920832013466814,
- "loss": 0.0132,
+ "loss": 0.0129,
"macro_f1": 0.3333333432674408,
"num_tokens": 3780741.0,
"repeat_count": 0.0,
- "routers_loss": 0.005641496740281582,
+ "routers_loss": 0.005510095041245222,
"skip_count": 0.0,
"step": 2344,
"text_loss": 0.4870249927043915
@@ -22285,13 +22285,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.037109375,
"learning_rate": 0.0009206647934487866,
- "loss": 0.011,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 3784673.0,
"repeat_count": 1.0,
- "routers_loss": 0.003907595761120319,
+ "routers_loss": 0.0047357892617583275,
"skip_count": 0.0,
"step": 2346,
"text_loss": 0.3251725733280182
@@ -22304,13 +22304,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.057861328125,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0009204974122312167,
- "loss": 0.0141,
+ "loss": 0.0142,
"macro_f1": 0.6666666865348816,
"num_tokens": 3787503.0,
"repeat_count": 0.0,
- "routers_loss": 0.007570050656795502,
+ "routers_loss": 0.00795028731226921,
"skip_count": 1.0,
"step": 2348,
"text_loss": 0.18282145261764526
@@ -22323,13 +22323,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.055908203125,
+ "grad_norm": 0.060546875,
"learning_rate": 0.0009203298698782452,
- "loss": 0.0079,
+ "loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 3790528.0,
"repeat_count": 1.0,
- "routers_loss": 0.0009280897793360054,
+ "routers_loss": 0.0009506374481134117,
"skip_count": 0.0,
"step": 2350,
"text_loss": 0.4093080461025238
@@ -22342,13 +22342,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.045166015625,
+ "grad_norm": 0.047607421875,
"learning_rate": 0.0009201621664540747,
"loss": 0.0155,
"macro_f1": 0.6666666865348816,
"num_tokens": 3794134.0,
"repeat_count": 1.0,
- "routers_loss": 0.005288597662001848,
+ "routers_loss": 0.005159572698175907,
"skip_count": 0.0,
"step": 2352,
"text_loss": 0.5451981425285339
@@ -22361,13 +22361,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009199943020229694,
- "loss": 0.0146,
+ "loss": 0.0148,
"macro_f1": 0.3333333432674408,
"num_tokens": 3797414.0,
"repeat_count": 0.0,
- "routers_loss": 0.002237799344584346,
+ "routers_loss": 0.002356168581172824,
"skip_count": 0.0,
"step": 2354,
"text_loss": 0.3070453405380249
@@ -22380,13 +22380,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.0009198262766492554,
- "loss": 0.0144,
+ "loss": 0.0141,
"macro_f1": 0.6666666865348816,
"num_tokens": 3800094.0,
"repeat_count": 0.0,
- "routers_loss": 0.006226782687008381,
+ "routers_loss": 0.0051761893555521965,
"skip_count": 1.0,
"step": 2356,
"text_loss": 0.5880904197692871
@@ -22399,13 +22399,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.049072265625,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.00091965809039732,
- "loss": 0.0136,
+ "loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 3803280.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027645498048514128,
+ "routers_loss": 0.0025952060241252184,
"skip_count": 0.0,
"step": 2358,
"text_loss": 0.5210731625556946
@@ -22418,13 +22418,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.06787109375,
"learning_rate": 0.0009194897433316127,
- "loss": 0.0122,
+ "loss": 0.0125,
"macro_f1": 0.6666666865348816,
"num_tokens": 3805866.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034913592971861362,
+ "routers_loss": 0.0042560105212032795,
"skip_count": 2.0,
"step": 2360,
"text_loss": 0.6472984552383423
@@ -22437,13 +22437,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08056640625,
+ "grad_norm": 0.07568359375,
"learning_rate": 0.0009193212355166446,
- "loss": 0.0112,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3808952.0,
"repeat_count": 0.0,
- "routers_loss": 0.002706601284444332,
+ "routers_loss": 0.0026232977397739887,
"skip_count": 0.0,
"step": 2362,
"text_loss": 0.450063556432724
@@ -22456,13 +22456,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009191525670169881,
- "loss": 0.0108,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3812080.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032696903217583895,
+ "routers_loss": 0.0034355956595391035,
"skip_count": 0.0,
"step": 2364,
"text_loss": 0.49727216362953186
@@ -22475,13 +22475,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.000918983737897277,
- "loss": 0.0115,
+ "loss": 0.0112,
"macro_f1": 0.6666666865348816,
"num_tokens": 3815282.0,
"repeat_count": 0.0,
- "routers_loss": 0.006245410069823265,
+ "routers_loss": 0.0055653867311775684,
"skip_count": 1.0,
"step": 2366,
"text_loss": 0.6336377859115601
@@ -22496,11 +22496,11 @@
"f1_skip": 1.0,
"grad_norm": 0.033447265625,
"learning_rate": 0.0009188147482222071,
- "loss": 0.0079,
+ "loss": 0.008,
"macro_f1": 1.0,
"num_tokens": 3818106.0,
"repeat_count": 2.0,
- "routers_loss": 0.011230813339352608,
+ "routers_loss": 0.011016021482646465,
"skip_count": 2.0,
"step": 2368,
"text_loss": 0.22513329982757568
@@ -22515,11 +22515,11 @@
"f1_skip": 0.0,
"grad_norm": 0.04296875,
"learning_rate": 0.0009186455980565358,
- "loss": 0.0109,
+ "loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 3821228.0,
"repeat_count": 1.0,
- "routers_loss": 0.014897257089614868,
+ "routers_loss": 0.014039464294910431,
"skip_count": 0.0,
"step": 2370,
"text_loss": 0.21331638097763062
@@ -22532,13 +22532,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009184762874650816,
- "loss": 0.0131,
+ "loss": 0.0128,
"macro_f1": 0.3333333432674408,
"num_tokens": 3825048.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015503648901358247,
+ "routers_loss": 0.001088051125407219,
"skip_count": 0.0,
"step": 2372,
"text_loss": 0.6031543612480164
@@ -22551,13 +22551,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009183068165127245,
- "loss": 0.0127,
+ "loss": 0.013,
"macro_f1": 0.6666666865348816,
"num_tokens": 3828781.0,
"repeat_count": 0.0,
- "routers_loss": 0.00723480898886919,
+ "routers_loss": 0.006263940595090389,
"skip_count": 1.0,
"step": 2374,
"text_loss": 0.6249601244926453
@@ -22570,13 +22570,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.076171875,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0009181371852644062,
- "loss": 0.0139,
+ "loss": 0.0133,
"macro_f1": 0.6666666865348816,
"num_tokens": 3832507.0,
"repeat_count": 1.0,
- "routers_loss": 0.002053398173302412,
+ "routers_loss": 0.001987969037145376,
"skip_count": 0.0,
"step": 2376,
"text_loss": 0.37972065806388855
@@ -22589,32 +22589,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06689453125,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009179673937851299,
"loss": 0.0158,
"macro_f1": 0.6666666865348816,
"num_tokens": 3835644.0,
"repeat_count": 0.0,
- "routers_loss": 0.007927518337965012,
+ "routers_loss": 0.007635094691067934,
"skip_count": 1.0,
"step": 2378,
"text_loss": 0.46319663524627686
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 11.173759906075727,
"f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.06298828125,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009177974421399598,
- "loss": 0.0144,
- "macro_f1": 0.5555555820465088,
+ "loss": 0.0137,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3838700.0,
"repeat_count": 0.0,
- "routers_loss": 0.01924682781100273,
+ "routers_loss": 0.01617279462516308,
"skip_count": 2.0,
"step": 2380,
"text_loss": 0.32141056656837463
@@ -22627,13 +22627,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.046875,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0009176273303940217,
- "loss": 0.0106,
+ "loss": 0.011,
"macro_f1": 0.6666666865348816,
"num_tokens": 3841953.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021689811255782843,
+ "routers_loss": 0.0022273799404501915,
"skip_count": 2.0,
"step": 2382,
"text_loss": 0.5908139944076538
@@ -22646,13 +22646,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.062255859375,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0009174570586125026,
- "loss": 0.0119,
+ "loss": 0.0122,
"macro_f1": 0.32098767161369324,
"num_tokens": 3845763.0,
"repeat_count": 1.0,
- "routers_loss": 0.03431013971567154,
+ "routers_loss": 0.030915161594748497,
"skip_count": 0.0,
"step": 2384,
"text_loss": 0.41400137543678284
@@ -22665,13 +22665,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.0009172866268606513,
- "loss": 0.0123,
+ "loss": 0.0122,
"macro_f1": 0.6666666865348816,
"num_tokens": 3848984.0,
"repeat_count": 0.0,
- "routers_loss": 0.008275258354842663,
+ "routers_loss": 0.010480951517820358,
"skip_count": 2.0,
"step": 2386,
"text_loss": 0.2560874819755554
@@ -22684,13 +22684,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04736328125,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0009171160352037775,
- "loss": 0.0121,
+ "loss": 0.0124,
"macro_f1": 0.6666666865348816,
"num_tokens": 3852118.0,
"repeat_count": 0.0,
- "routers_loss": 0.007780806161463261,
+ "routers_loss": 0.00809961836785078,
"skip_count": 1.0,
"step": 2388,
"text_loss": 0.28236693143844604
@@ -22709,7 +22709,7 @@
"macro_f1": 1.0,
"num_tokens": 3855314.0,
"repeat_count": 1.0,
- "routers_loss": 0.00553786288946867,
+ "routers_loss": 0.005569872446358204,
"skip_count": 1.0,
"step": 2390,
"text_loss": 0.4578137695789337
@@ -22722,13 +22722,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08447265625,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009167743724365073,
- "loss": 0.01,
+ "loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 3858301.0,
"repeat_count": 0.0,
- "routers_loss": 0.004066115710884333,
+ "routers_loss": 0.0038610948249697685,
"skip_count": 1.0,
"step": 2392,
"text_loss": 0.14082716405391693
@@ -22741,13 +22741,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0810546875,
+ "grad_norm": 0.1376953125,
"learning_rate": 0.0009166033014570368,
- "loss": 0.0104,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3861296.0,
"repeat_count": 0.0,
- "routers_loss": 0.002403446938842535,
+ "routers_loss": 0.0017607157351449132,
"skip_count": 0.0,
"step": 2394,
"text_loss": 0.384442001581192
@@ -22760,13 +22760,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.054443359375,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009164320708343954,
- "loss": 0.0137,
+ "loss": 0.0131,
"macro_f1": 0.6666666865348816,
"num_tokens": 3863985.0,
"repeat_count": 2.0,
- "routers_loss": 0.010212135501205921,
+ "routers_loss": 0.009627950377762318,
"skip_count": 0.0,
"step": 2396,
"text_loss": 0.6969521045684814
@@ -22779,13 +22779,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07275390625,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009162606806341989,
"loss": 0.0107,
"macro_f1": 0.3333333432674408,
"num_tokens": 3866636.0,
"repeat_count": 0.0,
- "routers_loss": 0.007781816180795431,
+ "routers_loss": 0.006915586534887552,
"skip_count": 0.0,
"step": 2398,
"text_loss": 0.48069697618484497
@@ -22798,32 +22798,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.0009160891309221242,
- "loss": 0.0151,
+ "loss": 0.0149,
"macro_f1": 0.6666666865348816,
"num_tokens": 3870867.0,
"repeat_count": 1.0,
- "routers_loss": 0.0016227158484980464,
+ "routers_loss": 0.0013031222624704242,
"skip_count": 0.0,
"step": 2400,
"text_loss": 0.3882075846195221
},
{
"acc_repeat": 0.5,
- "acc_skip": 1.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
"epoch": 11.277076606985618,
- "f1_execute": 0.9803921580314636,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.6666666865348816,
- "f1_skip": 1.0,
- "grad_norm": 0.06298828125,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009159174217639096,
- "loss": 0.0114,
- "macro_f1": 0.8823530077934265,
+ "loss": 0.0112,
+ "macro_f1": 0.5427350401878357,
"num_tokens": 3873663.0,
"repeat_count": 2.0,
- "routers_loss": 0.06490851938724518,
+ "routers_loss": 0.06621067970991135,
"skip_count": 1.0,
"step": 2402,
"text_loss": 0.5740041136741638
@@ -22836,13 +22836,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0009157455532253547,
- "loss": 0.0075,
+ "loss": 0.0071,
"macro_f1": 0.6666666865348816,
"num_tokens": 3876788.0,
"repeat_count": 1.0,
- "routers_loss": 0.007105287164449692,
+ "routers_loss": 0.005957918707281351,
"skip_count": 0.0,
"step": 2404,
"text_loss": 0.26025933027267456
@@ -22855,13 +22855,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.06787109375,
+ "grad_norm": 0.08642578125,
"learning_rate": 0.0009155735253723191,
- "loss": 0.0125,
+ "loss": 0.0126,
"macro_f1": 0.9452888369560242,
"num_tokens": 3879942.0,
"repeat_count": 1.0,
- "routers_loss": 0.03736003860831261,
+ "routers_loss": 0.039429809898138046,
"skip_count": 4.0,
"step": 2406,
"text_loss": 1.1349908113479614
@@ -22874,13 +22874,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.047607421875,
"learning_rate": 0.0009154013382707251,
- "loss": 0.011,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 3882682.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012925176415592432,
+ "routers_loss": 0.0012570557883009315,
"skip_count": 0.0,
"step": 2408,
"text_loss": 0.5611135363578796
@@ -22895,11 +22895,11 @@
"f1_skip": 0.0,
"grad_norm": 0.034423828125,
"learning_rate": 0.0009152289919865543,
- "loss": 0.0124,
+ "loss": 0.0123,
"macro_f1": 0.3333333432674408,
"num_tokens": 3886425.0,
"repeat_count": 0.0,
- "routers_loss": 0.001746711554005742,
+ "routers_loss": 0.0017455556662753224,
"skip_count": 0.0,
"step": 2410,
"text_loss": 0.7523751854896545
@@ -22912,13 +22912,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0009150564865858506,
- "loss": 0.0112,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 3889273.0,
"repeat_count": 0.0,
- "routers_loss": 0.011005193926393986,
+ "routers_loss": 0.011178011074662209,
"skip_count": 1.0,
"step": 2412,
"text_loss": 0.26942551136016846
@@ -22931,13 +22931,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.800000011920929,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009148838221347182,
- "loss": 0.0102,
+ "loss": 0.0107,
"macro_f1": 0.5934640765190125,
"num_tokens": 3892199.0,
"repeat_count": 3.0,
- "routers_loss": 0.017795369029045105,
+ "routers_loss": 0.019628092646598816,
"skip_count": 0.0,
"step": 2414,
"text_loss": 0.5492315888404846
@@ -22950,13 +22950,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.0009147109986993225,
"loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 3895362.0,
"repeat_count": 1.0,
- "routers_loss": 0.011693861335515976,
+ "routers_loss": 0.012255983427166939,
"skip_count": 0.0,
"step": 2416,
"text_loss": 0.23798216879367828
@@ -22969,13 +22969,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009145380163458899,
- "loss": 0.0177,
+ "loss": 0.0178,
"macro_f1": 0.3333333432674408,
"num_tokens": 3898476.0,
"repeat_count": 0.0,
- "routers_loss": 0.007135285064578056,
+ "routers_loss": 0.007018954027444124,
"skip_count": 0.0,
"step": 2418,
"text_loss": 0.1923145055770874
@@ -22988,13 +22988,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03515625,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0009143648751407074,
- "loss": 0.0082,
+ "loss": 0.0081,
"macro_f1": 0.3333333432674408,
"num_tokens": 3901817.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008607010240666568,
+ "routers_loss": 0.0008574824314564466,
"skip_count": 0.0,
"step": 2420,
"text_loss": 0.4001806974411011
@@ -23007,13 +23007,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.11328125,
"learning_rate": 0.0009141915751501231,
- "loss": 0.0101,
+ "loss": 0.0102,
"macro_f1": 0.5492662787437439,
"num_tokens": 3905461.0,
"repeat_count": 0.0,
- "routers_loss": 0.015359465964138508,
+ "routers_loss": 0.01572350226342678,
"skip_count": 2.0,
"step": 2422,
"text_loss": 0.19519129395484924
@@ -23026,13 +23026,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0380859375,
+ "grad_norm": 0.037353515625,
"learning_rate": 0.0009140181164405458,
- "loss": 0.011,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3908878.0,
"repeat_count": 0.0,
- "routers_loss": 0.00047823251225054264,
+ "routers_loss": 0.0005503420252352953,
"skip_count": 0.0,
"step": 2424,
"text_loss": 0.6937088370323181
@@ -23047,11 +23047,11 @@
"f1_skip": 0.0,
"grad_norm": 0.068359375,
"learning_rate": 0.0009138444990784454,
- "loss": 0.0129,
+ "loss": 0.013,
"macro_f1": 0.3333333432674408,
"num_tokens": 3912053.0,
"repeat_count": 0.0,
- "routers_loss": 0.0070601715706288815,
+ "routers_loss": 0.007556677330285311,
"skip_count": 0.0,
"step": 2426,
"text_loss": 0.35431069135665894
@@ -23064,13 +23064,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.06201171875,
"learning_rate": 0.000913670723130352,
- "loss": 0.0123,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 3915192.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010537977796047926,
+ "routers_loss": 0.0013609991874545813,
"skip_count": 0.0,
"step": 2428,
"text_loss": 0.5171207189559937
@@ -23083,13 +23083,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0517578125,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0009134967886628573,
- "loss": 0.0117,
+ "loss": 0.0115,
"macro_f1": 1.0,
"num_tokens": 3917927.0,
"repeat_count": 2.0,
- "routers_loss": 0.012852456420660019,
+ "routers_loss": 0.010895746760070324,
"skip_count": 2.0,
"step": 2430,
"text_loss": 0.2852934002876282
@@ -23102,13 +23102,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009133226957426133,
- "loss": 0.0134,
+ "loss": 0.0132,
"macro_f1": 0.5492662787437439,
"num_tokens": 3921460.0,
"repeat_count": 2.0,
- "routers_loss": 0.05307198315858841,
+ "routers_loss": 0.04196908697485924,
"skip_count": 0.0,
"step": 2432,
"text_loss": 0.4864770770072937
@@ -23121,13 +23121,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009131484444363324,
- "loss": 0.0154,
+ "loss": 0.0155,
"macro_f1": 0.3333333432674408,
"num_tokens": 3924662.0,
"repeat_count": 0.0,
- "routers_loss": 0.004656757228076458,
+ "routers_loss": 0.004484197124838829,
"skip_count": 0.0,
"step": 2434,
"text_loss": 0.7568684220314026
@@ -23140,13 +23140,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.05078125,
"learning_rate": 0.0009129740348107882,
- "loss": 0.0113,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 3927337.0,
"repeat_count": 0.0,
- "routers_loss": 0.0042406003922224045,
+ "routers_loss": 0.004351360257714987,
"skip_count": 2.0,
"step": 2436,
"text_loss": 0.5953161716461182
@@ -23159,13 +23159,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.0517578125,
+ "grad_norm": 0.04736328125,
"learning_rate": 0.0009127994669328151,
- "loss": 0.0089,
+ "loss": 0.0085,
"macro_f1": 0.6122449040412903,
"num_tokens": 3930407.0,
"repeat_count": 0.0,
- "routers_loss": 0.018079286441206932,
+ "routers_loss": 0.01664198748767376,
"skip_count": 4.0,
"step": 2438,
"text_loss": 0.5320524573326111
@@ -23178,13 +23178,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.0009126247408693071,
- "loss": 0.0072,
+ "loss": 0.0071,
"macro_f1": 0.6666666865348816,
"num_tokens": 3933184.0,
"repeat_count": 0.0,
- "routers_loss": 0.002266801195219159,
+ "routers_loss": 0.0017819046042859554,
"skip_count": 1.0,
"step": 2440,
"text_loss": 0.6051273345947266
@@ -23197,13 +23197,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009124498566872204,
- "loss": 0.01,
+ "loss": 0.0105,
"macro_f1": 0.3333333432674408,
"num_tokens": 3936620.0,
"repeat_count": 0.0,
- "routers_loss": 0.005790423136204481,
+ "routers_loss": 0.005519696045666933,
"skip_count": 0.0,
"step": 2442,
"text_loss": 0.12987950444221497
@@ -23216,13 +23216,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052734375,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0009122748144535704,
- "loss": 0.011,
+ "loss": 0.0111,
"macro_f1": 0.32098764181137085,
"num_tokens": 3940010.0,
"repeat_count": 0.0,
- "routers_loss": 0.04591076448559761,
+ "routers_loss": 0.04543351009488106,
"skip_count": 2.0,
"step": 2444,
"text_loss": 0.4642033576965332
@@ -23235,13 +23235,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.04296875,
"learning_rate": 0.0009120996142354338,
- "loss": 0.0122,
+ "loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 3943135.0,
"repeat_count": 0.0,
- "routers_loss": 0.004969341680407524,
+ "routers_loss": 0.00550565542653203,
"skip_count": 0.0,
"step": 2446,
"text_loss": 0.5697627067565918
@@ -23254,13 +23254,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05615234375,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.0009119242560999477,
"loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 3946650.0,
"repeat_count": 0.0,
- "routers_loss": 0.00830315612256527,
+ "routers_loss": 0.008842485956847668,
"skip_count": 0.0,
"step": 2448,
"text_loss": 0.17046524584293365
@@ -23273,13 +23273,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.08154296875,
"learning_rate": 0.0009117487401143095,
"loss": 0.0154,
"macro_f1": 0.6666666865348816,
"num_tokens": 3949470.0,
"repeat_count": 1.0,
- "routers_loss": 0.0059144929982721806,
+ "routers_loss": 0.005900127813220024,
"skip_count": 0.0,
"step": 2450,
"text_loss": 0.37260866165161133
@@ -23292,13 +23292,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0009115730663457773,
- "loss": 0.0132,
+ "loss": 0.0137,
"macro_f1": 1.0,
"num_tokens": 3952546.0,
"repeat_count": 1.0,
- "routers_loss": 0.0029762545600533485,
+ "routers_loss": 0.003409258322790265,
"skip_count": 1.0,
"step": 2452,
"text_loss": 0.5308008193969727
@@ -23311,13 +23311,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.052001953125,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0009113972348616698,
- "loss": 0.0091,
+ "loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 3955817.0,
"repeat_count": 0.0,
- "routers_loss": 0.011962058953940868,
+ "routers_loss": 0.010098597034811974,
"skip_count": 1.0,
"step": 2454,
"text_loss": 0.39226648211479187
@@ -23330,13 +23330,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009112212457293658,
- "loss": 0.0101,
+ "loss": 0.0102,
"macro_f1": 0.3272727429866791,
"num_tokens": 3958911.0,
"repeat_count": 0.0,
- "routers_loss": 0.07289884239435196,
+ "routers_loss": 0.08184818178415298,
"skip_count": 0.0,
"step": 2456,
"text_loss": 0.45411455631256104
@@ -23349,13 +23349,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.041259765625,
"learning_rate": 0.0009110450990163047,
- "loss": 0.0124,
+ "loss": 0.0127,
"macro_f1": 0.3333333432674408,
"num_tokens": 3962584.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009638209594413638,
+ "routers_loss": 0.0009352223132736981,
"skip_count": 0.0,
"step": 2458,
"text_loss": 0.47292324900627136
@@ -23368,13 +23368,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0400390625,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0009108687947899863,
- "loss": 0.0078,
+ "loss": 0.0077,
"macro_f1": 1.0,
"num_tokens": 3965597.0,
"repeat_count": 1.0,
- "routers_loss": 0.008587516844272614,
+ "routers_loss": 0.008150188252329826,
"skip_count": 2.0,
"step": 2460,
"text_loss": 0.33208340406417847
@@ -23387,13 +23387,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0009106923331179707,
- "loss": 0.0126,
+ "loss": 0.0125,
"macro_f1": 0.5492662787437439,
"num_tokens": 3968664.0,
"repeat_count": 0.0,
- "routers_loss": 0.05080332234501839,
+ "routers_loss": 0.050999004393815994,
"skip_count": 2.0,
"step": 2462,
"text_loss": 0.2459995150566101
@@ -23406,13 +23406,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07080078125,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009105157140678782,
- "loss": 0.0124,
+ "loss": 0.0126,
"macro_f1": 0.6666666865348816,
"num_tokens": 3971772.0,
"repeat_count": 0.0,
- "routers_loss": 0.007348654326051474,
+ "routers_loss": 0.006196586415171623,
"skip_count": 1.0,
"step": 2464,
"text_loss": 0.23956991732120514
@@ -23425,13 +23425,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06787109375,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009103389377073896,
- "loss": 0.0099,
+ "loss": 0.01,
"macro_f1": 0.3333333432674408,
"num_tokens": 3976224.0,
"repeat_count": 0.0,
- "routers_loss": 0.007161752786487341,
+ "routers_loss": 0.008181816898286343,
"skip_count": 0.0,
"step": 2466,
"text_loss": 0.3235875070095062
@@ -23444,13 +23444,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.057373046875,
"learning_rate": 0.0009101620041042462,
- "loss": 0.0119,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 3978876.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015090530505403876,
+ "routers_loss": 0.0015451472718268633,
"skip_count": 0.0,
"step": 2468,
"text_loss": 0.4038759469985962
@@ -23463,13 +23463,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07275390625,
+ "grad_norm": 0.09130859375,
"learning_rate": 0.000909984913326249,
- "loss": 0.0129,
+ "loss": 0.0131,
"macro_f1": 0.3272727429866791,
"num_tokens": 3981992.0,
"repeat_count": 0.0,
- "routers_loss": 0.021420184522867203,
+ "routers_loss": 0.021785033866763115,
"skip_count": 1.0,
"step": 2470,
"text_loss": 0.6346460580825806
@@ -23482,13 +23482,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0009098076654412595,
- "loss": 0.0092,
+ "loss": 0.0094,
"macro_f1": 0.3333333432674408,
"num_tokens": 3984560.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010742908343672752,
+ "routers_loss": 0.0011462471447885036,
"skip_count": 0.0,
"step": 2472,
"text_loss": 0.3449646532535553
@@ -23501,13 +23501,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05078125,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.0009096302605171996,
- "loss": 0.011,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 3987548.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015209210105240345,
+ "routers_loss": 0.0014367027906700969,
"skip_count": 0.0,
"step": 2474,
"text_loss": 0.5918350219726562
@@ -23520,13 +23520,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.0478515625,
"learning_rate": 0.0009094526986220513,
"loss": 0.0124,
"macro_f1": 0.3333333432674408,
"num_tokens": 3990727.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008761848439462483,
+ "routers_loss": 0.0008977655088528991,
"skip_count": 0.0,
"step": 2476,
"text_loss": 0.463350385427475
@@ -23539,13 +23539,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0009092749798238563,
- "loss": 0.0146,
+ "loss": 0.015,
"macro_f1": 0.3272727429866791,
"num_tokens": 3993757.0,
"repeat_count": 1.0,
- "routers_loss": 0.01623794063925743,
+ "routers_loss": 0.016712551936507225,
"skip_count": 0.0,
"step": 2478,
"text_loss": 0.5621229410171509
@@ -23558,13 +23558,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07080078125,
+ "grad_norm": 0.06640625,
"learning_rate": 0.000909097104190717,
- "loss": 0.0174,
+ "loss": 0.0172,
"macro_f1": 0.32098764181137085,
"num_tokens": 3997259.0,
"repeat_count": 0.0,
- "routers_loss": 0.04170118644833565,
+ "routers_loss": 0.04134179651737213,
"skip_count": 2.0,
"step": 2480,
"text_loss": 0.375476598739624
@@ -23577,32 +23577,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.046875,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0009089190717907956,
- "loss": 0.0116,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 4000563.0,
"repeat_count": 0.0,
- "routers_loss": 0.003591755870729685,
+ "routers_loss": 0.003462378401309252,
"skip_count": 0.0,
"step": 2482,
"text_loss": 0.5553798675537109
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 11.66216612855885,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.0693359375,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0009087408826923146,
- "loss": 0.0185,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0182,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 4004065.0,
"repeat_count": 0.0,
- "routers_loss": 0.009214848279953003,
+ "routers_loss": 0.008057428523898125,
"skip_count": 2.0,
"step": 2484,
"text_loss": 0.4329465329647064
@@ -23615,13 +23615,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0009085625369635564,
- "loss": 0.0111,
+ "loss": 0.0114,
"macro_f1": 0.3333333432674408,
"num_tokens": 4007119.0,
"repeat_count": 0.0,
- "routers_loss": 0.0059350160881876945,
+ "routers_loss": 0.005759050603955984,
"skip_count": 0.0,
"step": 2486,
"text_loss": 0.501268744468689
@@ -23634,13 +23634,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.0009083840346728631,
- "loss": 0.0118,
+ "loss": 0.0122,
"macro_f1": 0.3272727429866791,
"num_tokens": 4010547.0,
"repeat_count": 1.0,
- "routers_loss": 0.019803427159786224,
+ "routers_loss": 0.020763102918863297,
"skip_count": 0.0,
"step": 2488,
"text_loss": 0.480196475982666
@@ -23653,13 +23653,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.058349609375,
+ "grad_norm": 0.05078125,
"learning_rate": 0.0009082053758886374,
- "loss": 0.0118,
+ "loss": 0.0117,
"macro_f1": 0.6666666865348816,
"num_tokens": 4014600.0,
"repeat_count": 0.0,
- "routers_loss": 0.006243673153221607,
+ "routers_loss": 0.005801836494356394,
"skip_count": 1.0,
"step": 2490,
"text_loss": 0.18249782919883728
@@ -23672,13 +23672,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009080265606793416,
- "loss": 0.0132,
+ "loss": 0.0128,
"macro_f1": 1.0,
"num_tokens": 4017964.0,
"repeat_count": 1.0,
- "routers_loss": 0.003960726782679558,
+ "routers_loss": 0.004226063843816519,
"skip_count": 1.0,
"step": 2492,
"text_loss": 0.6573076248168945
@@ -23691,13 +23691,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0537109375,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.000907847589113498,
- "loss": 0.0127,
+ "loss": 0.0125,
"macro_f1": 0.6666666865348816,
"num_tokens": 4020694.0,
"repeat_count": 0.0,
- "routers_loss": 0.004959117621183395,
+ "routers_loss": 0.004281101748347282,
"skip_count": 2.0,
"step": 2494,
"text_loss": 0.3944586217403412
@@ -23710,13 +23710,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.000907668461259689,
- "loss": 0.0157,
+ "loss": 0.0152,
"macro_f1": 0.6666666865348816,
"num_tokens": 4023757.0,
"repeat_count": 0.0,
- "routers_loss": 0.009721433743834496,
+ "routers_loss": 0.008786370046436787,
"skip_count": 1.0,
"step": 2496,
"text_loss": 0.6452898979187012
@@ -23729,13 +23729,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009074891771865566,
- "loss": 0.0124,
+ "loss": 0.0125,
"macro_f1": 0.3333333432674408,
"num_tokens": 4026601.0,
"repeat_count": 0.0,
- "routers_loss": 0.00491701066493988,
+ "routers_loss": 0.005209595896303654,
"skip_count": 0.0,
"step": 2498,
"text_loss": 0.9633619785308838
@@ -23748,13 +23748,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.03759765625,
"learning_rate": 0.0009073097369628028,
- "loss": 0.0131,
+ "loss": 0.013,
"macro_f1": 1.0,
"num_tokens": 4030321.0,
"repeat_count": 3.0,
- "routers_loss": 0.009832080453634262,
+ "routers_loss": 0.00860709697008133,
"skip_count": 1.0,
"step": 2500,
"text_loss": 0.48566827178001404
@@ -23767,13 +23767,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047607421875,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0009071301406571893,
- "loss": 0.0137,
+ "loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 4033234.0,
"repeat_count": 0.0,
- "routers_loss": 0.003301833290606737,
+ "routers_loss": 0.0035277456045150757,
"skip_count": 0.0,
"step": 2502,
"text_loss": 0.3771554231643677
@@ -23786,13 +23786,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.000906950388338538,
- "loss": 0.0134,
+ "loss": 0.0136,
"macro_f1": 0.3333333432674408,
"num_tokens": 4036417.0,
"repeat_count": 0.0,
- "routers_loss": 0.001580960932187736,
+ "routers_loss": 0.0013424850767478347,
"skip_count": 0.0,
"step": 2504,
"text_loss": 0.8962806463241577
@@ -23805,13 +23805,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009067704800757301,
- "loss": 0.0091,
+ "loss": 0.0095,
"macro_f1": 0.3333333432674408,
"num_tokens": 4039564.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011505817528814077,
+ "routers_loss": 0.0010423909407109022,
"skip_count": 0.0,
"step": 2506,
"text_loss": 0.43170279264450073
@@ -23824,13 +23824,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.000906590415937707,
- "loss": 0.0095,
+ "loss": 0.0094,
"macro_f1": 0.3272727429866791,
"num_tokens": 4043212.0,
"repeat_count": 0.0,
- "routers_loss": 0.023224346339702606,
+ "routers_loss": 0.021780289709568024,
"skip_count": 1.0,
"step": 2508,
"text_loss": 0.41495826840400696
@@ -23843,13 +23843,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0009064101959934696,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 4046687.0,
"repeat_count": 0.0,
- "routers_loss": 0.007955167442560196,
+ "routers_loss": 0.007261929102241993,
"skip_count": 1.0,
"step": 2510,
"text_loss": 0.21821187436580658
@@ -23862,13 +23862,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.057861328125,
"learning_rate": 0.0009062298203120783,
- "loss": 0.0101,
+ "loss": 0.0102,
"macro_f1": 0.6666666865348816,
"num_tokens": 4050735.0,
"repeat_count": 0.0,
- "routers_loss": 0.006164440419524908,
+ "routers_loss": 0.007447180338203907,
"skip_count": 2.0,
"step": 2512,
"text_loss": 0.1818767935037613
@@ -23881,13 +23881,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.058837890625,
+ "grad_norm": 0.06494140625,
"learning_rate": 0.0009060492889626535,
- "loss": 0.014,
+ "loss": 0.0142,
"macro_f1": 0.3272727429866791,
"num_tokens": 4054426.0,
"repeat_count": 1.0,
- "routers_loss": 0.0713663101196289,
+ "routers_loss": 0.0718490406870842,
"skip_count": 0.0,
"step": 2514,
"text_loss": 0.22798970341682434
@@ -23900,13 +23900,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0009058686020143753,
- "loss": 0.0182,
+ "loss": 0.0183,
"macro_f1": 0.3333333432674408,
"num_tokens": 4057615.0,
"repeat_count": 0.0,
- "routers_loss": 0.0052308146841824055,
+ "routers_loss": 0.0052676633931696415,
"skip_count": 0.0,
"step": 2516,
"text_loss": 0.1712338626384735
@@ -23919,13 +23919,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04052734375,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0009056877595364832,
- "loss": 0.0143,
+ "loss": 0.0137,
"macro_f1": 0.3333333432674408,
"num_tokens": 4060338.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020465939305722713,
+ "routers_loss": 0.0018052728846669197,
"skip_count": 0.0,
"step": 2518,
"text_loss": 0.6811438798904419
@@ -23938,13 +23938,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.083984375,
"learning_rate": 0.0009055067615982761,
- "loss": 0.0114,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 4062887.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008663221378810704,
+ "routers_loss": 0.0009029926732182503,
"skip_count": 0.0,
"step": 2520,
"text_loss": 0.5480356812477112
@@ -23957,13 +23957,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009053256082691133,
- "loss": 0.0104,
+ "loss": 0.0106,
"macro_f1": 0.3333333432674408,
"num_tokens": 4065357.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026889131404459476,
+ "routers_loss": 0.0027515271212905645,
"skip_count": 0.0,
"step": 2522,
"text_loss": 0.5234101414680481
@@ -23978,11 +23978,11 @@
"f1_skip": 0.0,
"grad_norm": 0.08203125,
"learning_rate": 0.0009051442996184127,
- "loss": 0.0181,
+ "loss": 0.0174,
"macro_f1": 0.3333333432674408,
"num_tokens": 4068111.0,
"repeat_count": 0.0,
- "routers_loss": 0.002255887258797884,
+ "routers_loss": 0.002199822571128607,
"skip_count": 0.0,
"step": 2524,
"text_loss": 0.2418575882911682
@@ -23995,13 +23995,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.060546875,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009049628357156521,
- "loss": 0.0144,
+ "loss": 0.0143,
"macro_f1": 0.6666666865348816,
"num_tokens": 4071284.0,
"repeat_count": 0.0,
- "routers_loss": 0.005672316066920757,
+ "routers_loss": 0.006303096655756235,
"skip_count": 2.0,
"step": 2526,
"text_loss": 0.7948065996170044
@@ -24014,13 +24014,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0380859375,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.000904781216630369,
- "loss": 0.007,
+ "loss": 0.0068,
"macro_f1": 0.6601307392120361,
"num_tokens": 4074750.0,
"repeat_count": 1.0,
- "routers_loss": 0.017167411744594574,
+ "routers_loss": 0.01791904680430889,
"skip_count": 2.0,
"step": 2528,
"text_loss": 0.809726357460022
@@ -24033,13 +24033,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.053955078125,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0009045994424321602,
- "loss": 0.0101,
+ "loss": 0.0102,
"macro_f1": 1.0,
"num_tokens": 4078617.0,
"repeat_count": 2.0,
- "routers_loss": 0.019105618819594383,
+ "routers_loss": 0.016553178429603577,
"skip_count": 2.0,
"step": 2530,
"text_loss": 0.8755000829696655
@@ -24052,13 +24052,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.060791015625,
+ "grad_norm": 0.061767578125,
"learning_rate": 0.0009044175131906817,
"loss": 0.0145,
"macro_f1": 0.3333333432674408,
"num_tokens": 4080936.0,
"repeat_count": 0.0,
- "routers_loss": 0.007993129082024097,
+ "routers_loss": 0.00884837657213211,
"skip_count": 0.0,
"step": 2532,
"text_loss": 0.795871913433075
@@ -24071,13 +24071,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.0009042354289756491,
- "loss": 0.0124,
+ "loss": 0.0122,
"macro_f1": 0.3333333432674408,
"num_tokens": 4084459.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024954001419246197,
+ "routers_loss": 0.0024387789890170097,
"skip_count": 0.0,
"step": 2534,
"text_loss": 0.18875400722026825
@@ -24090,13 +24090,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009040531898568379,
- "loss": 0.0169,
+ "loss": 0.0171,
"macro_f1": 0.3333333432674408,
"num_tokens": 4088464.0,
"repeat_count": 0.0,
- "routers_loss": 0.004360117018222809,
+ "routers_loss": 0.00491489190608263,
"skip_count": 0.0,
"step": 2536,
"text_loss": 0.334369033575058
@@ -24109,13 +24109,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0927734375,
+ "grad_norm": 0.091796875,
"learning_rate": 0.000903870795904082,
- "loss": 0.0142,
+ "loss": 0.0145,
"macro_f1": 0.6666666865348816,
"num_tokens": 4091659.0,
"repeat_count": 0.0,
- "routers_loss": 0.00429064966738224,
+ "routers_loss": 0.004592662677168846,
"skip_count": 2.0,
"step": 2538,
"text_loss": 0.21298295259475708
@@ -24130,11 +24130,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.0458984375,
"learning_rate": 0.000903688247187275,
- "loss": 0.0136,
+ "loss": 0.0137,
"macro_f1": 0.5492662787437439,
"num_tokens": 4095496.0,
"repeat_count": 0.0,
- "routers_loss": 0.0132954316213727,
+ "routers_loss": 0.011647242121398449,
"skip_count": 2.0,
"step": 2540,
"text_loss": 0.2985081672668457
@@ -24147,13 +24147,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0009035055437763704,
- "loss": 0.0129,
+ "loss": 0.0124,
"macro_f1": 0.3333333432674408,
"num_tokens": 4098663.0,
"repeat_count": 0.0,
- "routers_loss": 0.002104961546137929,
+ "routers_loss": 0.0021238960325717926,
"skip_count": 0.0,
"step": 2542,
"text_loss": 0.35359489917755127
@@ -24166,13 +24166,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.060791015625,
+ "grad_norm": 0.05859375,
"learning_rate": 0.0009033226857413803,
- "loss": 0.0167,
+ "loss": 0.0163,
"macro_f1": 0.6666666865348816,
"num_tokens": 4101588.0,
"repeat_count": 1.0,
- "routers_loss": 0.002973714144900441,
+ "routers_loss": 0.0024701557122170925,
"skip_count": 0.0,
"step": 2544,
"text_loss": 1.1577601432800293
@@ -24185,13 +24185,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.080078125,
"learning_rate": 0.000903139673152376,
- "loss": 0.0119,
+ "loss": 0.012,
"macro_f1": 0.3333333432674408,
"num_tokens": 4104643.0,
"repeat_count": 0.0,
- "routers_loss": 0.002359170001000166,
+ "routers_loss": 0.002499542199075222,
"skip_count": 0.0,
"step": 2546,
"text_loss": 1.0173401832580566
@@ -24204,13 +24204,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0615234375,
+ "grad_norm": 0.059814453125,
"learning_rate": 0.0009029565060794885,
- "loss": 0.0168,
+ "loss": 0.0165,
"macro_f1": 0.3333333432674408,
"num_tokens": 4109247.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033595687709748745,
+ "routers_loss": 0.0034200598020106554,
"skip_count": 0.0,
"step": 2548,
"text_loss": 0.5690504312515259
@@ -24223,13 +24223,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.07421875,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009027731845929079,
"loss": 0.0155,
"macro_f1": 0.8823530077934265,
"num_tokens": 4112597.0,
"repeat_count": 1.0,
- "routers_loss": 0.015323673374950886,
+ "routers_loss": 0.015981333330273628,
"skip_count": 1.0,
"step": 2550,
"text_loss": 0.294549822807312
@@ -24242,13 +24242,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.06103515625,
"learning_rate": 0.0009025897087628829,
- "loss": 0.0063,
+ "loss": 0.0064,
"macro_f1": 0.5492662787437439,
"num_tokens": 4115844.0,
"repeat_count": 0.0,
- "routers_loss": 0.02122018299996853,
+ "routers_loss": 0.02606951631605625,
"skip_count": 2.0,
"step": 2552,
"text_loss": 0.22692419588565826
@@ -24261,13 +24261,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07763671875,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009024060786597222,
"loss": 0.0202,
"macro_f1": 0.3333333432674408,
"num_tokens": 4118634.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010765352053567767,
+ "routers_loss": 0.001026194542646408,
"skip_count": 0.0,
"step": 2554,
"text_loss": 0.6807059645652771
@@ -24280,13 +24280,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.000902222294353793,
- "loss": 0.0128,
+ "loss": 0.0124,
"macro_f1": 0.3333333432674408,
"num_tokens": 4122024.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017301233019679785,
+ "routers_loss": 0.001974924933165312,
"skip_count": 0.0,
"step": 2556,
"text_loss": 0.7373668551445007
@@ -24299,13 +24299,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.055908203125,
+ "grad_norm": 0.04833984375,
"learning_rate": 0.0009020383559155219,
- "loss": 0.0056,
+ "loss": 0.0054,
"macro_f1": 1.0,
"num_tokens": 4124803.0,
"repeat_count": 1.0,
- "routers_loss": 0.004307204391807318,
+ "routers_loss": 0.004662613850086927,
"skip_count": 2.0,
"step": 2558,
"text_loss": 0.21808166801929474
@@ -24318,13 +24318,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.029541015625,
+ "grad_norm": 0.0263671875,
"learning_rate": 0.0009018542634153943,
- "loss": 0.0064,
+ "loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 4127680.0,
"repeat_count": 0.0,
- "routers_loss": 0.0073805381543934345,
+ "routers_loss": 0.006881687790155411,
"skip_count": 0.0,
"step": 2560,
"text_loss": 0.25192978978157043
@@ -24339,11 +24339,11 @@
"f1_skip": 1.0,
"grad_norm": 0.049560546875,
"learning_rate": 0.0009016700169239551,
- "loss": 0.0108,
+ "loss": 0.0105,
"macro_f1": 1.0,
"num_tokens": 4130431.0,
"repeat_count": 1.0,
- "routers_loss": 0.005493874195963144,
+ "routers_loss": 0.005977808032184839,
"skip_count": 1.0,
"step": 2562,
"text_loss": 0.4700816869735718
@@ -24356,13 +24356,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0009014856165118075,
- "loss": 0.0154,
+ "loss": 0.0153,
"macro_f1": 0.6666666865348816,
"num_tokens": 4133535.0,
"repeat_count": 0.0,
- "routers_loss": 0.006889877840876579,
+ "routers_loss": 0.007005698047578335,
"skip_count": 1.0,
"step": 2564,
"text_loss": 0.6558199524879456
@@ -24375,13 +24375,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03125,
+ "grad_norm": 0.030517578125,
"learning_rate": 0.0009013010622496144,
- "loss": 0.009,
+ "loss": 0.0088,
"macro_f1": 0.3333333432674408,
"num_tokens": 4136534.0,
"repeat_count": 0.0,
- "routers_loss": 0.008495541289448738,
+ "routers_loss": 0.007262171246111393,
"skip_count": 0.0,
"step": 2566,
"text_loss": 0.2565421462059021
@@ -24394,13 +24394,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0009011163542080971,
- "loss": 0.0089,
+ "loss": 0.0088,
"macro_f1": 0.5934640765190125,
"num_tokens": 4139762.0,
"repeat_count": 0.0,
- "routers_loss": 0.05929862707853317,
+ "routers_loss": 0.05431923270225525,
"skip_count": 3.0,
"step": 2568,
"text_loss": 0.19896510243415833
@@ -24413,13 +24413,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.026611328125,
"learning_rate": 0.0009009314924580363,
- "loss": 0.0086,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 4143398.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033934004604816437,
+ "routers_loss": 0.003667369019240141,
"skip_count": 0.0,
"step": 2570,
"text_loss": 0.6581419110298157
@@ -24432,13 +24432,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.054931640625,
+ "grad_norm": 0.052978515625,
"learning_rate": 0.0009007464770702712,
"loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 4146248.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012826769379898906,
+ "routers_loss": 0.00132099783513695,
"skip_count": 0.0,
"step": 2572,
"text_loss": 0.5316711068153381
@@ -24451,13 +24451,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.0009005613081157002,
"loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 4149455.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019460092298686504,
+ "routers_loss": 0.0020061524119228125,
"skip_count": 0.0,
"step": 2574,
"text_loss": 0.5400773882865906
@@ -24470,13 +24470,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.05517578125,
"learning_rate": 0.0009003759856652802,
- "loss": 0.0112,
+ "loss": 0.0111,
"macro_f1": 0.6666666865348816,
"num_tokens": 4152774.0,
"repeat_count": 0.0,
- "routers_loss": 0.004493138287216425,
+ "routers_loss": 0.002621434163302183,
"skip_count": 1.0,
"step": 2576,
"text_loss": 0.3672606945037842
@@ -24489,13 +24489,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.055908203125,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0009001905097900273,
"loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 4155835.0,
"repeat_count": 0.0,
- "routers_loss": 0.005607665050774813,
+ "routers_loss": 0.005290219560265541,
"skip_count": 0.0,
"step": 2578,
"text_loss": 0.8159038424491882
@@ -24508,13 +24508,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.040771484375,
"learning_rate": 0.0009000048805610161,
- "loss": 0.0123,
+ "loss": 0.0119,
"macro_f1": 0.3333333432674408,
"num_tokens": 4158874.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015080278972163796,
+ "routers_loss": 0.0013576085912063718,
"skip_count": 0.0,
"step": 2580,
"text_loss": 0.5518951416015625
@@ -24529,11 +24529,11 @@
"f1_skip": 0.0,
"grad_norm": 0.138671875,
"learning_rate": 0.00089981909804938,
- "loss": 0.0142,
+ "loss": 0.0143,
"macro_f1": 0.3333333432674408,
"num_tokens": 4162076.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022276053205132484,
+ "routers_loss": 0.0021483441814780235,
"skip_count": 0.0,
"step": 2582,
"text_loss": 0.43552228808403015
@@ -24546,13 +24546,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 1.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.07421875,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0008996331623263114,
- "loss": 0.0116,
+ "loss": 0.0117,
"macro_f1": 0.7795917987823486,
"num_tokens": 4165041.0,
"repeat_count": 1.0,
- "routers_loss": 0.0499282106757164,
+ "routers_loss": 0.0544300302863121,
"skip_count": 4.0,
"step": 2584,
"text_loss": 0.24812501668930054
@@ -24565,13 +24565,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.047607421875,
"learning_rate": 0.0008994470734630611,
- "loss": 0.01,
+ "loss": 0.0101,
"macro_f1": 0.3333333432674408,
"num_tokens": 4168290.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016360745066776872,
+ "routers_loss": 0.0017150711501017213,
"skip_count": 0.0,
"step": 2586,
"text_loss": 0.6392097473144531
@@ -24584,32 +24584,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0008992608315309388,
- "loss": 0.0149,
+ "loss": 0.015,
"macro_f1": 0.6666666865348816,
"num_tokens": 4171310.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037772543728351593,
+ "routers_loss": 0.0046473173424601555,
"skip_count": 2.0,
"step": 2588,
"text_loss": 0.6534156799316406
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 12.15967126504256,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.060791015625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06591796875,
"learning_rate": 0.0008990744366013125,
- "loss": 0.0104,
- "macro_f1": 0.6538461446762085,
+ "loss": 0.0105,
+ "macro_f1": 0.3144654333591461,
"num_tokens": 4174042.0,
"repeat_count": 2.0,
- "routers_loss": 0.05992122367024422,
+ "routers_loss": 0.060913100838661194,
"skip_count": 1.0,
"step": 2590,
"text_loss": 0.5365690588951111
@@ -24622,13 +24622,13 @@
"f1_execute": 0.9583333134651184,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.055419921875,
"learning_rate": 0.0008988878887456093,
"loss": 0.0118,
"macro_f1": 0.6051587462425232,
"num_tokens": 4177666.0,
"repeat_count": 1.0,
- "routers_loss": 0.0679154023528099,
+ "routers_loss": 0.06268956512212753,
"skip_count": 4.0,
"step": 2592,
"text_loss": 0.226226806640625
@@ -24643,11 +24643,11 @@
"f1_skip": 0.0,
"grad_norm": 0.03662109375,
"learning_rate": 0.0008987011880353149,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.32098764181137085,
"num_tokens": 4180490.0,
"repeat_count": 0.0,
- "routers_loss": 0.03284052759408951,
+ "routers_loss": 0.030141465365886688,
"skip_count": 2.0,
"step": 2594,
"text_loss": 0.2581401765346527
@@ -24660,13 +24660,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.051513671875,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0008985143345419729,
- "loss": 0.0087,
+ "loss": 0.0082,
"macro_f1": 0.5492662787437439,
"num_tokens": 4183300.0,
"repeat_count": 0.0,
- "routers_loss": 0.01971421390771866,
+ "routers_loss": 0.018745863810181618,
"skip_count": 2.0,
"step": 2596,
"text_loss": 0.7778542637825012
@@ -24679,13 +24679,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0703125,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0008983273283371862,
- "loss": 0.0099,
+ "loss": 0.0096,
"macro_f1": 0.5492662787437439,
"num_tokens": 4186535.0,
"repeat_count": 0.0,
- "routers_loss": 0.028065117076039314,
+ "routers_loss": 0.026792079210281372,
"skip_count": 2.0,
"step": 2598,
"text_loss": 0.34700271487236023
@@ -24698,13 +24698,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0008981401694926159,
- "loss": 0.0077,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 4189082.0,
"repeat_count": 0.0,
- "routers_loss": 0.00166845612693578,
+ "routers_loss": 0.001914160675369203,
"skip_count": 0.0,
"step": 2600,
"text_loss": 0.6879339218139648
@@ -24717,13 +24717,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0008979528580799815,
- "loss": 0.0138,
+ "loss": 0.0136,
"macro_f1": 0.6666666865348816,
"num_tokens": 4192330.0,
"repeat_count": 0.0,
- "routers_loss": 0.007527270819991827,
+ "routers_loss": 0.007978348061442375,
"skip_count": 2.0,
"step": 2602,
"text_loss": 0.3524550497531891
@@ -24736,13 +24736,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.0008977653941710613,
- "loss": 0.0137,
+ "loss": 0.0134,
"macro_f1": 0.6666666865348816,
"num_tokens": 4196117.0,
"repeat_count": 2.0,
- "routers_loss": 0.00412185862660408,
+ "routers_loss": 0.0035376469604671,
"skip_count": 0.0,
"step": 2604,
"text_loss": 0.42356348037719727
@@ -24755,13 +24755,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06005859375,
+ "grad_norm": 0.05810546875,
"learning_rate": 0.0008975777778376916,
- "loss": 0.0157,
+ "loss": 0.0156,
"macro_f1": 0.6666666865348816,
"num_tokens": 4200423.0,
"repeat_count": 0.0,
- "routers_loss": 0.007787751499563456,
+ "routers_loss": 0.008262477815151215,
"skip_count": 1.0,
"step": 2606,
"text_loss": 0.5272893905639648
@@ -24774,13 +24774,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0008973900091517675,
"loss": 0.0114,
"macro_f1": 0.3272727429866791,
"num_tokens": 4203257.0,
"repeat_count": 0.0,
- "routers_loss": 0.024111779406666756,
+ "routers_loss": 0.022957922890782356,
"skip_count": 1.0,
"step": 2608,
"text_loss": 0.2713734805583954
@@ -24793,13 +24793,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.045166015625,
+ "grad_norm": 0.043701171875,
"learning_rate": 0.000897202088185242,
- "loss": 0.0091,
+ "loss": 0.0085,
"macro_f1": 0.6666666865348816,
"num_tokens": 4206243.0,
"repeat_count": 0.0,
- "routers_loss": 0.0057326615788042545,
+ "routers_loss": 0.006623407825827599,
"skip_count": 2.0,
"step": 2610,
"text_loss": 0.5920525789260864
@@ -24812,13 +24812,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.0517578125,
"learning_rate": 0.0008970140150101274,
- "loss": 0.0118,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 4209264.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008877563523128629,
+ "routers_loss": 0.0008602747693657875,
"skip_count": 0.0,
"step": 2612,
"text_loss": 0.33421996235847473
@@ -24831,13 +24831,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.030517578125,
"learning_rate": 0.0008968257896984932,
- "loss": 0.0067,
+ "loss": 0.0062,
"macro_f1": 0.6666666865348816,
"num_tokens": 4212058.0,
"repeat_count": 0.0,
- "routers_loss": 0.0039034869987517595,
+ "routers_loss": 0.0024653903674334288,
"skip_count": 1.0,
"step": 2614,
"text_loss": 0.37923356890678406
@@ -24850,13 +24850,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0008966374123224677,
- "loss": 0.0085,
+ "loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4214929.0,
"repeat_count": 0.0,
- "routers_loss": 0.01140254084020853,
+ "routers_loss": 0.010878405533730984,
"skip_count": 0.0,
"step": 2616,
"text_loss": 0.4350503981113434
@@ -24869,13 +24869,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03125,
+ "grad_norm": 0.0303955078125,
"learning_rate": 0.0008964488829542376,
"loss": 0.0083,
"macro_f1": 0.3272727429866791,
"num_tokens": 4219170.0,
"repeat_count": 0.0,
- "routers_loss": 0.028559349477291107,
+ "routers_loss": 0.02864212542772293,
"skip_count": 1.0,
"step": 2618,
"text_loss": 0.26250728964805603
@@ -24888,13 +24888,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.061279296875,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0008962602016660478,
- "loss": 0.0097,
+ "loss": 0.0096,
"macro_f1": 0.6666666865348816,
"num_tokens": 4222077.0,
"repeat_count": 0.0,
- "routers_loss": 0.010525460354983807,
+ "routers_loss": 0.010444172658026218,
"skip_count": 2.0,
"step": 2620,
"text_loss": 0.4718937575817108
@@ -24907,13 +24907,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048583984375,
+ "grad_norm": 0.0478515625,
"learning_rate": 0.0008960713685302011,
- "loss": 0.0104,
+ "loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 4225383.0,
"repeat_count": 0.0,
- "routers_loss": 0.005284689832478762,
+ "routers_loss": 0.006409442983567715,
"skip_count": 1.0,
"step": 2622,
"text_loss": 0.30420538783073425
@@ -24926,13 +24926,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0284423828125,
+ "grad_norm": 0.02978515625,
"learning_rate": 0.0008958823836190588,
- "loss": 0.0051,
+ "loss": 0.005,
"macro_f1": 0.3272727429866791,
"num_tokens": 4228349.0,
"repeat_count": 0.0,
- "routers_loss": 0.011040215380489826,
+ "routers_loss": 0.009996986016631126,
"skip_count": 1.0,
"step": 2624,
"text_loss": 0.5392362475395203
@@ -24945,13 +24945,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.031494140625,
"learning_rate": 0.0008956932470050404,
"loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 4232007.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014406041009351611,
+ "routers_loss": 0.0014383369125425816,
"skip_count": 0.0,
"step": 2626,
"text_loss": 0.7112401127815247
@@ -24964,13 +24964,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0008955039587606233,
- "loss": 0.0111,
+ "loss": 0.0109,
"macro_f1": 0.6666666865348816,
"num_tokens": 4235122.0,
"repeat_count": 0.0,
- "routers_loss": 0.007106760982424021,
+ "routers_loss": 0.00781513936817646,
"skip_count": 3.0,
"step": 2628,
"text_loss": 0.17802883684635162
@@ -24983,13 +24983,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0400390625,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0008953145189583429,
- "loss": 0.0125,
+ "loss": 0.0126,
"macro_f1": 0.542222261428833,
"num_tokens": 4238248.0,
"repeat_count": 0.0,
- "routers_loss": 0.06423533707857132,
+ "routers_loss": 0.062252625823020935,
"skip_count": 4.0,
"step": 2630,
"text_loss": 0.5551572442054749
@@ -25002,13 +25002,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0008951249276707933,
- "loss": 0.012,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 4241042.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010294591775164008,
+ "routers_loss": 0.0011421777307987213,
"skip_count": 0.0,
"step": 2632,
"text_loss": 0.7092233896255493
@@ -25021,13 +25021,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0008949351849706261,
- "loss": 0.0122,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 4243939.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032732547260820866,
+ "routers_loss": 0.0032689040526747704,
"skip_count": 0.0,
"step": 2634,
"text_loss": 0.19925718009471893
@@ -25040,13 +25040,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0390625,
+ "grad_norm": 0.033935546875,
"learning_rate": 0.0008947452909305509,
- "loss": 0.0112,
+ "loss": 0.0109,
"macro_f1": 0.6666666865348816,
"num_tokens": 4247535.0,
"repeat_count": 1.0,
- "routers_loss": 0.0021109411027282476,
+ "routers_loss": 0.002066014800220728,
"skip_count": 0.0,
"step": 2636,
"text_loss": 0.5249715447425842
@@ -25059,13 +25059,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.11279296875,
+ "grad_norm": 0.09326171875,
"learning_rate": 0.0008945552456233356,
"loss": 0.0169,
"macro_f1": 0.8820862174034119,
"num_tokens": 4251441.0,
"repeat_count": 2.0,
- "routers_loss": 0.029545020312070847,
+ "routers_loss": 0.029332537204027176,
"skip_count": 2.0,
"step": 2638,
"text_loss": 0.19229578971862793
@@ -25078,13 +25078,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.078125,
"learning_rate": 0.0008943650491218058,
- "loss": 0.0083,
+ "loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4254314.0,
"repeat_count": 0.0,
- "routers_loss": 0.0075805820524692535,
+ "routers_loss": 0.0075911120511591434,
"skip_count": 0.0,
"step": 2640,
"text_loss": 0.27059751749038696
@@ -25097,13 +25097,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.0008941747014988453,
- "loss": 0.0155,
+ "loss": 0.0156,
"macro_f1": 0.3333333432674408,
"num_tokens": 4257442.0,
"repeat_count": 0.0,
- "routers_loss": 0.008832095190882683,
+ "routers_loss": 0.009030844084918499,
"skip_count": 0.0,
"step": 2642,
"text_loss": 0.36747801303863525
@@ -25116,13 +25116,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.123046875,
"learning_rate": 0.0008939842028273956,
- "loss": 0.011,
+ "loss": 0.0112,
"macro_f1": 0.6666666865348816,
"num_tokens": 4260386.0,
"repeat_count": 0.0,
- "routers_loss": 0.008952614851295948,
+ "routers_loss": 0.007844001986086369,
"skip_count": 1.0,
"step": 2644,
"text_loss": 0.6397647857666016
@@ -25135,13 +25135,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0250244140625,
+ "grad_norm": 0.0283203125,
"learning_rate": 0.0008937935531804562,
- "loss": 0.0075,
+ "loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 4263516.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017659157747402787,
+ "routers_loss": 0.0018789108144119382,
"skip_count": 0.0,
"step": 2646,
"text_loss": 0.4795534908771515
@@ -25154,13 +25154,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05419921875,
+ "grad_norm": 0.06494140625,
"learning_rate": 0.0008936027526310844,
- "loss": 0.0101,
+ "loss": 0.0098,
"macro_f1": 0.3272727429866791,
"num_tokens": 4266744.0,
"repeat_count": 0.0,
- "routers_loss": 0.03944230079650879,
+ "routers_loss": 0.0348590686917305,
"skip_count": 1.0,
"step": 2648,
"text_loss": 0.27691999077796936
@@ -25173,13 +25173,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.000893411801252395,
"loss": 0.015,
"macro_f1": 0.6666666865348816,
"num_tokens": 4269766.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037144431844353676,
+ "routers_loss": 0.004543309565633535,
"skip_count": 1.0,
"step": 2650,
"text_loss": 0.18867231905460358
@@ -25192,13 +25192,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0008932206991175615,
- "loss": 0.0143,
+ "loss": 0.0141,
"macro_f1": 0.6666666865348816,
"num_tokens": 4273513.0,
"repeat_count": 0.0,
- "routers_loss": 0.003659905167296529,
+ "routers_loss": 0.0035277456045150757,
"skip_count": 1.0,
"step": 2652,
"text_loss": 0.45613357424736023
@@ -25211,13 +25211,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.0008930294462998143,
"loss": 0.015,
"macro_f1": 0.6666666865348816,
"num_tokens": 4276878.0,
"repeat_count": 1.0,
- "routers_loss": 0.011676746420562267,
+ "routers_loss": 0.011337592266499996,
"skip_count": 0.0,
"step": 2654,
"text_loss": 0.24733254313468933
@@ -25230,13 +25230,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0008928380428724419,
- "loss": 0.0061,
+ "loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 4279915.0,
"repeat_count": 0.0,
- "routers_loss": 0.000998969655483961,
+ "routers_loss": 0.0010295971296727657,
"skip_count": 1.0,
"step": 2656,
"text_loss": 0.41722849011421204
@@ -25249,13 +25249,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0008926464889087903,
- "loss": 0.0109,
+ "loss": 0.0116,
"macro_f1": 0.6666666865348816,
"num_tokens": 4282888.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016260759439319372,
+ "routers_loss": 0.0017198545392602682,
"skip_count": 2.0,
"step": 2658,
"text_loss": 0.738322377204895
@@ -25268,13 +25268,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0008924547844822634,
- "loss": 0.0101,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 4285805.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010900370543822646,
+ "routers_loss": 0.001339946174994111,
"skip_count": 0.0,
"step": 2660,
"text_loss": 0.4802379906177521
@@ -25287,13 +25287,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.05322265625,
"learning_rate": 0.000892262929666323,
- "loss": 0.0101,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 4290282.0,
"repeat_count": 0.0,
- "routers_loss": 0.002275131642818451,
+ "routers_loss": 0.0022340165451169014,
"skip_count": 0.0,
"step": 2662,
"text_loss": 0.6503544449806213
@@ -25306,13 +25306,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0008920709245344878,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 4294106.0,
"repeat_count": 0.0,
- "routers_loss": 0.00575100164860487,
+ "routers_loss": 0.005288850050419569,
"skip_count": 1.0,
"step": 2664,
"text_loss": 0.12312037497758865
@@ -25325,13 +25325,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.038330078125,
+ "grad_norm": 0.041259765625,
"learning_rate": 0.0008918787691603347,
- "loss": 0.0122,
+ "loss": 0.0121,
"macro_f1": 0.6666666865348816,
"num_tokens": 4298013.0,
"repeat_count": 0.0,
- "routers_loss": 0.004139711149036884,
+ "routers_loss": 0.004259659443050623,
"skip_count": 1.0,
"step": 2666,
"text_loss": 0.3070000112056732
@@ -25344,13 +25344,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.000891686463617498,
- "loss": 0.0072,
+ "loss": 0.0069,
"macro_f1": 0.6666666865348816,
"num_tokens": 4300799.0,
"repeat_count": 0.0,
- "routers_loss": 0.008856390602886677,
+ "routers_loss": 0.009489355608820915,
"skip_count": 1.0,
"step": 2668,
"text_loss": 0.18535588681697845
@@ -25363,13 +25363,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0576171875,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.0008914940079796696,
- "loss": 0.0116,
+ "loss": 0.0114,
"macro_f1": 0.3333333432674408,
"num_tokens": 4304641.0,
"repeat_count": 0.0,
- "routers_loss": 0.002438562922179699,
+ "routers_loss": 0.0025417013093829155,
"skip_count": 0.0,
"step": 2670,
"text_loss": 0.482585072517395
@@ -25382,13 +25382,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0008913014023205988,
"loss": 0.0108,
"macro_f1": 0.3333333432674408,
"num_tokens": 4307462.0,
"repeat_count": 0.0,
- "routers_loss": 0.006435772404074669,
+ "routers_loss": 0.006371749565005302,
"skip_count": 0.0,
"step": 2672,
"text_loss": 0.7064456939697266
@@ -25401,13 +25401,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0008911086467140925,
- "loss": 0.0069,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 4310396.0,
"repeat_count": 0.0,
- "routers_loss": 0.002773779444396496,
+ "routers_loss": 0.0027512952219694853,
"skip_count": 0.0,
"step": 2674,
"text_loss": 0.23532851040363312
@@ -25420,13 +25420,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.05712890625,
"learning_rate": 0.000890915741234015,
- "loss": 0.0135,
+ "loss": 0.0133,
"macro_f1": 0.6666666865348816,
"num_tokens": 4314781.0,
"repeat_count": 0.0,
- "routers_loss": 0.00862761028110981,
+ "routers_loss": 0.008253013715147972,
"skip_count": 1.0,
"step": 2676,
"text_loss": 0.30950358510017395
@@ -25439,13 +25439,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033203125,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0008907226859542879,
- "loss": 0.0104,
+ "loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 4317988.0,
"repeat_count": 0.0,
- "routers_loss": 0.005587176885455847,
+ "routers_loss": 0.005409995559602976,
"skip_count": 2.0,
"step": 2678,
"text_loss": 0.4930732846260071
@@ -25458,13 +25458,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.042236328125,
+ "grad_norm": 0.060546875,
"learning_rate": 0.0008905294809488907,
- "loss": 0.0082,
+ "loss": 0.0084,
"macro_f1": 1.0,
"num_tokens": 4321014.0,
"repeat_count": 1.0,
- "routers_loss": 0.0033104203175753355,
+ "routers_loss": 0.0029942214023321867,
"skip_count": 1.0,
"step": 2680,
"text_loss": 0.6224040389060974
@@ -25477,13 +25477,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0008903361262918595,
- "loss": 0.0117,
+ "loss": 0.0115,
"macro_f1": 0.6666666865348816,
"num_tokens": 4324268.0,
"repeat_count": 0.0,
- "routers_loss": 0.008205405436456203,
+ "routers_loss": 0.008411120623350143,
"skip_count": 1.0,
"step": 2682,
"text_loss": 0.16296671330928802
@@ -25496,13 +25496,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.052734375,
+ "grad_norm": 0.05126953125,
"learning_rate": 0.0008901426220572884,
- "loss": 0.0142,
+ "loss": 0.0138,
"macro_f1": 1.0,
"num_tokens": 4327494.0,
"repeat_count": 2.0,
- "routers_loss": 0.007884894497692585,
+ "routers_loss": 0.01039006095379591,
"skip_count": 4.0,
"step": 2684,
"text_loss": 0.43866512179374695
@@ -25515,13 +25515,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.060791015625,
"learning_rate": 0.0008899489683193286,
- "loss": 0.011,
+ "loss": 0.0107,
"macro_f1": 0.3333333432674408,
"num_tokens": 4330936.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009336905204690993,
+ "routers_loss": 0.0009329111780971289,
"skip_count": 0.0,
"step": 2686,
"text_loss": 0.44250962138175964
@@ -25534,13 +25534,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0810546875,
+ "grad_norm": 0.07421875,
"learning_rate": 0.0008897551651521885,
"loss": 0.0111,
"macro_f1": 0.3333333432674408,
"num_tokens": 4334123.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033622782211750746,
+ "routers_loss": 0.003197216661646962,
"skip_count": 0.0,
"step": 2688,
"text_loss": 0.48313501477241516
@@ -25553,13 +25553,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07763671875,
+ "grad_norm": 0.09716796875,
"learning_rate": 0.0008895612126301339,
"loss": 0.0157,
"macro_f1": 0.3333333432674408,
"num_tokens": 4337610.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034563415683805943,
+ "routers_loss": 0.0033548236824572086,
"skip_count": 0.0,
"step": 2690,
"text_loss": 0.4715327322483063
@@ -25572,13 +25572,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0008893671108274877,
- "loss": 0.0115,
+ "loss": 0.0118,
"macro_f1": 0.3333333432674408,
"num_tokens": 4341026.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022277699317783117,
+ "routers_loss": 0.0024757643695920706,
"skip_count": 0.0,
"step": 2692,
"text_loss": 0.43402785062789917
@@ -25591,13 +25591,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0008891728598186302,
- "loss": 0.011,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 4344422.0,
"repeat_count": 0.0,
- "routers_loss": 0.003892304375767708,
+ "routers_loss": 0.003317243419587612,
"skip_count": 0.0,
"step": 2694,
"text_loss": 0.8498559594154358
@@ -25610,13 +25610,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.0380859375,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0008889784596779986,
- "loss": 0.0092,
+ "loss": 0.009,
"macro_f1": 0.5934640765190125,
"num_tokens": 4347507.0,
"repeat_count": 0.0,
- "routers_loss": 0.015058296732604504,
+ "routers_loss": 0.01577926240861416,
"skip_count": 3.0,
"step": 2696,
"text_loss": 0.5646669864654541
@@ -25629,13 +25629,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.11328125,
"learning_rate": 0.0008887839104800876,
- "loss": 0.0118,
+ "loss": 0.0124,
"macro_f1": 0.3333333432674408,
"num_tokens": 4350414.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033561652526259422,
+ "routers_loss": 0.002953822258859873,
"skip_count": 0.0,
"step": 2698,
"text_loss": 0.5145012140274048
@@ -25648,13 +25648,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.0008885892122994486,
- "loss": 0.0116,
+ "loss": 0.0112,
"macro_f1": 0.3333333432674408,
"num_tokens": 4354110.0,
"repeat_count": 0.0,
- "routers_loss": 0.0062471418641507626,
+ "routers_loss": 0.005849295295774937,
"skip_count": 0.0,
"step": 2700,
"text_loss": 0.580982506275177
@@ -25667,13 +25667,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008883943652106903,
"loss": 0.0086,
"macro_f1": 1.0,
"num_tokens": 4357323.0,
"repeat_count": 1.0,
- "routers_loss": 0.011802209541201591,
+ "routers_loss": 0.012347398325800896,
"skip_count": 2.0,
"step": 2702,
"text_loss": 0.2234988808631897
@@ -25686,13 +25686,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.0008881993692884787,
- "loss": 0.0132,
+ "loss": 0.0128,
"macro_f1": 0.6666666865348816,
"num_tokens": 4360228.0,
"repeat_count": 0.0,
- "routers_loss": 0.0041528744623064995,
+ "routers_loss": 0.003574999049305916,
"skip_count": 1.0,
"step": 2704,
"text_loss": 0.4261806607246399
@@ -25705,13 +25705,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0008880042246075365,
- "loss": 0.0094,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 4363905.0,
"repeat_count": 0.0,
- "routers_loss": 0.003151095937937498,
+ "routers_loss": 0.0031574300955981016,
"skip_count": 0.0,
"step": 2706,
"text_loss": 0.691118061542511
@@ -25724,13 +25724,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008878089312426433,
"loss": 0.0091,
"macro_f1": 0.3333333432674408,
"num_tokens": 4366736.0,
"repeat_count": 0.0,
- "routers_loss": 0.003142676781862974,
+ "routers_loss": 0.003195564029738307,
"skip_count": 0.0,
"step": 2708,
"text_loss": 0.613926112651825
@@ -25743,13 +25743,13 @@
"f1_execute": 0.9583333134651184,
"f1_repeat": 0.0,
"f1_skip": 0.75,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.054443359375,
"learning_rate": 0.0008876134892686363,
"loss": 0.011,
"macro_f1": 0.5694444179534912,
"num_tokens": 4370146.0,
"repeat_count": 0.0,
- "routers_loss": 0.032964516431093216,
+ "routers_loss": 0.038784291595220566,
"skip_count": 5.0,
"step": 2710,
"text_loss": 0.2723451852798462
@@ -25762,13 +25762,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.000887417898760409,
- "loss": 0.0123,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 4373653.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006848900229670107,
+ "routers_loss": 0.0006457131239585578,
"skip_count": 0.0,
"step": 2712,
"text_loss": 0.31667640805244446
@@ -25781,13 +25781,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.10498046875,
"learning_rate": 0.000887222159792912,
- "loss": 0.0156,
+ "loss": 0.0155,
"macro_f1": 0.6603773832321167,
"num_tokens": 4376993.0,
"repeat_count": 1.0,
- "routers_loss": 0.04388813674449921,
+ "routers_loss": 0.045078590512275696,
"skip_count": 1.0,
"step": 2714,
"text_loss": 0.5872798562049866
@@ -25800,13 +25800,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.034912109375,
"learning_rate": 0.0008870262724411528,
- "loss": 0.0122,
+ "loss": 0.012,
"macro_f1": 0.3333333432674408,
"num_tokens": 4380160.0,
"repeat_count": 0.0,
- "routers_loss": 0.003538437420502305,
+ "routers_loss": 0.003628545207902789,
"skip_count": 0.0,
"step": 2716,
"text_loss": 0.7468157410621643
@@ -25819,13 +25819,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1328125,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0008868302367801962,
- "loss": 0.0123,
+ "loss": 0.0118,
"macro_f1": 0.6598639488220215,
"num_tokens": 4383100.0,
"repeat_count": 1.0,
- "routers_loss": 0.05479869619011879,
+ "routers_loss": 0.05404464527964592,
"skip_count": 3.0,
"step": 2718,
"text_loss": 0.2970244884490967
@@ -25838,13 +25838,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0008866340528851629,
"loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 4386700.0,
"repeat_count": 0.0,
- "routers_loss": 0.0070296903140842915,
+ "routers_loss": 0.007000274024903774,
"skip_count": 0.0,
"step": 2720,
"text_loss": 0.34521186351776123
@@ -25857,13 +25857,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05810546875,
+ "grad_norm": 0.052978515625,
"learning_rate": 0.0008864377208312313,
- "loss": 0.0085,
+ "loss": 0.0082,
"macro_f1": 0.8823530077934265,
"num_tokens": 4390299.0,
"repeat_count": 1.0,
- "routers_loss": 0.02051853947341442,
+ "routers_loss": 0.02025366574525833,
"skip_count": 2.0,
"step": 2722,
"text_loss": 1.0536936521530151
@@ -25876,13 +25876,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.000886241240693636,
- "loss": 0.0096,
+ "loss": 0.0098,
"macro_f1": 0.3333333432674408,
"num_tokens": 4393353.0,
"repeat_count": 0.0,
- "routers_loss": 0.002662461483851075,
+ "routers_loss": 0.00251673418097198,
"skip_count": 0.0,
"step": 2724,
"text_loss": 0.5678093433380127
@@ -25895,13 +25895,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.05615234375,
+ "grad_norm": 0.052001953125,
"learning_rate": 0.0008860446125476686,
"loss": 0.0135,
"macro_f1": 0.6666666865348816,
"num_tokens": 4396446.0,
"repeat_count": 1.0,
- "routers_loss": 0.009321866557002068,
+ "routers_loss": 0.009532532654702663,
"skip_count": 0.0,
"step": 2726,
"text_loss": 0.23775041103363037
@@ -25914,13 +25914,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.109375,
+ "grad_norm": 0.091796875,
"learning_rate": 0.0008858478364686776,
- "loss": 0.0102,
+ "loss": 0.0099,
"macro_f1": 0.6666666865348816,
"num_tokens": 4399977.0,
"repeat_count": 1.0,
- "routers_loss": 0.01029124017804861,
+ "routers_loss": 0.008062181062996387,
"skip_count": 0.0,
"step": 2728,
"text_loss": 0.18888695538043976
@@ -25933,13 +25933,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.0008856509125320678,
- "loss": 0.0082,
+ "loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 4404406.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008023424888961017,
+ "routers_loss": 0.0007731119985692203,
"skip_count": 0.0,
"step": 2730,
"text_loss": 0.47331541776657104
@@ -25952,13 +25952,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0517578125,
+ "grad_norm": 0.0498046875,
"learning_rate": 0.0008854538408133006,
- "loss": 0.0115,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 4407165.0,
"repeat_count": 0.0,
- "routers_loss": 0.003058656118810177,
+ "routers_loss": 0.003115242812782526,
"skip_count": 1.0,
"step": 2732,
"text_loss": 0.491370290517807
@@ -25971,13 +25971,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0008852566213878947,
- "loss": 0.0082,
+ "loss": 0.0081,
"macro_f1": 0.3333333432674408,
"num_tokens": 4410101.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010282890871167183,
+ "routers_loss": 0.0008958528051152825,
"skip_count": 0.0,
"step": 2734,
"text_loss": 0.42188262939453125
@@ -25990,13 +25990,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.07421875,
+ "grad_norm": 0.07763671875,
"learning_rate": 0.0008850592543314246,
- "loss": 0.0123,
+ "loss": 0.0118,
"macro_f1": 1.0,
"num_tokens": 4413015.0,
"repeat_count": 1.0,
- "routers_loss": 0.014785367995500565,
+ "routers_loss": 0.01139112375676632,
"skip_count": 1.0,
"step": 2736,
"text_loss": 0.4716498553752899
@@ -26009,13 +26009,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0654296875,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0008848617397195218,
- "loss": 0.0089,
+ "loss": 0.0084,
"macro_f1": 0.6603773832321167,
"num_tokens": 4416404.0,
"repeat_count": 1.0,
- "routers_loss": 0.017717093229293823,
+ "routers_loss": 0.01609630137681961,
"skip_count": 1.0,
"step": 2738,
"text_loss": 0.19490821659564972
@@ -26028,13 +26028,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0008846640776278745,
- "loss": 0.0067,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 4419408.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011861984385177493,
+ "routers_loss": 0.001489170710556209,
"skip_count": 0.0,
"step": 2740,
"text_loss": 0.6443108320236206
@@ -26047,13 +26047,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0008844662681322269,
"loss": 0.0144,
"macro_f1": 0.6666666865348816,
"num_tokens": 4422067.0,
"repeat_count": 1.0,
- "routers_loss": 0.0013843412743881345,
+ "routers_loss": 0.0014755792217329144,
"skip_count": 0.0,
"step": 2742,
"text_loss": 0.9150356650352478
@@ -26068,11 +26068,11 @@
"f1_skip": 1.0,
"grad_norm": 0.05078125,
"learning_rate": 0.0008842683113083801,
- "loss": 0.0154,
+ "loss": 0.0149,
"macro_f1": 0.6666666865348816,
"num_tokens": 4425647.0,
"repeat_count": 0.0,
- "routers_loss": 0.010318896733224392,
+ "routers_loss": 0.008962674997746944,
"skip_count": 1.0,
"step": 2744,
"text_loss": 0.7103227972984314
@@ -26085,13 +26085,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0008840702072321915,
- "loss": 0.0108,
+ "loss": 0.0104,
"macro_f1": 0.6598639488220215,
"num_tokens": 4428855.0,
"repeat_count": 1.0,
- "routers_loss": 0.029359478503465652,
+ "routers_loss": 0.02554207295179367,
"skip_count": 3.0,
"step": 2746,
"text_loss": 0.27141591906547546
@@ -26104,13 +26104,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0234375,
+ "grad_norm": 0.0230712890625,
"learning_rate": 0.0008838719559795751,
"loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 4432838.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014995118835940957,
+ "routers_loss": 0.0011747616808861494,
"skip_count": 0.0,
"step": 2748,
"text_loss": 0.4007738530635834
@@ -26123,13 +26123,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.03515625,
+ "grad_norm": 0.03466796875,
"learning_rate": 0.0008836735576265009,
- "loss": 0.0074,
+ "loss": 0.0073,
"macro_f1": 0.5492662787437439,
"num_tokens": 4435793.0,
"repeat_count": 0.0,
- "routers_loss": 0.017950648441910744,
+ "routers_loss": 0.017564335837960243,
"skip_count": 2.0,
"step": 2750,
"text_loss": 0.5972410440444946
@@ -26142,13 +26142,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.044921875,
"learning_rate": 0.0008834750122489956,
- "loss": 0.0083,
+ "loss": 0.0086,
"macro_f1": 0.6666666865348816,
"num_tokens": 4438871.0,
"repeat_count": 1.0,
- "routers_loss": 0.0069067892618477345,
+ "routers_loss": 0.007004009559750557,
"skip_count": 0.0,
"step": 2752,
"text_loss": 0.2294853925704956
@@ -26161,13 +26161,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051513671875,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0008832763199231423,
- "loss": 0.0101,
+ "loss": 0.0107,
"macro_f1": 0.3333333432674408,
"num_tokens": 4441846.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013944554375484586,
+ "routers_loss": 0.0014562139986082911,
"skip_count": 0.0,
"step": 2754,
"text_loss": 0.722432017326355
@@ -26180,13 +26180,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0008830774807250802,
"loss": 0.013,
"macro_f1": 0.3272727429866791,
"num_tokens": 4444786.0,
"repeat_count": 1.0,
- "routers_loss": 0.025158623233437538,
+ "routers_loss": 0.024773593991994858,
"skip_count": 0.0,
"step": 2756,
"text_loss": 0.507905125617981
@@ -26199,13 +26199,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05419921875,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0008828784947310049,
- "loss": 0.0131,
+ "loss": 0.0129,
"macro_f1": 0.8823530077934265,
"num_tokens": 4448442.0,
"repeat_count": 1.0,
- "routers_loss": 0.05205477401614189,
+ "routers_loss": 0.04959975928068161,
"skip_count": 2.0,
"step": 2758,
"text_loss": 0.3617522418498993
@@ -26218,13 +26218,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.000882679362017168,
"loss": 0.0149,
"macro_f1": 1.0,
"num_tokens": 4451401.0,
"repeat_count": 1.0,
- "routers_loss": 0.005898742936551571,
+ "routers_loss": 0.005783245898783207,
"skip_count": 2.0,
"step": 2760,
"text_loss": 0.49187400937080383
@@ -26237,13 +26237,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0008824800826598778,
- "loss": 0.0129,
+ "loss": 0.0127,
"macro_f1": 0.3333333432674408,
"num_tokens": 4454537.0,
"repeat_count": 0.0,
- "routers_loss": 0.006758298724889755,
+ "routers_loss": 0.00656260596588254,
"skip_count": 0.0,
"step": 2762,
"text_loss": 0.6823583245277405
@@ -26256,13 +26256,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.0546875,
"learning_rate": 0.0008822806567354983,
- "loss": 0.0109,
+ "loss": 0.0111,
"macro_f1": 0.6666666865348816,
"num_tokens": 4457706.0,
"repeat_count": 1.0,
- "routers_loss": 0.005730919074267149,
+ "routers_loss": 0.005298966076225042,
"skip_count": 0.0,
"step": 2764,
"text_loss": 0.554322361946106
@@ -26275,13 +26275,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051025390625,
+ "grad_norm": 0.046630859375,
"learning_rate": 0.0008820810843204501,
- "loss": 0.0098,
+ "loss": 0.0096,
"macro_f1": 0.3272727429866791,
"num_tokens": 4460710.0,
"repeat_count": 0.0,
- "routers_loss": 0.03390989825129509,
+ "routers_loss": 0.03164982795715332,
"skip_count": 1.0,
"step": 2766,
"text_loss": 0.1656961441040039
@@ -26294,13 +26294,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0849609375,
+ "grad_norm": 0.072265625,
"learning_rate": 0.0008818813654912095,
- "loss": 0.0165,
+ "loss": 0.0162,
"macro_f1": 0.3333333432674408,
"num_tokens": 4464001.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007058497285470366,
+ "routers_loss": 0.000715116853825748,
"skip_count": 0.0,
"step": 2768,
"text_loss": 0.5818144083023071
@@ -26313,13 +26313,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.058837890625,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0008816815003243093,
- "loss": 0.0136,
+ "loss": 0.0133,
"macro_f1": 0.3333333432674408,
"num_tokens": 4467364.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027468691114336252,
+ "routers_loss": 0.002851625671610236,
"skip_count": 0.0,
"step": 2770,
"text_loss": 0.6068631410598755
@@ -26332,13 +26332,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.033203125,
"learning_rate": 0.0008814814888963383,
"loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 4470681.0,
"repeat_count": 0.0,
- "routers_loss": 0.00443003186956048,
+ "routers_loss": 0.004729873035103083,
"skip_count": 1.0,
"step": 2772,
"text_loss": 0.5386646389961243
@@ -26351,13 +26351,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.04296875,
"learning_rate": 0.000881281331283941,
"loss": 0.0091,
"macro_f1": 0.6666666865348816,
"num_tokens": 4473734.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031219064258038998,
+ "routers_loss": 0.0031853127293288708,
"skip_count": 1.0,
"step": 2774,
"text_loss": 0.5695263147354126
@@ -26370,13 +26370,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03369140625,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0008810810275638182,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 4478404.0,
"repeat_count": 0.0,
- "routers_loss": 0.000846695271320641,
+ "routers_loss": 0.0008977465913631022,
"skip_count": 0.0,
"step": 2776,
"text_loss": 0.4750773310661316
@@ -26389,13 +26389,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0008808805778127269,
- "loss": 0.0075,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 4481287.0,
"repeat_count": 0.0,
- "routers_loss": 0.0074167875573039055,
+ "routers_loss": 0.00469845999032259,
"skip_count": 0.0,
"step": 2778,
"text_loss": 0.14078612625598907
@@ -26408,13 +26408,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.04296875,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.0008806799821074796,
- "loss": 0.0078,
+ "loss": 0.0079,
"macro_f1": 0.5492662787437439,
"num_tokens": 4483929.0,
"repeat_count": 0.0,
- "routers_loss": 0.018358726054430008,
+ "routers_loss": 0.01789761893451214,
"skip_count": 2.0,
"step": 2780,
"text_loss": 0.2167191207408905
@@ -26427,13 +26427,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0008804792405249451,
- "loss": 0.0124,
+ "loss": 0.0123,
"macro_f1": 0.3333333432674408,
"num_tokens": 4487468.0,
"repeat_count": 0.0,
- "routers_loss": 0.001094152103178203,
+ "routers_loss": 0.001018838956952095,
"skip_count": 0.0,
"step": 2782,
"text_loss": 0.5424665212631226
@@ -26446,13 +26446,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.000880278353142048,
- "loss": 0.0075,
+ "loss": 0.0077,
"macro_f1": 0.8200000524520874,
"num_tokens": 4490942.0,
"repeat_count": 1.0,
- "routers_loss": 0.03035641834139824,
+ "routers_loss": 0.03260354697704315,
"skip_count": 3.0,
"step": 2784,
"text_loss": 0.20994654297828674
@@ -26465,13 +26465,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05517578125,
+ "grad_norm": 0.05322265625,
"learning_rate": 0.0008800773200357683,
- "loss": 0.0123,
+ "loss": 0.0122,
"macro_f1": 0.3333333432674408,
"num_tokens": 4493986.0,
"repeat_count": 0.0,
- "routers_loss": 0.002394269686192274,
+ "routers_loss": 0.003019835101440549,
"skip_count": 0.0,
"step": 2786,
"text_loss": 0.5709528923034668
@@ -26484,13 +26484,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.034423828125,
"learning_rate": 0.0008798761412831429,
"loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 4498232.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028274122159928083,
+ "routers_loss": 0.00285192858427763,
"skip_count": 0.0,
"step": 2788,
"text_loss": 0.5103896260261536
@@ -26503,13 +26503,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.044921875,
"learning_rate": 0.0008796748169612634,
- "loss": 0.0088,
+ "loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 4501231.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012642849469557405,
+ "routers_loss": 0.0012469831854104996,
"skip_count": 0.0,
"step": 2790,
"text_loss": 0.43669697642326355
@@ -26522,13 +26522,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03662109375,
+ "grad_norm": 0.039794921875,
"learning_rate": 0.0008794733471472778,
"loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 4504208.0,
"repeat_count": 0.0,
- "routers_loss": 0.010966303758323193,
+ "routers_loss": 0.011512776836752892,
"skip_count": 1.0,
"step": 2792,
"text_loss": 0.2299770563840866
@@ -26541,13 +26541,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.03564453125,
"learning_rate": 0.0008792717319183899,
- "loss": 0.0064,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 4507013.0,
"repeat_count": 0.0,
- "routers_loss": 0.008194026537239552,
+ "routers_loss": 0.00834917277097702,
"skip_count": 0.0,
"step": 2794,
"text_loss": 0.2130603939294815
@@ -26560,13 +26560,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0283203125,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0008790699713518587,
- "loss": 0.008,
+ "loss": 0.0078,
"macro_f1": 0.6666666865348816,
"num_tokens": 4510286.0,
"repeat_count": 0.0,
- "routers_loss": 0.008828429505228996,
+ "routers_loss": 0.008616939187049866,
"skip_count": 2.0,
"step": 2796,
"text_loss": 0.4377101957798004
@@ -26579,13 +26579,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.02783203125,
"learning_rate": 0.0008788680655249994,
- "loss": 0.007,
+ "loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 4513762.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038230866193771362,
+ "routers_loss": 0.003408568911254406,
"skip_count": 0.0,
"step": 2798,
"text_loss": 0.435138463973999
@@ -26598,13 +26598,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0311279296875,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0008786660145151826,
- "loss": 0.009,
+ "loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 4516696.0,
"repeat_count": 1.0,
- "routers_loss": 0.0031088131945580244,
+ "routers_loss": 0.0029398901388049126,
"skip_count": 0.0,
"step": 2800,
"text_loss": 0.3195655047893524
@@ -26617,13 +26617,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.033203125,
"learning_rate": 0.0008784638183998348,
- "loss": 0.0083,
+ "loss": 0.0081,
"macro_f1": 0.3333333432674408,
"num_tokens": 4519760.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014194221002981067,
+ "routers_loss": 0.0013777425047010183,
"skip_count": 0.0,
"step": 2802,
"text_loss": 0.8129430413246155
@@ -26636,13 +26636,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.0008782614772564379,
- "loss": 0.0099,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 4522106.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031931858975440264,
+ "routers_loss": 0.0031694830395281315,
"skip_count": 0.0,
"step": 2804,
"text_loss": 0.18083660304546356
@@ -26655,13 +26655,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0008780589911625293,
- "loss": 0.0117,
+ "loss": 0.0114,
"macro_f1": 0.3333333432674408,
"num_tokens": 4525743.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021834284998476505,
+ "routers_loss": 0.002161208540201187,
"skip_count": 0.0,
"step": 2806,
"text_loss": 0.8228182792663574
@@ -26674,13 +26674,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0703125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0008778563601957021,
- "loss": 0.0098,
+ "loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 4529573.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035390176344662905,
+ "routers_loss": 0.0028444856870919466,
"skip_count": 1.0,
"step": 2808,
"text_loss": 0.3715563118457794
@@ -26693,13 +26693,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04296875,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0008776535844336049,
- "loss": 0.0095,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 4532452.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038604713045060635,
+ "routers_loss": 0.003807213855907321,
"skip_count": 0.0,
"step": 2810,
"text_loss": 0.6012523174285889
@@ -26712,13 +26712,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0008774506639539417,
- "loss": 0.0072,
+ "loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 4536077.0,
"repeat_count": 0.0,
- "routers_loss": 0.00669970503076911,
+ "routers_loss": 0.006698979996144772,
"skip_count": 0.0,
"step": 2812,
"text_loss": 0.27097949385643005
@@ -26731,13 +26731,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0008772475988344722,
- "loss": 0.0132,
+ "loss": 0.013,
"macro_f1": 0.6666666865348816,
"num_tokens": 4539057.0,
"repeat_count": 0.0,
- "routers_loss": 0.004594485275447369,
+ "routers_loss": 0.004849409218877554,
"skip_count": 1.0,
"step": 2814,
"text_loss": 1.026973843574524
@@ -26750,13 +26750,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0008770443891530109,
- "loss": 0.0116,
+ "loss": 0.0115,
"macro_f1": 0.5934640765190125,
"num_tokens": 4542253.0,
"repeat_count": 0.0,
- "routers_loss": 0.01891930215060711,
+ "routers_loss": 0.019148651510477066,
"skip_count": 3.0,
"step": 2816,
"text_loss": 0.2717585563659668
@@ -26769,13 +26769,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.054931640625,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0008768410349874286,
"loss": 0.0098,
"macro_f1": 0.6601307392120361,
"num_tokens": 4545047.0,
"repeat_count": 1.0,
- "routers_loss": 0.0247862096875906,
+ "routers_loss": 0.02231316640973091,
"skip_count": 2.0,
"step": 2818,
"text_loss": 0.274346262216568
@@ -26788,13 +26788,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0008766375364156508,
"loss": 0.0091,
"macro_f1": 0.6666666865348816,
"num_tokens": 4548371.0,
"repeat_count": 0.0,
- "routers_loss": 0.008566800504922867,
+ "routers_loss": 0.008014129474759102,
"skip_count": 2.0,
"step": 2820,
"text_loss": 0.22850871086120605
@@ -26807,13 +26807,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041748046875,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.0008764338935156586,
"loss": 0.0095,
"macro_f1": 0.3333333432674408,
"num_tokens": 4551276.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013546474510803819,
+ "routers_loss": 0.0014544493751600385,
"skip_count": 0.0,
"step": 2822,
"text_loss": 0.6308462023735046
@@ -26826,13 +26826,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.0390625,
"learning_rate": 0.000876230106365488,
- "loss": 0.0122,
+ "loss": 0.0123,
"macro_f1": 0.6666666865348816,
"num_tokens": 4554143.0,
"repeat_count": 0.0,
- "routers_loss": 0.009204468689858913,
+ "routers_loss": 0.00818584579974413,
"skip_count": 3.0,
"step": 2824,
"text_loss": 0.3484207093715668
@@ -26845,13 +26845,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03271484375,
+ "grad_norm": 0.0264892578125,
"learning_rate": 0.0008760261750432312,
- "loss": 0.0067,
+ "loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 4557256.0,
"repeat_count": 0.0,
- "routers_loss": 0.00787584763020277,
+ "routers_loss": 0.006275608204305172,
"skip_count": 3.0,
"step": 2826,
"text_loss": 0.1927330046892166
@@ -26864,13 +26864,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0008758220996270348,
- "loss": 0.0102,
+ "loss": 0.0103,
"macro_f1": 1.0,
"num_tokens": 4560202.0,
"repeat_count": 2.0,
- "routers_loss": 0.0057869357988238335,
+ "routers_loss": 0.0055974251590669155,
"skip_count": 2.0,
"step": 2828,
"text_loss": 0.7796496748924255
@@ -26883,13 +26883,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.046142578125,
"learning_rate": 0.0008756178801951007,
- "loss": 0.0128,
+ "loss": 0.0129,
"macro_f1": 0.3333333432674408,
"num_tokens": 4563508.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018274546600878239,
+ "routers_loss": 0.0019799957517534494,
"skip_count": 0.0,
"step": 2830,
"text_loss": 0.49633297324180603
@@ -26902,13 +26902,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.0458984375,
"learning_rate": 0.0008754135168256865,
- "loss": 0.0094,
+ "loss": 0.0095,
"macro_f1": 0.3333333432674408,
"num_tokens": 4566776.0,
"repeat_count": 0.0,
- "routers_loss": 0.004527154844254255,
+ "routers_loss": 0.004538947716355324,
"skip_count": 0.0,
"step": 2832,
"text_loss": 0.5346745252609253
@@ -26921,13 +26921,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0008752090095971044,
"loss": 0.0091,
"macro_f1": 0.3333333432674408,
"num_tokens": 4569787.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018263199599459767,
+ "routers_loss": 0.001663343166001141,
"skip_count": 0.0,
"step": 2834,
"text_loss": 0.5524004697799683
@@ -26940,13 +26940,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.000875004358587722,
- "loss": 0.0088,
+ "loss": 0.0087,
"macro_f1": 0.3333333432674408,
"num_tokens": 4572813.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022649941965937614,
+ "routers_loss": 0.0022988212294876575,
"skip_count": 0.0,
"step": 2836,
"text_loss": 0.4232870042324066
@@ -26959,13 +26959,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.038330078125,
"learning_rate": 0.000874799563875962,
"loss": 0.0083,
"macro_f1": 0.6666666865348816,
"num_tokens": 4575563.0,
"repeat_count": 0.0,
- "routers_loss": 0.00791149027645588,
+ "routers_loss": 0.007781553082168102,
"skip_count": 1.0,
"step": 2838,
"text_loss": 0.19239822030067444
@@ -26978,13 +26978,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.03515625,
"learning_rate": 0.0008745946255403021,
"loss": 0.0072,
"macro_f1": 0.5492662787437439,
"num_tokens": 4578117.0,
"repeat_count": 0.0,
- "routers_loss": 0.016813624650239944,
+ "routers_loss": 0.01872488670051098,
"skip_count": 2.0,
"step": 2840,
"text_loss": 0.2148810178041458
@@ -26999,11 +26999,11 @@
"f1_skip": 1.0,
"grad_norm": 0.04296875,
"learning_rate": 0.0008743895436592749,
- "loss": 0.0079,
+ "loss": 0.0078,
"macro_f1": 1.0,
"num_tokens": 4582330.0,
"repeat_count": 1.0,
- "routers_loss": 0.004429332446306944,
+ "routers_loss": 0.005634195636957884,
"skip_count": 1.0,
"step": 2842,
"text_loss": 0.4929640591144562
@@ -27016,13 +27016,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.048583984375,
"learning_rate": 0.0008741843183114685,
- "loss": 0.0084,
+ "loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4585765.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007147722644731402,
+ "routers_loss": 0.0008928569150157273,
"skip_count": 0.0,
"step": 2844,
"text_loss": 0.32702967524528503
@@ -27035,13 +27035,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.044189453125,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0008739789495755253,
- "loss": 0.0092,
+ "loss": 0.0094,
"macro_f1": 0.6666666865348816,
"num_tokens": 4589000.0,
"repeat_count": 0.0,
- "routers_loss": 0.015438012778759003,
+ "routers_loss": 0.014715569093823433,
"skip_count": 4.0,
"step": 2846,
"text_loss": 0.25125816464424133
@@ -27054,13 +27054,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.0008737734375301433,
- "loss": 0.0138,
+ "loss": 0.0135,
"macro_f1": 0.3333333432674408,
"num_tokens": 4592391.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015892626252025366,
+ "routers_loss": 0.0017551190685480833,
"skip_count": 0.0,
"step": 2848,
"text_loss": 0.6595172882080078
@@ -27073,13 +27073,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.027099609375,
"learning_rate": 0.0008735677822540749,
- "loss": 0.0086,
+ "loss": 0.0085,
"macro_f1": 0.3333333432674408,
"num_tokens": 4596662.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006934175617061555,
+ "routers_loss": 0.0006456313421949744,
"skip_count": 0.0,
"step": 2850,
"text_loss": 0.6290773153305054
@@ -27092,13 +27092,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.0008733619838261276,
"loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 4599682.0,
"repeat_count": 0.0,
- "routers_loss": 0.006811433006078005,
+ "routers_loss": 0.00765060493722558,
"skip_count": 2.0,
"step": 2852,
"text_loss": 0.3268161416053772
@@ -27111,13 +27111,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0008731560423251637,
- "loss": 0.0104,
+ "loss": 0.01,
"macro_f1": 1.0,
"num_tokens": 4603324.0,
"repeat_count": 1.0,
- "routers_loss": 0.012574959546327591,
+ "routers_loss": 0.01161442045122385,
"skip_count": 2.0,
"step": 2854,
"text_loss": 0.3029932975769043
@@ -27130,13 +27130,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 1.0,
"f1_skip": 0.888888955116272,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008729499578301005,
"loss": 0.0098,
"macro_f1": 0.9555556178092957,
"num_tokens": 4606975.0,
"repeat_count": 1.0,
- "routers_loss": 0.01913273334503174,
+ "routers_loss": 0.02055389992892742,
"skip_count": 5.0,
"step": 2856,
"text_loss": 0.6268532872200012
@@ -27149,13 +27149,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.05078125,
"learning_rate": 0.00087274373041991,
- "loss": 0.0082,
+ "loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 4609629.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012737065553665161,
+ "routers_loss": 0.0013911726418882608,
"skip_count": 0.0,
"step": 2858,
"text_loss": 0.534355640411377
@@ -27168,13 +27168,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0008725373601736188,
- "loss": 0.0079,
+ "loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 4612913.0,
"repeat_count": 2.0,
- "routers_loss": 0.009088932536542416,
+ "routers_loss": 0.01010701060295105,
"skip_count": 0.0,
"step": 2860,
"text_loss": 0.3391380310058594
@@ -27187,13 +27187,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0260009765625,
+ "grad_norm": 0.0255126953125,
"learning_rate": 0.0008723308471703085,
- "loss": 0.0078,
+ "loss": 0.008,
"macro_f1": 0.6666666865348816,
"num_tokens": 4616718.0,
"repeat_count": 0.0,
- "routers_loss": 0.006364458240568638,
+ "routers_loss": 0.005969462916254997,
"skip_count": 1.0,
"step": 2862,
"text_loss": 0.47250816226005554
@@ -27206,13 +27206,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047607421875,
+ "grad_norm": 0.046630859375,
"learning_rate": 0.0008721241914891152,
- "loss": 0.0084,
+ "loss": 0.0083,
"macro_f1": 0.3333333432674408,
"num_tokens": 4619680.0,
"repeat_count": 0.0,
- "routers_loss": 0.002686808817088604,
+ "routers_loss": 0.0027780034579336643,
"skip_count": 0.0,
"step": 2864,
"text_loss": 0.3249278664588928
@@ -27225,13 +27225,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0008719173932092295,
- "loss": 0.0047,
+ "loss": 0.0044,
"macro_f1": 0.3333333432674408,
"num_tokens": 4622700.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018892486114054918,
+ "routers_loss": 0.0015912104863673449,
"skip_count": 0.0,
"step": 2866,
"text_loss": 0.7789985537528992
@@ -27244,13 +27244,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.049072265625,
+ "grad_norm": 0.05126953125,
"learning_rate": 0.0008717104524098973,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 4626637.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035258810967206955,
+ "routers_loss": 0.0036539011634886265,
"skip_count": 0.0,
"step": 2868,
"text_loss": 0.619088351726532
@@ -27263,13 +27263,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.10400390625,
"learning_rate": 0.0008715033691704187,
- "loss": 0.0121,
+ "loss": 0.0118,
"macro_f1": 0.6666666865348816,
"num_tokens": 4629863.0,
"repeat_count": 0.0,
- "routers_loss": 0.007305602077394724,
+ "routers_loss": 0.008402476087212563,
"skip_count": 1.0,
"step": 2870,
"text_loss": 0.5550018548965454
@@ -27282,13 +27282,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0008712961435701479,
- "loss": 0.0162,
+ "loss": 0.0161,
"macro_f1": 0.6666666865348816,
"num_tokens": 4632657.0,
"repeat_count": 0.0,
- "routers_loss": 0.012898211367428303,
+ "routers_loss": 0.01400839351117611,
"skip_count": 1.0,
"step": 2872,
"text_loss": 0.17368625104427338
@@ -27301,13 +27301,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008710887756884947,
- "loss": 0.0088,
+ "loss": 0.0086,
"macro_f1": 0.3333333432674408,
"num_tokens": 4635885.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013437134912237525,
+ "routers_loss": 0.0014573842054232955,
"skip_count": 0.0,
"step": 2874,
"text_loss": 0.5138643383979797
@@ -27320,13 +27320,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0008708812656049225,
- "loss": 0.0091,
+ "loss": 0.009,
"macro_f1": 0.6666666865348816,
"num_tokens": 4639341.0,
"repeat_count": 0.0,
- "routers_loss": 0.002090727211907506,
+ "routers_loss": 0.002810224425047636,
"skip_count": 1.0,
"step": 2876,
"text_loss": 0.70310378074646
@@ -27341,11 +27341,11 @@
"f1_skip": 0.8571428656578064,
"grad_norm": 0.03564453125,
"learning_rate": 0.0008706736133989497,
- "loss": 0.0107,
+ "loss": 0.0105,
"macro_f1": 0.9449735879898071,
"num_tokens": 4642163.0,
"repeat_count": 2.0,
- "routers_loss": 0.030176319181919098,
+ "routers_loss": 0.029783209785819054,
"skip_count": 4.0,
"step": 2878,
"text_loss": 0.26898008584976196
@@ -27358,13 +27358,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.04150390625,
"learning_rate": 0.0008704658191501491,
- "loss": 0.0091,
+ "loss": 0.0095,
"macro_f1": 0.3333333432674408,
"num_tokens": 4645858.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009633690933696926,
+ "routers_loss": 0.0009193966398015618,
"skip_count": 0.0,
"step": 2880,
"text_loss": 0.6047570705413818
@@ -27377,13 +27377,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.060302734375,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0008702578829381475,
"loss": 0.0131,
"macro_f1": 0.8814815282821655,
"num_tokens": 4649237.0,
"repeat_count": 2.0,
- "routers_loss": 0.0568491593003273,
+ "routers_loss": 0.05698608607053757,
"skip_count": 4.0,
"step": 2882,
"text_loss": 0.10695219784975052
@@ -27396,13 +27396,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0306396484375,
+ "grad_norm": 0.0311279296875,
"learning_rate": 0.0008700498048426269,
- "loss": 0.0082,
+ "loss": 0.0083,
"macro_f1": 0.3333333432674408,
"num_tokens": 4652362.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012279651127755642,
+ "routers_loss": 0.0011786938412114978,
"skip_count": 0.0,
"step": 2884,
"text_loss": 0.4442957937717438
@@ -27415,13 +27415,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.046142578125,
"learning_rate": 0.0008698415849433229,
- "loss": 0.0097,
+ "loss": 0.0092,
"macro_f1": 0.5492662787437439,
"num_tokens": 4655616.0,
"repeat_count": 2.0,
- "routers_loss": 0.02166076935827732,
+ "routers_loss": 0.02142646163702011,
"skip_count": 0.0,
"step": 2886,
"text_loss": 0.5820964574813843
@@ -27434,13 +27434,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0008696332233200262,
- "loss": 0.012,
+ "loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 4659294.0,
"repeat_count": 0.0,
- "routers_loss": 0.003944257274270058,
+ "routers_loss": 0.004038636106997728,
"skip_count": 0.0,
"step": 2888,
"text_loss": 0.11847645789384842
@@ -27453,13 +27453,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.0478515625,
"learning_rate": 0.0008694247200525806,
- "loss": 0.0092,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 4662512.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013393335975706577,
+ "routers_loss": 0.0013256469974294305,
"skip_count": 0.0,
"step": 2890,
"text_loss": 0.4873582720756531
@@ -27472,13 +27472,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0008692160752208856,
- "loss": 0.0128,
+ "loss": 0.0129,
"macro_f1": 0.3272727429866791,
"num_tokens": 4666190.0,
"repeat_count": 0.0,
- "routers_loss": 0.0443510003387928,
+ "routers_loss": 0.04477972164750099,
"skip_count": 1.0,
"step": 2892,
"text_loss": 0.44243401288986206
@@ -27491,13 +27491,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.083984375,
+ "grad_norm": 0.09521484375,
"learning_rate": 0.0008690072889048941,
- "loss": 0.0125,
+ "loss": 0.0127,
"macro_f1": 1.0,
"num_tokens": 4668884.0,
"repeat_count": 1.0,
- "routers_loss": 0.0047337980940938,
+ "routers_loss": 0.004407547414302826,
"skip_count": 2.0,
"step": 2894,
"text_loss": 0.6847127079963684
@@ -27510,13 +27510,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.041015625,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0008687983611846133,
- "loss": 0.0082,
+ "loss": 0.008,
"macro_f1": 0.6666666865348816,
"num_tokens": 4672093.0,
"repeat_count": 0.0,
- "routers_loss": 0.0055244253017008305,
+ "routers_loss": 0.005245382897555828,
"skip_count": 1.0,
"step": 2896,
"text_loss": 0.25583332777023315
@@ -27529,13 +27529,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.0458984375,
"learning_rate": 0.0008685892921401049,
- "loss": 0.011,
+ "loss": 0.0108,
"macro_f1": 0.3333333432674408,
"num_tokens": 4674917.0,
"repeat_count": 0.0,
- "routers_loss": 0.001250729663297534,
+ "routers_loss": 0.0010470855049788952,
"skip_count": 0.0,
"step": 2898,
"text_loss": 0.41998377442359924
@@ -27548,13 +27548,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0008683800818514844,
- "loss": 0.0061,
+ "loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 4677739.0,
"repeat_count": 0.0,
- "routers_loss": 0.00974183902144432,
+ "routers_loss": 0.009026622399687767,
"skip_count": 2.0,
"step": 2900,
"text_loss": 0.303053081035614
@@ -27567,13 +27567,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.09619140625,
"learning_rate": 0.0008681707303989215,
- "loss": 0.0111,
+ "loss": 0.0108,
"macro_f1": 0.3333333432674408,
"num_tokens": 4680721.0,
"repeat_count": 0.0,
- "routers_loss": 0.004882345907390118,
+ "routers_loss": 0.004500916693359613,
"skip_count": 0.0,
"step": 2902,
"text_loss": 0.5573288798332214
@@ -27586,13 +27586,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0008679612378626404,
"loss": 0.0098,
"macro_f1": 0.6666666865348816,
"num_tokens": 4683339.0,
"repeat_count": 0.0,
- "routers_loss": 0.00568242697045207,
+ "routers_loss": 0.005047840531915426,
"skip_count": 1.0,
"step": 2904,
"text_loss": 0.321353554725647
@@ -27605,13 +27605,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0306396484375,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0008677516043229187,
- "loss": 0.0082,
+ "loss": 0.0083,
"macro_f1": 0.3272727429866791,
"num_tokens": 4686453.0,
"repeat_count": 0.0,
- "routers_loss": 0.010831202380359173,
+ "routers_loss": 0.010256914421916008,
"skip_count": 1.0,
"step": 2906,
"text_loss": 0.4300784468650818
@@ -27624,13 +27624,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.05615234375,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.0008675418298600883,
- "loss": 0.0087,
+ "loss": 0.0083,
"macro_f1": 0.6666666865348816,
"num_tokens": 4689645.0,
"repeat_count": 1.0,
- "routers_loss": 0.00235295994207263,
+ "routers_loss": 0.0022669637110084295,
"skip_count": 0.0,
"step": 2908,
"text_loss": 0.5064885020256042
@@ -27643,13 +27643,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0008673319145545358,
"loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 4692320.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011642680037766695,
+ "routers_loss": 0.0011188550852239132,
"skip_count": 0.0,
"step": 2910,
"text_loss": 0.7114819884300232
@@ -27662,13 +27662,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0008671218584867003,
- "loss": 0.0104,
+ "loss": 0.0102,
"macro_f1": 0.6666666865348816,
"num_tokens": 4695116.0,
"repeat_count": 0.0,
- "routers_loss": 0.00278888875618577,
+ "routers_loss": 0.002966561820358038,
"skip_count": 2.0,
"step": 2912,
"text_loss": 0.5662392973899841
@@ -27681,13 +27681,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.049560546875,
+ "grad_norm": 0.047607421875,
"learning_rate": 0.0008669116617370762,
- "loss": 0.008,
+ "loss": 0.0081,
"macro_f1": 0.3333333432674408,
"num_tokens": 4698040.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014630162622779608,
+ "routers_loss": 0.0012894890969619155,
"skip_count": 0.0,
"step": 2914,
"text_loss": 0.718977689743042
@@ -27700,13 +27700,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0008667013243862111,
- "loss": 0.0159,
+ "loss": 0.0162,
"macro_f1": 0.3333333432674408,
"num_tokens": 4700963.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011393720051273704,
+ "routers_loss": 0.0007232456118799746,
"skip_count": 0.0,
"step": 2916,
"text_loss": 0.3447718024253845
@@ -27719,13 +27719,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02978515625,
+ "grad_norm": 0.0289306640625,
"learning_rate": 0.000866490846514707,
- "loss": 0.0072,
+ "loss": 0.0075,
"macro_f1": 0.3272727429866791,
"num_tokens": 4704471.0,
"repeat_count": 1.0,
- "routers_loss": 0.014218449592590332,
+ "routers_loss": 0.015166680328547955,
"skip_count": 0.0,
"step": 2918,
"text_loss": 0.454946368932724
@@ -27738,13 +27738,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.052978515625,
+ "grad_norm": 0.04736328125,
"learning_rate": 0.000866280228203219,
"loss": 0.0073,
"macro_f1": 1.0,
"num_tokens": 4707238.0,
"repeat_count": 1.0,
- "routers_loss": 0.005367610137909651,
+ "routers_loss": 0.0061312485486269,
"skip_count": 1.0,
"step": 2920,
"text_loss": 0.721788227558136
@@ -27757,13 +27757,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048828125,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.0008660694695324564,
- "loss": 0.0124,
+ "loss": 0.0125,
"macro_f1": 0.3333333432674408,
"num_tokens": 4711323.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020303199999034405,
+ "routers_loss": 0.00169933564029634,
"skip_count": 0.0,
"step": 2922,
"text_loss": 0.7562121748924255
@@ -27776,13 +27776,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06201171875,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0008658585705831829,
- "loss": 0.0123,
+ "loss": 0.0128,
"macro_f1": 0.3333333432674408,
"num_tokens": 4714417.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022230520844459534,
+ "routers_loss": 0.0022731393110007048,
"skip_count": 0.0,
"step": 2924,
"text_loss": 0.5726147890090942
@@ -27795,13 +27795,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.06787109375,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0008656475314362148,
- "loss": 0.0133,
+ "loss": 0.0131,
"macro_f1": 0.8817967176437378,
"num_tokens": 4717445.0,
"repeat_count": 2.0,
- "routers_loss": 0.06414645165205002,
+ "routers_loss": 0.06477782875299454,
"skip_count": 3.0,
"step": 2926,
"text_loss": 0.4505867660045624
@@ -27814,13 +27814,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 1.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.0625,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0008654363521724229,
- "loss": 0.0128,
+ "loss": 0.0129,
"macro_f1": 0.9449735879898071,
"num_tokens": 4722253.0,
"repeat_count": 2.0,
- "routers_loss": 0.022727061063051224,
+ "routers_loss": 0.027405790984630585,
"skip_count": 4.0,
"step": 2928,
"text_loss": 0.24767601490020752
@@ -27833,13 +27833,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.0537109375,
"learning_rate": 0.0008652250328727315,
- "loss": 0.0114,
+ "loss": 0.0112,
"macro_f1": 0.6666666865348816,
"num_tokens": 4725465.0,
"repeat_count": 0.0,
- "routers_loss": 0.006181784905493259,
+ "routers_loss": 0.006544729229062796,
"skip_count": 2.0,
"step": 2930,
"text_loss": 0.4478724002838135
@@ -27852,13 +27852,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.0517578125,
"learning_rate": 0.0008650135736181184,
- "loss": 0.0133,
+ "loss": 0.0134,
"macro_f1": 0.6666666865348816,
"num_tokens": 4729213.0,
"repeat_count": 1.0,
- "routers_loss": 0.005527070257812738,
+ "routers_loss": 0.0055119614116847515,
"skip_count": 0.0,
"step": 2932,
"text_loss": 0.6749323010444641
@@ -27871,13 +27871,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05517578125,
+ "grad_norm": 0.045166015625,
"learning_rate": 0.0008648019744896154,
- "loss": 0.0102,
+ "loss": 0.0101,
"macro_f1": 0.3333333432674408,
"num_tokens": 4732280.0,
"repeat_count": 0.0,
- "routers_loss": 0.008868738077580929,
+ "routers_loss": 0.008374541997909546,
"skip_count": 0.0,
"step": 2934,
"text_loss": 0.4647359251976013
@@ -27890,13 +27890,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.057373046875,
+ "grad_norm": 0.06201171875,
"learning_rate": 0.0008645902355683077,
- "loss": 0.0089,
+ "loss": 0.0091,
"macro_f1": 0.6595745086669922,
"num_tokens": 4736244.0,
"repeat_count": 1.0,
- "routers_loss": 0.07285884022712708,
+ "routers_loss": 0.068686343729496,
"skip_count": 4.0,
"step": 2936,
"text_loss": 0.5356017351150513
@@ -27909,13 +27909,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.042236328125,
"learning_rate": 0.0008643783569353339,
- "loss": 0.0072,
+ "loss": 0.007,
"macro_f1": 0.6666666865348816,
"num_tokens": 4739810.0,
"repeat_count": 2.0,
- "routers_loss": 0.019306030124425888,
+ "routers_loss": 0.017954571172595024,
"skip_count": 0.0,
"step": 2938,
"text_loss": 0.3145926296710968
@@ -27928,13 +27928,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.054443359375,
"learning_rate": 0.0008641663386718863,
- "loss": 0.0084,
+ "loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 4742720.0,
"repeat_count": 0.0,
- "routers_loss": 0.00626454409211874,
+ "routers_loss": 0.006261351052671671,
"skip_count": 1.0,
"step": 2940,
"text_loss": 0.3200613856315613
@@ -27949,11 +27949,11 @@
"f1_skip": 1.0,
"grad_norm": 0.04150390625,
"learning_rate": 0.0008639541808592109,
- "loss": 0.0091,
+ "loss": 0.0093,
"macro_f1": 1.0,
"num_tokens": 4745870.0,
"repeat_count": 1.0,
- "routers_loss": 0.0019172134343534708,
+ "routers_loss": 0.0025341357104480267,
"skip_count": 1.0,
"step": 2942,
"text_loss": 0.5020416378974915
@@ -27968,11 +27968,11 @@
"f1_skip": 1.0,
"grad_norm": 0.025634765625,
"learning_rate": 0.0008637418835786067,
- "loss": 0.0095,
+ "loss": 0.0094,
"macro_f1": 0.6666666865348816,
"num_tokens": 4748943.0,
"repeat_count": 0.0,
- "routers_loss": 0.009745351038873196,
+ "routers_loss": 0.008970048278570175,
"skip_count": 2.0,
"step": 2944,
"text_loss": 0.14517110586166382
@@ -27985,13 +27985,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.043701171875,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.0008635294469114265,
- "loss": 0.011,
+ "loss": 0.0112,
"macro_f1": 0.3333333432674408,
"num_tokens": 4751360.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020624736789613962,
+ "routers_loss": 0.002133632078766823,
"skip_count": 0.0,
"step": 2946,
"text_loss": 0.5367856025695801
@@ -28004,13 +28004,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.08837890625,
"learning_rate": 0.0008633168709390766,
- "loss": 0.0118,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 4754403.0,
"repeat_count": 0.0,
- "routers_loss": 0.001082106726244092,
+ "routers_loss": 0.0011866620043292642,
"skip_count": 0.0,
"step": 2948,
"text_loss": 0.38302522897720337
@@ -28023,13 +28023,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.037109375,
"learning_rate": 0.0008631041557430163,
- "loss": 0.0061,
+ "loss": 0.0058,
"macro_f1": 0.6666666865348816,
"num_tokens": 4757867.0,
"repeat_count": 2.0,
- "routers_loss": 0.0026527612935751677,
+ "routers_loss": 0.0026854004245251417,
"skip_count": 0.0,
"step": 2950,
"text_loss": 0.43433454632759094
@@ -28042,13 +28042,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.05859375,
"learning_rate": 0.0008628913014047585,
"loss": 0.0102,
"macro_f1": 0.3333333432674408,
"num_tokens": 4761171.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027245471719652414,
+ "routers_loss": 0.002433479530736804,
"skip_count": 0.0,
"step": 2952,
"text_loss": 0.4725971519947052
@@ -28061,13 +28061,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0286865234375,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0008626783080058696,
- "loss": 0.0065,
+ "loss": 0.0066,
"macro_f1": 0.3272727429866791,
"num_tokens": 4764752.0,
"repeat_count": 1.0,
- "routers_loss": 0.01764744706451893,
+ "routers_loss": 0.017182493582367897,
"skip_count": 0.0,
"step": 2954,
"text_loss": 0.460641473531723
@@ -28080,13 +28080,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0859375,
+ "grad_norm": 0.12353515625,
"learning_rate": 0.0008624651756279687,
- "loss": 0.0196,
+ "loss": 0.0198,
"macro_f1": 0.3333333432674408,
"num_tokens": 4767453.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019560824148356915,
+ "routers_loss": 0.0018134774873033166,
"skip_count": 0.0,
"step": 2956,
"text_loss": 0.4091459810733795
@@ -28099,13 +28099,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
- "grad_norm": 0.051025390625,
+ "grad_norm": 0.053466796875,
"learning_rate": 0.000862251904352729,
"loss": 0.0108,
"macro_f1": 0.9259259104728699,
"num_tokens": 4771110.0,
"repeat_count": 3.0,
- "routers_loss": 0.03031078353524208,
+ "routers_loss": 0.0365753099322319,
"skip_count": 3.0,
"step": 2958,
"text_loss": 0.22408585250377655
@@ -28118,13 +28118,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.000862038494261876,
"loss": 0.0109,
"macro_f1": 0.3272727429866791,
"num_tokens": 4774464.0,
"repeat_count": 0.0,
- "routers_loss": 0.024790454655885696,
+ "routers_loss": 0.024343067780137062,
"skip_count": 1.0,
"step": 2960,
"text_loss": 0.16483014822006226
@@ -28137,13 +28137,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052490234375,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0008618249454371891,
- "loss": 0.0099,
+ "loss": 0.01,
"macro_f1": 0.3333333432674408,
"num_tokens": 4777894.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008704765350557864,
+ "routers_loss": 0.0008310087723657489,
"skip_count": 0.0,
"step": 2962,
"text_loss": 0.5573428869247437
@@ -28156,13 +28156,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0008616112579605006,
- "loss": 0.0116,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 4781116.0,
"repeat_count": 0.0,
- "routers_loss": 0.0066874073818326,
+ "routers_loss": 0.0065494864247739315,
"skip_count": 0.0,
"step": 2964,
"text_loss": 0.18816794455051422
@@ -28175,13 +28175,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.0008613974319136957,
- "loss": 0.0091,
+ "loss": 0.009,
"macro_f1": 0.3333333432674408,
"num_tokens": 4784886.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021798228845000267,
+ "routers_loss": 0.0019726944155991077,
"skip_count": 0.0,
"step": 2966,
"text_loss": 0.5097305774688721
@@ -28194,13 +28194,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.076171875,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0008611834673787134,
"loss": 0.0118,
"macro_f1": 0.3333333432674408,
"num_tokens": 4787563.0,
"repeat_count": 0.0,
- "routers_loss": 0.0063707553781569,
+ "routers_loss": 0.006327496841549873,
"skip_count": 0.0,
"step": 2968,
"text_loss": 0.6953814029693604
@@ -28213,13 +28213,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.5,
"f1_skip": 1.0,
- "grad_norm": 0.0595703125,
+ "grad_norm": 0.056884765625,
"learning_rate": 0.0008609693644375449,
- "loss": 0.0088,
+ "loss": 0.0086,
"macro_f1": 0.8200000524520874,
"num_tokens": 4790421.0,
"repeat_count": 3.0,
- "routers_loss": 0.044509731233119965,
+ "routers_loss": 0.042896661907434464,
"skip_count": 1.0,
"step": 2970,
"text_loss": 0.2573051154613495
@@ -28227,18 +28227,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 1.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 13.953331376577633,
- "f1_execute": 0.9795917868614197,
+ "f1_execute": 1.0,
"f1_repeat": 1.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.1640625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.14453125,
"learning_rate": 0.000860755123172235,
- "loss": 0.01,
- "macro_f1": 0.8820862174034119,
+ "loss": 0.0096,
+ "macro_f1": 1.0,
"num_tokens": 4793786.0,
"repeat_count": 2.0,
- "routers_loss": 0.01667599380016327,
+ "routers_loss": 0.013228793628513813,
"skip_count": 1.0,
"step": 2972,
"text_loss": 0.46614497900009155
@@ -28251,13 +28251,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.0296630859375,
"learning_rate": 0.0008605407436648815,
- "loss": 0.0069,
+ "loss": 0.007,
"macro_f1": 0.6666666865348816,
"num_tokens": 4796864.0,
"repeat_count": 0.0,
- "routers_loss": 0.008433761075139046,
+ "routers_loss": 0.007294759154319763,
"skip_count": 2.0,
"step": 2974,
"text_loss": 0.21555091440677643
@@ -28270,13 +28270,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.057861328125,
"learning_rate": 0.0008603262259976348,
- "loss": 0.0131,
+ "loss": 0.0129,
"macro_f1": 1.0,
"num_tokens": 4800080.0,
"repeat_count": 1.0,
- "routers_loss": 0.002439796691760421,
+ "routers_loss": 0.0024024227168411016,
"skip_count": 5.0,
"step": 2976,
"text_loss": 0.7855485081672668
@@ -28289,13 +28289,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05126953125,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0008601115702526987,
- "loss": 0.0112,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 4802899.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015027766348794103,
+ "routers_loss": 0.001433031284250319,
"skip_count": 0.0,
"step": 2978,
"text_loss": 0.6777765154838562
@@ -28308,13 +28308,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06103515625,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.0008598967765123293,
- "loss": 0.0091,
+ "loss": 0.0088,
"macro_f1": 0.3333333432674408,
"num_tokens": 4805835.0,
"repeat_count": 0.0,
- "routers_loss": 0.003235677955672145,
+ "routers_loss": 0.003073975909501314,
"skip_count": 0.0,
"step": 2980,
"text_loss": 0.5926910638809204
@@ -28322,18 +28322,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.5,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 14.0,
- "f1_execute": 0.9090908765792847,
- "f1_repeat": 0.6666666865348816,
+ "f1_execute": 0.9333333373069763,
+ "f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.052734375,
+ "grad_norm": 0.05322265625,
"learning_rate": 0.0008596818448588364,
- "loss": 0.0141,
- "macro_f1": 0.7474747896194458,
+ "loss": 0.0139,
+ "macro_f1": 0.8666667342185974,
"num_tokens": 4809028.0,
"repeat_count": 1.0,
- "routers_loss": 0.063179150223732,
+ "routers_loss": 0.06438573449850082,
"skip_count": 6.0,
"step": 2982,
"text_loss": 0.23975612223148346
@@ -28346,13 +28346,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0299072265625,
+ "grad_norm": 0.0302734375,
"learning_rate": 0.0008594667753745821,
- "loss": 0.0055,
+ "loss": 0.0054,
"macro_f1": 0.3272727429866791,
"num_tokens": 4812831.0,
"repeat_count": 0.0,
- "routers_loss": 0.015444152988493443,
+ "routers_loss": 0.014817612245678902,
"skip_count": 1.0,
"step": 2984,
"text_loss": 0.17292268574237823
@@ -28365,13 +28365,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.060546875,
+ "grad_norm": 0.07421875,
"learning_rate": 0.0008592515681419813,
- "loss": 0.0079,
+ "loss": 0.0078,
"macro_f1": 0.5492662787437439,
"num_tokens": 4816005.0,
"repeat_count": 2.0,
- "routers_loss": 0.02485196851193905,
+ "routers_loss": 0.025407327339053154,
"skip_count": 0.0,
"step": 2986,
"text_loss": 0.6403061151504517
@@ -28384,13 +28384,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0008590362232435018,
- "loss": 0.0102,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 4818901.0,
"repeat_count": 0.0,
- "routers_loss": 0.006175600457936525,
+ "routers_loss": 0.006826757453382015,
"skip_count": 0.0,
"step": 2988,
"text_loss": 0.2572069466114044
@@ -28403,13 +28403,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041748046875,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0008588207407616644,
- "loss": 0.0085,
+ "loss": 0.0086,
"macro_f1": 0.3333333432674408,
"num_tokens": 4823120.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008576468680985272,
+ "routers_loss": 0.0009054148104041815,
"skip_count": 0.0,
"step": 2990,
"text_loss": 0.4827076196670532
@@ -28422,13 +28422,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02392578125,
+ "grad_norm": 0.0247802734375,
"learning_rate": 0.0008586051207790422,
- "loss": 0.0059,
+ "loss": 0.0055,
"macro_f1": 0.3333333432674408,
"num_tokens": 4825774.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011548360344022512,
+ "routers_loss": 0.0012294676853343844,
"skip_count": 0.0,
"step": 2992,
"text_loss": 0.40157821774482727
@@ -28441,13 +28441,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.056396484375,
+ "grad_norm": 0.052734375,
"learning_rate": 0.0008583893633782612,
- "loss": 0.0085,
+ "loss": 0.0084,
"macro_f1": 0.5492662787437439,
"num_tokens": 4828841.0,
"repeat_count": 0.0,
- "routers_loss": 0.01307896338403225,
+ "routers_loss": 0.011474622413516045,
"skip_count": 2.0,
"step": 2994,
"text_loss": 0.14842072129249573
@@ -28460,13 +28460,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0615234375,
+ "grad_norm": 0.058837890625,
"learning_rate": 0.0008581734686419999,
"loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 4831458.0,
"repeat_count": 0.0,
- "routers_loss": 0.009716883301734924,
+ "routers_loss": 0.009154081344604492,
"skip_count": 2.0,
"step": 2996,
"text_loss": 0.365400105714798
@@ -28479,13 +28479,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.00085795743665299,
"loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4834609.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026114562060683966,
+ "routers_loss": 0.002899336162954569,
"skip_count": 0.0,
"step": 2998,
"text_loss": 0.5574684143066406
@@ -28498,13 +28498,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052001953125,
+ "grad_norm": 0.0517578125,
"learning_rate": 0.0008577412674940152,
"loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4838324.0,
"repeat_count": 0.0,
- "routers_loss": 0.003787368768826127,
+ "routers_loss": 0.0034664268605411053,
"skip_count": 0.0,
"step": 3000,
"text_loss": 0.6752855777740479
@@ -28517,13 +28517,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0281982421875,
+ "grad_norm": 0.03466796875,
"learning_rate": 0.0008575249612479117,
"loss": 0.0127,
"macro_f1": 0.6666666865348816,
"num_tokens": 4841877.0,
"repeat_count": 0.0,
- "routers_loss": 0.004202218260616064,
+ "routers_loss": 0.0036425739526748657,
"skip_count": 2.0,
"step": 3002,
"text_loss": 0.6332980394363403
@@ -28536,13 +28536,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0517578125,
+ "grad_norm": 0.048095703125,
"learning_rate": 0.0008573085179975685,
- "loss": 0.0066,
+ "loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 4845840.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012371218763291836,
+ "routers_loss": 0.0013783496106043458,
"skip_count": 0.0,
"step": 3004,
"text_loss": 0.4219617545604706
@@ -28555,13 +28555,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0008570919378259274,
"loss": 0.007,
"macro_f1": 0.6666666865348816,
"num_tokens": 4848766.0,
"repeat_count": 0.0,
- "routers_loss": 0.005013706628233194,
+ "routers_loss": 0.004823608323931694,
"skip_count": 1.0,
"step": 3006,
"text_loss": 0.7987180948257446
@@ -28574,13 +28574,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.029052734375,
+ "grad_norm": 0.0302734375,
"learning_rate": 0.000856875220815982,
- "loss": 0.0069,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 4852310.0,
"repeat_count": 0.0,
- "routers_loss": 0.001336073037236929,
+ "routers_loss": 0.0014760984340682626,
"skip_count": 0.0,
"step": 3008,
"text_loss": 0.35592713952064514
@@ -28593,13 +28593,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0008566583670507788,
"loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 4856146.0,
"repeat_count": 0.0,
- "routers_loss": 0.003256940981373191,
+ "routers_loss": 0.0031717263627797365,
"skip_count": 1.0,
"step": 3010,
"text_loss": 0.19379083812236786
@@ -28612,13 +28612,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041748046875,
+ "grad_norm": 0.0517578125,
"learning_rate": 0.0008564413766134164,
- "loss": 0.0091,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 4859386.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038389062974601984,
+ "routers_loss": 0.003361492184922099,
"skip_count": 0.0,
"step": 3012,
"text_loss": 0.39129266142845154
@@ -28631,13 +28631,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052734375,
+ "grad_norm": 0.048583984375,
"learning_rate": 0.0008562242495870463,
- "loss": 0.0119,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 4862661.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007799214799888432,
+ "routers_loss": 0.0010563990799710155,
"skip_count": 0.0,
"step": 3014,
"text_loss": 0.5966938734054565
@@ -28650,13 +28650,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0213623046875,
+ "grad_norm": 0.0234375,
"learning_rate": 0.0008560069860548716,
- "loss": 0.006,
+ "loss": 0.0059,
"macro_f1": 0.3333333432674408,
"num_tokens": 4865410.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010348912328481674,
+ "routers_loss": 0.001233913702890277,
"skip_count": 0.0,
"step": 3016,
"text_loss": 0.3386077880859375
@@ -28669,13 +28669,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056884765625,
+ "grad_norm": 0.055419921875,
"learning_rate": 0.0008557895861001484,
- "loss": 0.006,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 4868931.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018167694797739387,
+ "routers_loss": 0.0018066301709041,
"skip_count": 0.0,
"step": 3018,
"text_loss": 0.5222050547599792
@@ -28688,13 +28688,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0008555720498061845,
- "loss": 0.0078,
+ "loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 4873492.0,
"repeat_count": 0.0,
- "routers_loss": 0.005788089707493782,
+ "routers_loss": 0.0050385501235723495,
"skip_count": 1.0,
"step": 3020,
"text_loss": 0.4558849334716797
@@ -28707,13 +28707,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0008553543772563403,
- "loss": 0.0092,
+ "loss": 0.009,
"macro_f1": 0.3333333432674408,
"num_tokens": 4877026.0,
"repeat_count": 0.0,
- "routers_loss": 0.004194240085780621,
+ "routers_loss": 0.004828717093914747,
"skip_count": 0.0,
"step": 3022,
"text_loss": 0.36598992347717285
@@ -28726,13 +28726,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 1.0,
"f1_skip": 0.888888955116272,
- "grad_norm": 0.05712890625,
+ "grad_norm": 0.06103515625,
"learning_rate": 0.0008551365685340285,
"loss": 0.0084,
"macro_f1": 0.9555556178092957,
"num_tokens": 4879655.0,
"repeat_count": 1.0,
- "routers_loss": 0.019211066886782646,
+ "routers_loss": 0.02049369551241398,
"skip_count": 5.0,
"step": 3024,
"text_loss": 0.5069093704223633
@@ -28745,13 +28745,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0008549186237227138,
- "loss": 0.0092,
+ "loss": 0.0088,
"macro_f1": 0.8823530077934265,
"num_tokens": 4882606.0,
"repeat_count": 1.0,
- "routers_loss": 0.041074834764003754,
+ "routers_loss": 0.03947242721915245,
"skip_count": 2.0,
"step": 3026,
"text_loss": 0.2600715458393097
@@ -28764,13 +28764,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.031982421875,
+ "grad_norm": 0.030029296875,
"learning_rate": 0.0008547005429059128,
- "loss": 0.0075,
+ "loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 4885246.0,
"repeat_count": 2.0,
- "routers_loss": 0.0027008953038603067,
+ "routers_loss": 0.0026363315992057323,
"skip_count": 0.0,
"step": 3028,
"text_loss": 0.37642326951026917
@@ -28783,13 +28783,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.046630859375,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0008544823261671948,
- "loss": 0.0074,
+ "loss": 0.0073,
"macro_f1": 0.3333333432674408,
"num_tokens": 4888109.0,
"repeat_count": 0.0,
- "routers_loss": 0.00402502017095685,
+ "routers_loss": 0.003858231008052826,
"skip_count": 0.0,
"step": 3030,
"text_loss": 0.5875385999679565
@@ -28802,13 +28802,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0008542639735901804,
- "loss": 0.007,
+ "loss": 0.0067,
"macro_f1": 1.0,
"num_tokens": 4891168.0,
"repeat_count": 1.0,
- "routers_loss": 0.00628731120377779,
+ "routers_loss": 0.004789089784026146,
"skip_count": 1.0,
"step": 3032,
"text_loss": 0.6417325139045715
@@ -28821,32 +28821,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.0008540454852585434,
- "loss": 0.0117,
+ "loss": 0.0115,
"macro_f1": 0.6666666865348816,
"num_tokens": 4894355.0,
"repeat_count": 0.0,
- "routers_loss": 0.007284072227776051,
+ "routers_loss": 0.007334680762141943,
"skip_count": 2.0,
"step": 3034,
"text_loss": 0.23697198927402496
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6666666865348816,
- "avg_layers": 26.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 14.253595538597006,
- "f1_execute": 0.9803921580314636,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.800000011920929,
- "grad_norm": 0.033203125,
+ "f1_skip": 0.5,
+ "grad_norm": 0.034423828125,
"learning_rate": 0.0008538268612560084,
- "loss": 0.0059,
- "macro_f1": 0.5934640765190125,
+ "loss": 0.0058,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 4897543.0,
"repeat_count": 0.0,
- "routers_loss": 0.020328659564256668,
+ "routers_loss": 0.022096361964941025,
"skip_count": 3.0,
"step": 3036,
"text_loss": 0.1989550143480301
@@ -28859,13 +28859,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.047119140625,
"learning_rate": 0.0008536081016663527,
- "loss": 0.0102,
+ "loss": 0.0101,
"macro_f1": 1.0,
"num_tokens": 4900752.0,
"repeat_count": 1.0,
- "routers_loss": 0.002338571473956108,
+ "routers_loss": 0.0037680594250559807,
"skip_count": 2.0,
"step": 3038,
"text_loss": 0.5001366138458252
@@ -28878,13 +28878,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0008533892065734055,
- "loss": 0.0083,
+ "loss": 0.008,
"macro_f1": 0.6666666865348816,
"num_tokens": 4903581.0,
"repeat_count": 0.0,
- "routers_loss": 0.003033763263374567,
+ "routers_loss": 0.0032373068388551474,
"skip_count": 1.0,
"step": 3040,
"text_loss": 0.5019411444664001
@@ -28897,13 +28897,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.042724609375,
"learning_rate": 0.0008531701760610476,
- "loss": 0.012,
+ "loss": 0.0121,
"macro_f1": 1.0,
"num_tokens": 4907108.0,
"repeat_count": 1.0,
- "routers_loss": 0.00831629242748022,
+ "routers_loss": 0.0078013185411691666,
"skip_count": 2.0,
"step": 3042,
"text_loss": 0.3460627794265747
@@ -28916,13 +28916,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.04736328125,
+ "grad_norm": 0.04833984375,
"learning_rate": 0.000852951010213212,
- "loss": 0.0087,
+ "loss": 0.0089,
"macro_f1": 0.8200000524520874,
"num_tokens": 4911269.0,
"repeat_count": 1.0,
- "routers_loss": 0.03200878947973251,
+ "routers_loss": 0.03576689213514328,
"skip_count": 3.0,
"step": 3044,
"text_loss": 0.268994003534317
@@ -28935,13 +28935,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0283203125,
+ "grad_norm": 0.02685546875,
"learning_rate": 0.0008527317091138835,
- "loss": 0.0068,
+ "loss": 0.0066,
"macro_f1": 1.0,
"num_tokens": 4914203.0,
"repeat_count": 1.0,
- "routers_loss": 0.003899211063981056,
+ "routers_loss": 0.0032140621915459633,
"skip_count": 1.0,
"step": 3046,
"text_loss": 0.9998719692230225
@@ -28954,13 +28954,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.040771484375,
"learning_rate": 0.0008525122728470987,
"loss": 0.0102,
"macro_f1": 1.0,
"num_tokens": 4918562.0,
"repeat_count": 1.0,
- "routers_loss": 0.00883556716144085,
+ "routers_loss": 0.008559177629649639,
"skip_count": 3.0,
"step": 3048,
"text_loss": 0.3062439560890198
@@ -28973,13 +28973,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03173828125,
+ "grad_norm": 0.03125,
"learning_rate": 0.0008522927014969459,
- "loss": 0.0064,
+ "loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 4921940.0,
"repeat_count": 0.0,
- "routers_loss": 0.009054492227733135,
+ "routers_loss": 0.008735597133636475,
"skip_count": 2.0,
"step": 3050,
"text_loss": 0.3637430965900421
@@ -28992,13 +28992,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.05517578125,
"learning_rate": 0.0008520729951475652,
- "loss": 0.0082,
+ "loss": 0.0085,
"macro_f1": 0.3333333432674408,
"num_tokens": 4925416.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011907420121133327,
+ "routers_loss": 0.0012709591537714005,
"skip_count": 0.0,
"step": 3052,
"text_loss": 0.542036235332489
@@ -29011,13 +29011,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0008518531538831488,
"loss": 0.0096,
"macro_f1": 0.6666666865348816,
"num_tokens": 4928695.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013618353987112641,
+ "routers_loss": 0.0010660928674042225,
"skip_count": 1.0,
"step": 3054,
"text_loss": 0.43144503235816956
@@ -29030,13 +29030,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.060546875,
+ "grad_norm": 0.059326171875,
"learning_rate": 0.00085163317778794,
- "loss": 0.0102,
+ "loss": 0.0096,
"macro_f1": 0.6666666865348816,
"num_tokens": 4931504.0,
"repeat_count": 0.0,
- "routers_loss": 0.004202015232294798,
+ "routers_loss": 0.004558971151709557,
"skip_count": 2.0,
"step": 3056,
"text_loss": 0.5257010459899902
@@ -29049,32 +29049,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.0008514130669462341,
- "loss": 0.0109,
+ "loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 4934935.0,
"repeat_count": 0.0,
- "routers_loss": 0.01060314942151308,
+ "routers_loss": 0.010774781927466393,
"skip_count": 2.0,
"step": 3058,
"text_loss": 0.26061776280403137
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.0,
"acc_skip": 1.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 14.366304666862343,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.0390625,
"learning_rate": 0.0008511928214423782,
"loss": 0.0103,
- "macro_f1": 1.0,
+ "macro_f1": 0.6601307392120361,
"num_tokens": 4938047.0,
"repeat_count": 1.0,
- "routers_loss": 0.012400983832776546,
+ "routers_loss": 0.014763157814741135,
"skip_count": 2.0,
"step": 3060,
"text_loss": 0.2856905460357666
@@ -29087,13 +29087,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.046875,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0008509724413607705,
"loss": 0.0087,
"macro_f1": 0.6666666865348816,
"num_tokens": 4941041.0,
"repeat_count": 1.0,
- "routers_loss": 0.004353851079940796,
+ "routers_loss": 0.004613345488905907,
"skip_count": 0.0,
"step": 3062,
"text_loss": 0.2870287001132965
@@ -29106,13 +29106,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.053955078125,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0008507519267858612,
- "loss": 0.0148,
+ "loss": 0.015,
"macro_f1": 1.0,
"num_tokens": 4944708.0,
"repeat_count": 1.0,
- "routers_loss": 0.009858032688498497,
+ "routers_loss": 0.008584189228713512,
"skip_count": 2.0,
"step": 3064,
"text_loss": 0.15828095376491547
@@ -29125,13 +29125,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0306396484375,
+ "grad_norm": 0.029052734375,
"learning_rate": 0.0008505312778021519,
"loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 4948295.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016502789221704006,
+ "routers_loss": 0.0014670816017314792,
"skip_count": 0.0,
"step": 3066,
"text_loss": 0.36697930097579956
@@ -29144,13 +29144,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0008503104944941958,
- "loss": 0.0108,
+ "loss": 0.0107,
"macro_f1": 0.6666666865348816,
"num_tokens": 4951983.0,
"repeat_count": 0.0,
- "routers_loss": 0.00573746208101511,
+ "routers_loss": 0.005348859820514917,
"skip_count": 2.0,
"step": 3068,
"text_loss": 0.21612997353076935
@@ -29163,13 +29163,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0008500895769465972,
- "loss": 0.0113,
+ "loss": 0.0111,
"macro_f1": 0.3333333432674408,
"num_tokens": 4955023.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012014979729428887,
+ "routers_loss": 0.0013203793205320835,
"skip_count": 0.0,
"step": 3070,
"text_loss": 0.9757798314094543
@@ -29182,13 +29182,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.0478515625,
"learning_rate": 0.0008498685252440124,
- "loss": 0.0067,
+ "loss": 0.0065,
"macro_f1": 0.3333333432674408,
"num_tokens": 4957600.0,
"repeat_count": 0.0,
- "routers_loss": 0.006400141399353743,
+ "routers_loss": 0.006907356437295675,
"skip_count": 0.0,
"step": 3072,
"text_loss": 0.356107234954834
@@ -29201,13 +29201,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.046630859375,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0008496473394711487,
- "loss": 0.0117,
+ "loss": 0.0116,
"macro_f1": 0.6666666865348816,
"num_tokens": 4960746.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030972862150520086,
+ "routers_loss": 0.0027704904787242413,
"skip_count": 1.0,
"step": 3074,
"text_loss": 0.6812908053398132
@@ -29220,13 +29220,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05517578125,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0008494260197127649,
- "loss": 0.0092,
+ "loss": 0.0093,
"macro_f1": 0.6666666865348816,
"num_tokens": 4963845.0,
"repeat_count": 0.0,
- "routers_loss": 0.004087577573955059,
+ "routers_loss": 0.0036796489730477333,
"skip_count": 2.0,
"step": 3076,
"text_loss": 0.7215370535850525
@@ -29239,13 +29239,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.0556640625,
"learning_rate": 0.0008492045660536712,
- "loss": 0.0085,
+ "loss": 0.0084,
"macro_f1": 0.6666666865348816,
"num_tokens": 4966887.0,
"repeat_count": 0.0,
- "routers_loss": 0.003797230776399374,
+ "routers_loss": 0.0037137691397219896,
"skip_count": 1.0,
"step": 3078,
"text_loss": 0.8700299859046936
@@ -29258,13 +29258,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0008489829785787291,
- "loss": 0.0081,
+ "loss": 0.0078,
"macro_f1": 0.8823530077934265,
"num_tokens": 4969859.0,
"repeat_count": 1.0,
- "routers_loss": 0.020377423614263535,
+ "routers_loss": 0.016492314636707306,
"skip_count": 2.0,
"step": 3080,
"text_loss": 0.6520360112190247
@@ -29277,13 +29277,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.043701171875,
"learning_rate": 0.0008487612573728513,
- "loss": 0.0096,
+ "loss": 0.0094,
"macro_f1": 0.6666666865348816,
"num_tokens": 4972628.0,
"repeat_count": 0.0,
- "routers_loss": 0.003695295425131917,
+ "routers_loss": 0.004022917244583368,
"skip_count": 2.0,
"step": 3082,
"text_loss": 0.17498187720775604
@@ -29296,13 +29296,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0008485394025210016,
- "loss": 0.0078,
+ "loss": 0.0076,
"macro_f1": 0.6666666865348816,
"num_tokens": 4975475.0,
"repeat_count": 0.0,
- "routers_loss": 0.008704355917870998,
+ "routers_loss": 0.009141159243881702,
"skip_count": 1.0,
"step": 3084,
"text_loss": 0.5975366234779358
@@ -29315,13 +29315,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.045166015625,
"learning_rate": 0.0008483174141081956,
- "loss": 0.0111,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 4978858.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031532018911093473,
+ "routers_loss": 0.0031561285723000765,
"skip_count": 0.0,
"step": 3086,
"text_loss": 0.18748866021633148
@@ -29334,13 +29334,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.04150390625,
"learning_rate": 0.0008480952922194991,
"loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 4982142.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007620530668646097,
+ "routers_loss": 0.0007894713780842721,
"skip_count": 0.0,
"step": 3088,
"text_loss": 0.42083197832107544
@@ -29353,13 +29353,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037841796875,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008478730369400302,
- "loss": 0.0086,
+ "loss": 0.0083,
"macro_f1": 0.3333333432674408,
"num_tokens": 4984872.0,
"repeat_count": 0.0,
- "routers_loss": 0.000692489615175873,
+ "routers_loss": 0.0005908289458602667,
"skip_count": 0.0,
"step": 3090,
"text_loss": 0.45337188243865967
@@ -29372,13 +29372,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0240478515625,
+ "grad_norm": 0.02392578125,
"learning_rate": 0.0008476506483549573,
- "loss": 0.0103,
+ "loss": 0.0101,
"macro_f1": 1.0,
"num_tokens": 4988137.0,
"repeat_count": 1.0,
- "routers_loss": 0.001856967923231423,
+ "routers_loss": 0.0016509373672306538,
"skip_count": 2.0,
"step": 3092,
"text_loss": 0.6397262811660767
@@ -29391,13 +29391,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.031982421875,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.0008474281265495002,
- "loss": 0.0075,
+ "loss": 0.0076,
"macro_f1": 0.6666666865348816,
"num_tokens": 4991164.0,
"repeat_count": 0.0,
- "routers_loss": 0.004027622286230326,
+ "routers_loss": 0.004088304936885834,
"skip_count": 1.0,
"step": 3094,
"text_loss": 0.18352322280406952
@@ -29410,32 +29410,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03857421875,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0008472054716089295,
- "loss": 0.0061,
+ "loss": 0.0059,
"macro_f1": 0.3333333432674408,
"num_tokens": 4993876.0,
"repeat_count": 0.0,
- "routers_loss": 0.004844399634748697,
+ "routers_loss": 0.005200014915317297,
"skip_count": 0.0,
"step": 3096,
"text_loss": 0.2776511013507843
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.0,
"acc_skip": 1.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 14.544760786615791,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0286865234375,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.0008469826836185673,
"loss": 0.01,
- "macro_f1": 1.0,
+ "macro_f1": 0.6601307392120361,
"num_tokens": 4997068.0,
"repeat_count": 1.0,
- "routers_loss": 0.012379852123558521,
+ "routers_loss": 0.012686059810221195,
"skip_count": 2.0,
"step": 3098,
"text_loss": 0.23209233582019806
@@ -29448,13 +29448,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.055419921875,
"learning_rate": 0.0008467597626637858,
- "loss": 0.0076,
+ "loss": 0.0074,
"macro_f1": 1.0,
"num_tokens": 5000038.0,
"repeat_count": 1.0,
- "routers_loss": 0.00575951999053359,
+ "routers_loss": 0.006401528604328632,
"skip_count": 2.0,
"step": 3100,
"text_loss": 0.45936745405197144
@@ -29467,13 +29467,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0008465367088300093,
"loss": 0.0075,
"macro_f1": 0.3272727429866791,
"num_tokens": 5002870.0,
"repeat_count": 0.0,
- "routers_loss": 0.013157932087779045,
+ "routers_loss": 0.016640547662973404,
"skip_count": 1.0,
"step": 3102,
"text_loss": 0.44502779841423035
@@ -29486,13 +29486,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0283203125,
+ "grad_norm": 0.0272216796875,
"learning_rate": 0.0008463135222027124,
- "loss": 0.0052,
+ "loss": 0.0054,
"macro_f1": 0.6666666865348816,
"num_tokens": 5006357.0,
"repeat_count": 0.0,
- "routers_loss": 0.008679390884935856,
+ "routers_loss": 0.008411331102252007,
"skip_count": 2.0,
"step": 3104,
"text_loss": 0.3414570391178131
@@ -29505,13 +29505,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0008460902028674204,
- "loss": 0.0059,
+ "loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 5009059.0,
"repeat_count": 0.0,
- "routers_loss": 0.001076352084055543,
+ "routers_loss": 0.0010406570509076118,
"skip_count": 0.0,
"step": 3106,
"text_loss": 0.5931221842765808
@@ -29524,13 +29524,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.0008458667509097098,
- "loss": 0.0112,
+ "loss": 0.0115,
"macro_f1": 0.3333333432674408,
"num_tokens": 5012327.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021328055299818516,
+ "routers_loss": 0.001959054498001933,
"skip_count": 0.0,
"step": 3108,
"text_loss": 0.5191171169281006
@@ -29543,13 +29543,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0008456431664152078,
- "loss": 0.0129,
+ "loss": 0.0127,
"macro_f1": 0.3333333432674408,
"num_tokens": 5015472.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010206506121903658,
+ "routers_loss": 0.000994380097836256,
"skip_count": 0.0,
"step": 3110,
"text_loss": 0.4455361068248749
@@ -29562,13 +29562,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0263671875,
+ "grad_norm": 0.0264892578125,
"learning_rate": 0.0008454194494695923,
- "loss": 0.0111,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 5018901.0,
"repeat_count": 0.0,
- "routers_loss": 0.0041310288943350315,
+ "routers_loss": 0.0037662344984710217,
"skip_count": 0.0,
"step": 3112,
"text_loss": 0.5335362553596497
@@ -29581,13 +29581,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0240478515625,
+ "grad_norm": 0.02294921875,
"learning_rate": 0.0008451956001585923,
- "loss": 0.0066,
+ "loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 5022520.0,
"repeat_count": 0.0,
- "routers_loss": 0.00994859915226698,
+ "routers_loss": 0.008664715103805065,
"skip_count": 3.0,
"step": 3114,
"text_loss": 0.16230148077011108
@@ -29600,13 +29600,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.0498046875,
"learning_rate": 0.000844971618567987,
- "loss": 0.0087,
+ "loss": 0.0086,
"macro_f1": 0.3333333432674408,
"num_tokens": 5025505.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016823343466967344,
+ "routers_loss": 0.0015904927859082818,
"skip_count": 0.0,
"step": 3116,
"text_loss": 0.6989432573318481
@@ -29619,13 +29619,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03369140625,
+ "grad_norm": 0.033935546875,
"learning_rate": 0.0008447475047836068,
- "loss": 0.0061,
+ "loss": 0.0064,
"macro_f1": 0.6666666865348816,
"num_tokens": 5028767.0,
"repeat_count": 0.0,
- "routers_loss": 0.005725692491978407,
+ "routers_loss": 0.005853322334587574,
"skip_count": 1.0,
"step": 3118,
"text_loss": 0.31420737504959106
@@ -29638,13 +29638,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05712890625,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0008445232588913325,
- "loss": 0.0116,
+ "loss": 0.0115,
"macro_f1": 0.3272727429866791,
"num_tokens": 5032577.0,
"repeat_count": 0.0,
- "routers_loss": 0.016534095630049706,
+ "routers_loss": 0.012760105542838573,
"skip_count": 0.0,
"step": 3120,
"text_loss": 0.5534627437591553
@@ -29657,13 +29657,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0008442988809770953,
- "loss": 0.0097,
+ "loss": 0.0095,
"macro_f1": 0.3333333432674408,
"num_tokens": 5035381.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023590524215251207,
+ "routers_loss": 0.0022257440723478794,
"skip_count": 0.0,
"step": 3122,
"text_loss": 0.42492759227752686
@@ -29676,13 +29676,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.03955078125,
"learning_rate": 0.0008440743711268775,
- "loss": 0.0084,
+ "loss": 0.0083,
"macro_f1": 0.3333333432674408,
"num_tokens": 5038743.0,
"repeat_count": 0.0,
- "routers_loss": 0.004739012103527784,
+ "routers_loss": 0.004648433532565832,
"skip_count": 0.0,
"step": 3124,
"text_loss": 0.16404685378074646
@@ -29695,13 +29695,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.03955078125,
"learning_rate": 0.0008438497294267117,
- "loss": 0.0069,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 5041492.0,
"repeat_count": 0.0,
- "routers_loss": 0.006212939508259296,
+ "routers_loss": 0.006313877180218697,
"skip_count": 0.0,
"step": 3126,
"text_loss": 0.23191484808921814
@@ -29714,13 +29714,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0008436249559626807,
"loss": 0.0046,
"macro_f1": 0.6666666865348816,
"num_tokens": 5043955.0,
"repeat_count": 1.0,
- "routers_loss": 0.0036408400628715754,
+ "routers_loss": 0.0036270488053560257,
"skip_count": 0.0,
"step": 3128,
"text_loss": 0.5782018303871155
@@ -29733,13 +29733,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.04345703125,
"learning_rate": 0.0008434000508209187,
"loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 5047571.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038875883910804987,
+ "routers_loss": 0.003809858812019229,
"skip_count": 1.0,
"step": 3130,
"text_loss": 0.7129825949668884
@@ -29752,13 +29752,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.03955078125,
"learning_rate": 0.0008431750140876092,
- "loss": 0.0129,
+ "loss": 0.0128,
"macro_f1": 0.3333333432674408,
"num_tokens": 5051608.0,
"repeat_count": 0.0,
- "routers_loss": 0.002172809559851885,
+ "routers_loss": 0.0022369057405740023,
"skip_count": 0.0,
"step": 3132,
"text_loss": 0.4433445930480957
@@ -29773,11 +29773,11 @@
"f1_skip": 0.0,
"grad_norm": 0.0654296875,
"learning_rate": 0.000842949845848987,
- "loss": 0.0134,
+ "loss": 0.0135,
"macro_f1": 0.32098764181137085,
"num_tokens": 5054656.0,
"repeat_count": 0.0,
- "routers_loss": 0.04427836462855339,
+ "routers_loss": 0.0425117202103138,
"skip_count": 2.0,
"step": 3134,
"text_loss": 0.38721024990081787
@@ -29790,13 +29790,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0008427245461913368,
"loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 5059108.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016648605233058333,
+ "routers_loss": 0.0018077283166348934,
"skip_count": 0.0,
"step": 3136,
"text_loss": 0.7496368885040283
@@ -29809,13 +29809,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.12109375,
"learning_rate": 0.0008424991152009941,
- "loss": 0.0113,
+ "loss": 0.0111,
"macro_f1": 1.0,
"num_tokens": 5062371.0,
"repeat_count": 1.0,
- "routers_loss": 0.008457986637949944,
+ "routers_loss": 0.008801834657788277,
"skip_count": 2.0,
"step": 3138,
"text_loss": 0.5337086319923401
@@ -29828,13 +29828,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.04296875,
"learning_rate": 0.0008422735529643444,
- "loss": 0.0099,
+ "loss": 0.0097,
"macro_f1": 0.6666666865348816,
"num_tokens": 5065593.0,
"repeat_count": 0.0,
- "routers_loss": 0.004939604084938765,
+ "routers_loss": 0.00548676960170269,
"skip_count": 3.0,
"step": 3140,
"text_loss": 0.2561623156070709
@@ -29847,13 +29847,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031982421875,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.0008420478595678233,
- "loss": 0.0077,
+ "loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 5068271.0,
"repeat_count": 0.0,
- "routers_loss": 0.006254551466554403,
+ "routers_loss": 0.006389956455677748,
"skip_count": 0.0,
"step": 3142,
"text_loss": 0.15605193376541138
@@ -29866,13 +29866,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0008418220350979175,
"loss": 0.0128,
"macro_f1": 1.0,
"num_tokens": 5071358.0,
"repeat_count": 1.0,
- "routers_loss": 0.01132921315729618,
+ "routers_loss": 0.012387622147798538,
"skip_count": 2.0,
"step": 3144,
"text_loss": 0.3085838258266449
@@ -29885,13 +29885,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0008415960796411628,
"loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 5075584.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026424501556903124,
+ "routers_loss": 0.00311864772811532,
"skip_count": 1.0,
"step": 3146,
"text_loss": 0.4786977469921112
@@ -29904,13 +29904,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0008413699932841461,
- "loss": 0.0093,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 5078388.0,
"repeat_count": 0.0,
- "routers_loss": 0.0036633017007261515,
+ "routers_loss": 0.0030679800547659397,
"skip_count": 0.0,
"step": 3148,
"text_loss": 0.5222916603088379
@@ -29923,13 +29923,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.0390625,
"learning_rate": 0.0008411437761135039,
- "loss": 0.0112,
+ "loss": 0.011,
"macro_f1": 1.0,
"num_tokens": 5081584.0,
"repeat_count": 1.0,
- "routers_loss": 0.012777967378497124,
+ "routers_loss": 0.012907958589494228,
"skip_count": 2.0,
"step": 3150,
"text_loss": 0.5369884371757507
@@ -29942,13 +29942,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.03759765625,
"learning_rate": 0.0008409174282159232,
- "loss": 0.0074,
+ "loss": 0.0071,
"macro_f1": 0.6666666865348816,
"num_tokens": 5084450.0,
"repeat_count": 0.0,
- "routers_loss": 0.013694444671273232,
+ "routers_loss": 0.012314042076468468,
"skip_count": 2.0,
"step": 3152,
"text_loss": 0.25685277581214905
@@ -29961,13 +29961,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.041015625,
"learning_rate": 0.000840690949678141,
"loss": 0.0091,
"macro_f1": 0.6666666865348816,
"num_tokens": 5087865.0,
"repeat_count": 1.0,
- "routers_loss": 0.008412595838308334,
+ "routers_loss": 0.00899206381291151,
"skip_count": 0.0,
"step": 3154,
"text_loss": 0.1717093288898468
@@ -29980,13 +29980,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.049560546875,
+ "grad_norm": 0.06103515625,
"learning_rate": 0.0008404643405869441,
"loss": 0.0098,
"macro_f1": 0.3333333432674408,
"num_tokens": 5090857.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011648585787042975,
+ "routers_loss": 0.0013312003575265408,
"skip_count": 0.0,
"step": 3156,
"text_loss": 0.27446436882019043
@@ -29999,13 +29999,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0008402376010291695,
- "loss": 0.0127,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 5093917.0,
"repeat_count": 0.0,
- "routers_loss": 0.002915408927947283,
+ "routers_loss": 0.002653320087119937,
"skip_count": 0.0,
"step": 3158,
"text_loss": 0.4237489402294159
@@ -30018,13 +30018,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0008400107310917045,
- "loss": 0.0096,
+ "loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 5096656.0,
"repeat_count": 0.0,
- "routers_loss": 0.013139770366251469,
+ "routers_loss": 0.012976993806660175,
"skip_count": 2.0,
"step": 3160,
"text_loss": 0.42361980676651
@@ -30037,13 +30037,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.054931640625,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.000839783730861486,
"loss": 0.0097,
"macro_f1": 0.6666666865348816,
"num_tokens": 5099582.0,
"repeat_count": 0.0,
- "routers_loss": 0.0070426687598228455,
+ "routers_loss": 0.006936746649444103,
"skip_count": 2.0,
"step": 3162,
"text_loss": 0.26656073331832886
@@ -30056,13 +30056,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0008395566004255008,
"loss": 0.0127,
"macro_f1": 0.6666666865348816,
"num_tokens": 5102908.0,
"repeat_count": 0.0,
- "routers_loss": 0.006271707359701395,
+ "routers_loss": 0.006619359832257032,
"skip_count": 1.0,
"step": 3164,
"text_loss": 0.590774416923523
@@ -30075,13 +30075,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.057373046875,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0008393293398707858,
"loss": 0.0076,
"macro_f1": 0.6666666865348816,
"num_tokens": 5105829.0,
"repeat_count": 0.0,
- "routers_loss": 0.010571467690169811,
+ "routers_loss": 0.010120268911123276,
"skip_count": 2.0,
"step": 3166,
"text_loss": 0.605930507183075
@@ -30094,13 +30094,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008391019492844275,
"loss": 0.0108,
"macro_f1": 0.6666666865348816,
"num_tokens": 5109850.0,
"repeat_count": 0.0,
- "routers_loss": 0.005877034272998571,
+ "routers_loss": 0.004940980114042759,
"skip_count": 2.0,
"step": 3168,
"text_loss": 0.12973152101039886
@@ -30115,11 +30115,11 @@
"f1_skip": 1.0,
"grad_norm": 0.037353515625,
"learning_rate": 0.0008388744287535627,
- "loss": 0.0093,
+ "loss": 0.0094,
"macro_f1": 0.6666666865348816,
"num_tokens": 5113353.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031909283716231585,
+ "routers_loss": 0.0031777634285390377,
"skip_count": 1.0,
"step": 3170,
"text_loss": 0.18577200174331665
@@ -30132,13 +30132,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.052734375,
"learning_rate": 0.0008386467783653775,
- "loss": 0.0104,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 5116421.0,
"repeat_count": 0.0,
- "routers_loss": 0.005338824819773436,
+ "routers_loss": 0.005431659985333681,
"skip_count": 0.0,
"step": 3172,
"text_loss": 0.2302747517824173
@@ -30151,13 +30151,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.046142578125,
"learning_rate": 0.000838418998207108,
- "loss": 0.0073,
+ "loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 5119457.0,
"repeat_count": 0.0,
- "routers_loss": 0.008522412739694118,
+ "routers_loss": 0.0077286697924137115,
"skip_count": 4.0,
"step": 3174,
"text_loss": 0.19606637954711914
@@ -30170,13 +30170,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.0008381910883660399,
- "loss": 0.0068,
+ "loss": 0.007,
"macro_f1": 0.3333333432674408,
"num_tokens": 5123201.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035330590326339006,
+ "routers_loss": 0.003982985392212868,
"skip_count": 0.0,
"step": 3176,
"text_loss": 0.716376006603241
@@ -30189,13 +30189,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.09375,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0008379630489295089,
- "loss": 0.0106,
+ "loss": 0.0109,
"macro_f1": 0.6666666865348816,
"num_tokens": 5126035.0,
"repeat_count": 0.0,
- "routers_loss": 0.006332095246762037,
+ "routers_loss": 0.005626026075333357,
"skip_count": 1.0,
"step": 3178,
"text_loss": 0.5144625902175903
@@ -30208,13 +30208,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0008377348799849,
"loss": 0.0086,
"macro_f1": 0.6666666865348816,
"num_tokens": 5129179.0,
"repeat_count": 0.0,
- "routers_loss": 0.017295993864536285,
+ "routers_loss": 0.015458245761692524,
"skip_count": 2.0,
"step": 3180,
"text_loss": 0.29887503385543823
@@ -30227,13 +30227,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0703125,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0008375065816196479,
- "loss": 0.0088,
+ "loss": 0.0086,
"macro_f1": 0.5492662787437439,
"num_tokens": 5132149.0,
"repeat_count": 0.0,
- "routers_loss": 0.017241213470697403,
+ "routers_loss": 0.012210468761622906,
"skip_count": 2.0,
"step": 3182,
"text_loss": 0.8981851935386658
@@ -30246,13 +30246,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0008372781539212371,
"loss": 0.0058,
"macro_f1": 0.3333333432674408,
"num_tokens": 5135287.0,
"repeat_count": 0.0,
- "routers_loss": 0.00516276340931654,
+ "routers_loss": 0.0052537876181304455,
"skip_count": 0.0,
"step": 3184,
"text_loss": 0.4245666563510895
@@ -30265,13 +30265,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.022705078125,
+ "grad_norm": 0.0240478515625,
"learning_rate": 0.0008370495969772014,
- "loss": 0.0077,
+ "loss": 0.0075,
"macro_f1": 0.6666666865348816,
"num_tokens": 5138589.0,
"repeat_count": 0.0,
- "routers_loss": 0.012517380528151989,
+ "routers_loss": 0.012873421423137188,
"skip_count": 2.0,
"step": 3186,
"text_loss": 0.40581050515174866
@@ -30284,13 +30284,13 @@
"f1_execute": 0.95652174949646,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07177734375,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0008368209108751244,
- "loss": 0.0129,
+ "loss": 0.0127,
"macro_f1": 0.6521739363670349,
"num_tokens": 5141635.0,
"repeat_count": 2.0,
- "routers_loss": 0.0810512825846672,
+ "routers_loss": 0.07720445841550827,
"skip_count": 4.0,
"step": 3188,
"text_loss": 0.3755173981189728
@@ -30303,13 +30303,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.01953125,
+ "grad_norm": 0.02197265625,
"learning_rate": 0.0008365920957026389,
- "loss": 0.0076,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 5144728.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014350182609632611,
+ "routers_loss": 0.001440995605662465,
"skip_count": 0.0,
"step": 3190,
"text_loss": 0.5067034363746643
@@ -30322,13 +30322,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.046142578125,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0008363631515474275,
- "loss": 0.0091,
+ "loss": 0.0089,
"macro_f1": 0.6538461446762085,
"num_tokens": 5147963.0,
"repeat_count": 1.0,
- "routers_loss": 0.018022676929831505,
+ "routers_loss": 0.018752984702587128,
"skip_count": 2.0,
"step": 3192,
"text_loss": 0.20224551856517792
@@ -30341,13 +30341,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042236328125,
+ "grad_norm": 0.037353515625,
"learning_rate": 0.0008361340784972217,
- "loss": 0.0092,
+ "loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 5151184.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005097229732200503,
+ "routers_loss": 0.0005360354552976787,
"skip_count": 0.0,
"step": 3194,
"text_loss": 0.4588058292865753
@@ -30360,13 +30360,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03173828125,
+ "grad_norm": 0.0390625,
"learning_rate": 0.0008359048766398031,
"loss": 0.0079,
"macro_f1": 0.6666666865348816,
"num_tokens": 5153889.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009840037673711777,
+ "routers_loss": 0.0009184491937048733,
"skip_count": 1.0,
"step": 3196,
"text_loss": 0.2980220317840576
@@ -30379,13 +30379,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02685546875,
+ "grad_norm": 0.027099609375,
"learning_rate": 0.000835675546063002,
- "loss": 0.0058,
+ "loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 5156758.0,
"repeat_count": 0.0,
- "routers_loss": 0.001269801170565188,
+ "routers_loss": 0.001252970308996737,
"skip_count": 0.0,
"step": 3198,
"text_loss": 0.6775755882263184
@@ -30398,13 +30398,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0008354460868546985,
- "loss": 0.0071,
+ "loss": 0.0072,
"macro_f1": 0.3333333432674408,
"num_tokens": 5160247.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034889329690486193,
+ "routers_loss": 0.0037315806839615107,
"skip_count": 0.0,
"step": 3200,
"text_loss": 0.35867011547088623
@@ -30417,13 +30417,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.034912109375,
"learning_rate": 0.0008352164991028217,
- "loss": 0.0091,
+ "loss": 0.0092,
"macro_f1": 0.6666666865348816,
"num_tokens": 5163456.0,
"repeat_count": 1.0,
- "routers_loss": 0.001520772697404027,
+ "routers_loss": 0.001497485558502376,
"skip_count": 0.0,
"step": 3202,
"text_loss": 0.690290093421936
@@ -30436,13 +30436,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03662109375,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.0008349867828953501,
"loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 5166139.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011800233041867614,
+ "routers_loss": 0.001051135826855898,
"skip_count": 0.0,
"step": 3204,
"text_loss": 0.3340415954589844
@@ -30455,13 +30455,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0008347569383203113,
- "loss": 0.01,
+ "loss": 0.0098,
"macro_f1": 0.3333333432674408,
"num_tokens": 5169009.0,
"repeat_count": 0.0,
- "routers_loss": 0.001043233904056251,
+ "routers_loss": 0.0010544003453105688,
"skip_count": 0.0,
"step": 3206,
"text_loss": 0.8584878444671631
@@ -30474,13 +30474,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0008345269654657823,
- "loss": 0.0084,
+ "loss": 0.0085,
"macro_f1": 1.0,
"num_tokens": 5172618.0,
"repeat_count": 1.0,
- "routers_loss": 0.007460868917405605,
+ "routers_loss": 0.007312417030334473,
"skip_count": 1.0,
"step": 3208,
"text_loss": 0.19500218331813812
@@ -30493,13 +30493,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0361328125,
+ "grad_norm": 0.03466796875,
"learning_rate": 0.0008342968644198892,
- "loss": 0.0067,
+ "loss": 0.0065,
"macro_f1": 0.3333333432674408,
"num_tokens": 5175857.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027419133111834526,
+ "routers_loss": 0.00276504410430789,
"skip_count": 0.0,
"step": 3210,
"text_loss": 0.5446314215660095
@@ -30512,13 +30512,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.037109375,
"learning_rate": 0.0008340666352708068,
- "loss": 0.0089,
+ "loss": 0.0088,
"macro_f1": 0.3333333432674408,
"num_tokens": 5178585.0,
"repeat_count": 0.0,
- "routers_loss": 0.002764733275398612,
+ "routers_loss": 0.002669303445145488,
"skip_count": 0.0,
"step": 3212,
"text_loss": 0.3687484860420227
@@ -30531,13 +30531,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0284423828125,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.0008338362781067596,
"loss": 0.0075,
"macro_f1": 0.3333333432674408,
"num_tokens": 5181777.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032288613729178905,
+ "routers_loss": 0.0031585274264216423,
"skip_count": 0.0,
"step": 3214,
"text_loss": 0.27325859665870667
@@ -30550,13 +30550,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.040283203125,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.000833605793016021,
"loss": 0.009,
"macro_f1": 0.6666666865348816,
"num_tokens": 5184312.0,
"repeat_count": 0.0,
- "routers_loss": 0.008322423323988914,
+ "routers_loss": 0.008807534351944923,
"skip_count": 2.0,
"step": 3216,
"text_loss": 0.4466548562049866
@@ -30569,13 +30569,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040283203125,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0008333751800869133,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 5187497.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034384531900286674,
+ "routers_loss": 0.003171310294419527,
"skip_count": 0.0,
"step": 3218,
"text_loss": 0.5423526763916016
@@ -30588,13 +30588,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0228271484375,
+ "grad_norm": 0.025634765625,
"learning_rate": 0.0008331444394078076,
- "loss": 0.0081,
+ "loss": 0.008,
"macro_f1": 0.6666666865348816,
"num_tokens": 5190982.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015023534651845694,
+ "routers_loss": 0.0016481258207932115,
"skip_count": 2.0,
"step": 3220,
"text_loss": 0.48984917998313904
@@ -30607,13 +30607,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.03173828125,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.000832913571067124,
- "loss": 0.0108,
+ "loss": 0.0107,
"macro_f1": 1.0,
"num_tokens": 5194044.0,
"repeat_count": 1.0,
- "routers_loss": 0.0043489462696015835,
+ "routers_loss": 0.003957313951104879,
"skip_count": 1.0,
"step": 3222,
"text_loss": 0.4533331096172333
@@ -30626,13 +30626,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.0008326825751533322,
- "loss": 0.0076,
+ "loss": 0.0075,
"macro_f1": 0.3333333432674408,
"num_tokens": 5197092.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012065734481438994,
+ "routers_loss": 0.0016904744552448392,
"skip_count": 0.0,
"step": 3224,
"text_loss": 0.5538802742958069
@@ -30645,13 +30645,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06005859375,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0008324514517549501,
- "loss": 0.0084,
+ "loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 5199941.0,
"repeat_count": 0.0,
- "routers_loss": 0.006849290337413549,
+ "routers_loss": 0.005608258303254843,
"skip_count": 1.0,
"step": 3226,
"text_loss": 0.416242778301239
@@ -30664,32 +30664,32 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.03857421875,
+ "grad_norm": 0.040771484375,
"learning_rate": 0.0008322202009605444,
- "loss": 0.0073,
+ "loss": 0.0072,
"macro_f1": 0.8823530077934265,
"num_tokens": 5202618.0,
"repeat_count": 1.0,
- "routers_loss": 0.020665202289819717,
+ "routers_loss": 0.020965175703167915,
"skip_count": 2.0,
"step": 3228,
"text_loss": 0.17496295273303986
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 1.0,
- "avg_layers": 23.0,
+ "avg_layers": 24.0,
"epoch": 15.164367478720282,
- "f1_execute": 0.9777777791023254,
- "f1_repeat": 0.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 1.0,
"grad_norm": 0.04052734375,
"learning_rate": 0.0008319888228587311,
"loss": 0.0063,
- "macro_f1": 0.6592592597007751,
+ "macro_f1": 1.0,
"num_tokens": 5206414.0,
"repeat_count": 1.0,
- "routers_loss": 0.026284674182534218,
+ "routers_loss": 0.021259209141135216,
"skip_count": 5.0,
"step": 3230,
"text_loss": 0.22471418976783752
@@ -30702,13 +30702,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03076171875,
+ "grad_norm": 0.029541015625,
"learning_rate": 0.0008317573175381745,
"loss": 0.0115,
"macro_f1": 0.3333333432674408,
"num_tokens": 5209768.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018494570394977927,
+ "routers_loss": 0.0018647604156285524,
"skip_count": 0.0,
"step": 3232,
"text_loss": 0.4415269196033478
@@ -30721,13 +30721,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.027099609375,
+ "grad_norm": 0.0283203125,
"learning_rate": 0.0008315256850875881,
- "loss": 0.0061,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 5213257.0,
"repeat_count": 0.0,
- "routers_loss": 0.002610588213428855,
+ "routers_loss": 0.002345515415072441,
"skip_count": 0.0,
"step": 3234,
"text_loss": 0.347247838973999
@@ -30740,13 +30740,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048828125,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0008312939255957336,
- "loss": 0.0084,
+ "loss": 0.0086,
"macro_f1": 0.6666666865348816,
"num_tokens": 5215800.0,
"repeat_count": 0.0,
- "routers_loss": 0.007061914075165987,
+ "routers_loss": 0.007112892810255289,
"skip_count": 3.0,
"step": 3236,
"text_loss": 0.31091734766960144
@@ -30759,13 +30759,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.033203125,
"learning_rate": 0.0008310620391514219,
- "loss": 0.0083,
+ "loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 5219205.0,
"repeat_count": 0.0,
- "routers_loss": 0.004094691481441259,
+ "routers_loss": 0.00432228296995163,
"skip_count": 0.0,
"step": 3238,
"text_loss": 0.3421775996685028
@@ -30778,13 +30778,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.024658203125,
+ "grad_norm": 0.027099609375,
"learning_rate": 0.0008308300258435124,
"loss": 0.0085,
"macro_f1": 0.6666666865348816,
"num_tokens": 5222422.0,
"repeat_count": 0.0,
- "routers_loss": 0.007662596181035042,
+ "routers_loss": 0.0076514314860105515,
"skip_count": 2.0,
"step": 3240,
"text_loss": 0.22378318011760712
@@ -30797,13 +30797,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0264892578125,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0008305978857609128,
- "loss": 0.0073,
+ "loss": 0.0072,
"macro_f1": 0.3333333432674408,
"num_tokens": 5225625.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008108283509500325,
+ "routers_loss": 0.0007617069641128182,
"skip_count": 0.0,
"step": 3242,
"text_loss": 0.5880323648452759
@@ -30816,13 +30816,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0281982421875,
+ "grad_norm": 0.02734375,
"learning_rate": 0.0008303656189925799,
- "loss": 0.0084,
+ "loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 5229113.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018137742299586535,
+ "routers_loss": 0.0017418119823560119,
"skip_count": 0.0,
"step": 3244,
"text_loss": 0.3302813768386841
@@ -30835,13 +30835,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.042724609375,
"learning_rate": 0.0008301332256275183,
"loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 5232061.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025301240384578705,
+ "routers_loss": 0.0026667986530810595,
"skip_count": 0.0,
"step": 3246,
"text_loss": 0.5679706335067749
@@ -30854,13 +30854,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.052001953125,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0008299007057547821,
- "loss": 0.0101,
+ "loss": 0.0106,
"macro_f1": 1.0,
"num_tokens": 5235279.0,
"repeat_count": 1.0,
- "routers_loss": 0.011231686919927597,
+ "routers_loss": 0.011016624979674816,
"skip_count": 2.0,
"step": 3248,
"text_loss": 0.5081504583358765
@@ -30873,13 +30873,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.033203125,
"learning_rate": 0.0008296680594634731,
- "loss": 0.0074,
+ "loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 5239655.0,
"repeat_count": 1.0,
- "routers_loss": 0.005881415214389563,
+ "routers_loss": 0.005492044147104025,
"skip_count": 0.0,
"step": 3250,
"text_loss": 0.14675180613994598
@@ -30892,13 +30892,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0277099609375,
+ "grad_norm": 0.0269775390625,
"learning_rate": 0.0008294352868427418,
- "loss": 0.0056,
+ "loss": 0.0055,
"macro_f1": 0.6666666865348816,
"num_tokens": 5243579.0,
"repeat_count": 0.0,
- "routers_loss": 0.004495301283895969,
+ "routers_loss": 0.00404445780441165,
"skip_count": 1.0,
"step": 3252,
"text_loss": 0.4201085865497589
@@ -30911,13 +30911,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0208740234375,
+ "grad_norm": 0.0242919921875,
"learning_rate": 0.0008292023879817871,
- "loss": 0.0052,
+ "loss": 0.0053,
"macro_f1": 0.6666666865348816,
"num_tokens": 5247059.0,
"repeat_count": 0.0,
- "routers_loss": 0.007394428364932537,
+ "routers_loss": 0.006886140909045935,
"skip_count": 1.0,
"step": 3254,
"text_loss": 0.2289208322763443
@@ -30930,32 +30930,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06201171875,
+ "grad_norm": 0.057861328125,
"learning_rate": 0.0008289693629698564,
- "loss": 0.0077,
+ "loss": 0.0073,
"macro_f1": 0.3333333432674408,
"num_tokens": 5249940.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006736332434229553,
+ "routers_loss": 0.0005736657767556608,
"skip_count": 0.0,
"step": 3256,
"text_loss": 0.5670450925827026
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 15.295861461696507,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.0224609375,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
"learning_rate": 0.0008287362118962452,
- "loss": 0.0062,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.006,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 5253580.0,
"repeat_count": 0.0,
- "routers_loss": 0.009847268462181091,
+ "routers_loss": 0.011349895037710667,
"skip_count": 1.0,
"step": 3258,
"text_loss": 0.5042323470115662
@@ -30968,13 +30968,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.026611328125,
+ "grad_norm": 0.0267333984375,
"learning_rate": 0.0008285029348502973,
"loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 5257080.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013670918997377157,
+ "routers_loss": 0.0013626761501654983,
"skip_count": 0.0,
"step": 3260,
"text_loss": 0.3227672874927521
@@ -30987,13 +30987,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02587890625,
+ "grad_norm": 0.0245361328125,
"learning_rate": 0.0008282695319214053,
"loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 5259951.0,
"repeat_count": 0.0,
- "routers_loss": 0.004696785472333431,
+ "routers_loss": 0.00471635302528739,
"skip_count": 0.0,
"step": 3262,
"text_loss": 0.20773714780807495
@@ -31006,13 +31006,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0008280360031990093,
- "loss": 0.0108,
+ "loss": 0.0107,
"macro_f1": 0.6666666865348816,
"num_tokens": 5263314.0,
"repeat_count": 0.0,
- "routers_loss": 0.010588239878416061,
+ "routers_loss": 0.010472415015101433,
"skip_count": 2.0,
"step": 3264,
"text_loss": 0.34397366642951965
@@ -31025,13 +31025,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.000827802348772598,
- "loss": 0.0084,
+ "loss": 0.0083,
"macro_f1": 0.3333333432674408,
"num_tokens": 5267358.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010326795745640993,
+ "routers_loss": 0.0007814752752892673,
"skip_count": 0.0,
"step": 3266,
"text_loss": 0.747342586517334
@@ -31044,13 +31044,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.049560546875,
+ "grad_norm": 0.0498046875,
"learning_rate": 0.0008275685687317084,
- "loss": 0.0087,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 5270400.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010199147509410977,
+ "routers_loss": 0.000902949133887887,
"skip_count": 0.0,
"step": 3268,
"text_loss": 0.43782034516334534
@@ -31063,13 +31063,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03173828125,
+ "grad_norm": 0.03564453125,
"learning_rate": 0.0008273346631659252,
- "loss": 0.0069,
+ "loss": 0.007,
"macro_f1": 0.3333333432674408,
"num_tokens": 5273147.0,
"repeat_count": 0.0,
- "routers_loss": 0.00046372212818823755,
+ "routers_loss": 0.00043462219764478505,
"skip_count": 0.0,
"step": 3270,
"text_loss": 0.6358205080032349
@@ -31082,13 +31082,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0380859375,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0008271006321648816,
- "loss": 0.0088,
+ "loss": 0.0085,
"macro_f1": 0.3333333432674408,
"num_tokens": 5277638.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022951713763177395,
+ "routers_loss": 0.002211218234151602,
"skip_count": 0.0,
"step": 3272,
"text_loss": 0.20220105350017548
@@ -31101,13 +31101,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.0008268664758182589,
- "loss": 0.0077,
+ "loss": 0.0075,
"macro_f1": 0.6666666865348816,
"num_tokens": 5280638.0,
"repeat_count": 1.0,
- "routers_loss": 0.008325734175741673,
+ "routers_loss": 0.010536720044910908,
"skip_count": 0.0,
"step": 3274,
"text_loss": 0.7579061388969421
@@ -31120,32 +31120,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0008266321942157859,
- "loss": 0.007,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 5283847.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017014809418469667,
+ "routers_loss": 0.0017158017726615071,
"skip_count": 0.0,
"step": 3276,
"text_loss": 0.669302761554718
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.800000011920929,
"acc_skip": 1.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 15.389785735250953,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.888888955116272,
"f1_skip": 1.0,
- "grad_norm": 0.06787109375,
+ "grad_norm": 0.06005859375,
"learning_rate": 0.0008263977874472399,
- "loss": 0.0089,
- "macro_f1": 1.0,
+ "loss": 0.0088,
+ "macro_f1": 0.9544159770011902,
"num_tokens": 5286627.0,
"repeat_count": 5.0,
- "routers_loss": 0.009527196176350117,
+ "routers_loss": 0.011220700107514858,
"skip_count": 4.0,
"step": 3278,
"text_loss": 0.8703984022140503
@@ -31158,13 +31158,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.060546875,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0008261632556024461,
- "loss": 0.01,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 5289766.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025269081816077232,
+ "routers_loss": 0.0020442772656679153,
"skip_count": 0.0,
"step": 3280,
"text_loss": 0.5009346008300781
@@ -31177,13 +31177,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.10107421875,
"learning_rate": 0.0008259285987712774,
- "loss": 0.0108,
+ "loss": 0.0106,
"macro_f1": 0.3333333432674408,
"num_tokens": 5293010.0,
"repeat_count": 0.0,
- "routers_loss": 0.005710822530090809,
+ "routers_loss": 0.005645765457302332,
"skip_count": 0.0,
"step": 3282,
"text_loss": 0.2546011209487915
@@ -31196,13 +31196,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.042236328125,
"learning_rate": 0.0008256938170436549,
- "loss": 0.0114,
+ "loss": 0.0111,
"macro_f1": 0.6666666865348816,
"num_tokens": 5296732.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028946297243237495,
+ "routers_loss": 0.0027385836001485586,
"skip_count": 2.0,
"step": 3284,
"text_loss": 0.5244000554084778
@@ -31217,11 +31217,11 @@
"f1_skip": 1.0,
"grad_norm": 0.0296630859375,
"learning_rate": 0.0008254589105095473,
- "loss": 0.0059,
+ "loss": 0.0061,
"macro_f1": 1.0,
"num_tokens": 5299926.0,
"repeat_count": 1.0,
- "routers_loss": 0.007981270551681519,
+ "routers_loss": 0.007451715879142284,
"skip_count": 1.0,
"step": 3286,
"text_loss": 0.28979742527008057
@@ -31234,13 +31234,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0238037109375,
+ "grad_norm": 0.0218505859375,
"learning_rate": 0.0008252238792589711,
- "loss": 0.0085,
+ "loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 5303006.0,
"repeat_count": 0.0,
- "routers_loss": 0.005524218548089266,
+ "routers_loss": 0.004805843345820904,
"skip_count": 2.0,
"step": 3288,
"text_loss": 0.5131978392601013
@@ -31253,13 +31253,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03857421875,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.000824988723381991,
- "loss": 0.0092,
+ "loss": 0.0091,
"macro_f1": 0.3272727429866791,
"num_tokens": 5306953.0,
"repeat_count": 0.0,
- "routers_loss": 0.01160401664674282,
+ "routers_loss": 0.010639613494277,
"skip_count": 1.0,
"step": 3290,
"text_loss": 0.4901447296142578
@@ -31272,13 +31272,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.0008247534429687191,
- "loss": 0.0069,
+ "loss": 0.007,
"macro_f1": 0.5492662787437439,
"num_tokens": 5310516.0,
"repeat_count": 0.0,
- "routers_loss": 0.014068983495235443,
+ "routers_loss": 0.013625577092170715,
"skip_count": 2.0,
"step": 3292,
"text_loss": 0.2124534696340561
@@ -31291,13 +31291,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0008245180381093152,
- "loss": 0.0116,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 5313959.0,
"repeat_count": 0.0,
- "routers_loss": 0.00520911393687129,
+ "routers_loss": 0.004958513658493757,
"skip_count": 1.0,
"step": 3294,
"text_loss": 0.46682238578796387
@@ -31310,13 +31310,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0008242825088939867,
- "loss": 0.0085,
+ "loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 5316609.0,
"repeat_count": 0.0,
- "routers_loss": 0.004490343388170004,
+ "routers_loss": 0.003962756600230932,
"skip_count": 0.0,
"step": 3296,
"text_loss": 0.7010108232498169
@@ -31329,13 +31329,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0008240468554129892,
- "loss": 0.0078,
+ "loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 5319638.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006864524912089109,
+ "routers_loss": 0.0006996620795689523,
"skip_count": 0.0,
"step": 3298,
"text_loss": 0.4966355860233307
@@ -31348,13 +31348,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0008238110777566255,
"loss": 0.0101,
"macro_f1": 0.3333333432674408,
"num_tokens": 5323019.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017158432165160775,
+ "routers_loss": 0.0016031896229833364,
"skip_count": 0.0,
"step": 3300,
"text_loss": 0.38668957352638245
@@ -31367,13 +31367,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.0303955078125,
"learning_rate": 0.0008235751760152459,
- "loss": 0.0064,
+ "loss": 0.0063,
"macro_f1": 1.0,
"num_tokens": 5326099.0,
"repeat_count": 2.0,
- "routers_loss": 0.0037166383117437363,
+ "routers_loss": 0.00344281829893589,
"skip_count": 2.0,
"step": 3302,
"text_loss": 0.5330720543861389
@@ -31386,13 +31386,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05126953125,
+ "grad_norm": 0.06005859375,
"learning_rate": 0.0008233391502792484,
- "loss": 0.0073,
+ "loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 5328993.0,
"repeat_count": 0.0,
- "routers_loss": 0.008341175504028797,
+ "routers_loss": 0.007886730134487152,
"skip_count": 1.0,
"step": 3304,
"text_loss": 0.5470269322395325
@@ -31405,13 +31405,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03271484375,
+ "grad_norm": 0.034423828125,
"learning_rate": 0.0008231030006390786,
"loss": 0.0067,
"macro_f1": 0.6666666865348816,
"num_tokens": 5331554.0,
"repeat_count": 0.0,
- "routers_loss": 0.008380163460969925,
+ "routers_loss": 0.008180000819265842,
"skip_count": 1.0,
"step": 3306,
"text_loss": 0.4023340344429016
@@ -31424,13 +31424,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0242919921875,
+ "grad_norm": 0.02587890625,
"learning_rate": 0.0008228667271852294,
- "loss": 0.0062,
+ "loss": 0.0059,
"macro_f1": 0.3333333432674408,
"num_tokens": 5335712.0,
"repeat_count": 0.0,
- "routers_loss": 0.00030099941068328917,
+ "routers_loss": 0.0002942821884062141,
"skip_count": 0.0,
"step": 3308,
"text_loss": 0.5306711792945862
@@ -31443,13 +31443,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0615234375,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0008226303300082414,
- "loss": 0.0095,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 5338701.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006003376329317689,
+ "routers_loss": 0.0006134595023468137,
"skip_count": 0.0,
"step": 3310,
"text_loss": 0.5906263589859009
@@ -31462,13 +31462,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.02880859375,
"learning_rate": 0.0008223938091987022,
- "loss": 0.0073,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 5342274.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017984671285375953,
+ "routers_loss": 0.0016656654188409448,
"skip_count": 0.0,
"step": 3312,
"text_loss": 0.5201764106750488
@@ -31481,13 +31481,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.055419921875,
+ "grad_norm": 0.052001953125,
"learning_rate": 0.0008221571648472472,
- "loss": 0.0066,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 5345185.0,
"repeat_count": 0.0,
- "routers_loss": 0.003994898404926062,
+ "routers_loss": 0.0038612703792750835,
"skip_count": 0.0,
"step": 3314,
"text_loss": 0.36633720993995667
@@ -31500,13 +31500,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0008219203970445589,
"loss": 0.011,
"macro_f1": 0.3272727429866791,
"num_tokens": 5348804.0,
"repeat_count": 0.0,
- "routers_loss": 0.009415820240974426,
+ "routers_loss": 0.009782899171113968,
"skip_count": 1.0,
"step": 3316,
"text_loss": 0.3117460012435913
@@ -31519,13 +31519,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.053955078125,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.0008216835058813672,
- "loss": 0.0091,
+ "loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 5351896.0,
"repeat_count": 0.0,
- "routers_loss": 0.006483082659542561,
+ "routers_loss": 0.007713229861110449,
"skip_count": 0.0,
"step": 3318,
"text_loss": 0.253496378660202
@@ -31538,13 +31538,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02880859375,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0008214464914484492,
"loss": 0.0062,
"macro_f1": 0.6666666865348816,
"num_tokens": 5355058.0,
"repeat_count": 0.0,
- "routers_loss": 0.006275791209191084,
+ "routers_loss": 0.006227815989404917,
"skip_count": 2.0,
"step": 3320,
"text_loss": 0.32693132758140564
@@ -31557,13 +31557,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0008212093538366292,
"loss": 0.0099,
"macro_f1": 0.3333333432674408,
"num_tokens": 5358365.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027182933408766985,
+ "routers_loss": 0.002601418411359191,
"skip_count": 0.0,
"step": 3322,
"text_loss": 0.40394455194473267
@@ -31576,13 +31576,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.000820972093136779,
"loss": 0.0079,
"macro_f1": 0.6666666865348816,
"num_tokens": 5360981.0,
"repeat_count": 0.0,
- "routers_loss": 0.005600054748356342,
+ "routers_loss": 0.005545300897210836,
"skip_count": 3.0,
"step": 3324,
"text_loss": 0.6758295893669128
@@ -31595,13 +31595,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.05078125,
"learning_rate": 0.0008207347094398172,
"loss": 0.0096,
"macro_f1": 0.6666666865348816,
"num_tokens": 5364018.0,
"repeat_count": 1.0,
- "routers_loss": 0.0020965971052646637,
+ "routers_loss": 0.001924700103700161,
"skip_count": 0.0,
"step": 3326,
"text_loss": 0.5196860432624817
@@ -31614,13 +31614,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0311279296875,
+ "grad_norm": 0.0299072265625,
"learning_rate": 0.0008204972028367097,
- "loss": 0.006,
+ "loss": 0.0057,
"macro_f1": 0.6666666865348816,
"num_tokens": 5366986.0,
"repeat_count": 0.0,
- "routers_loss": 0.011729889549314976,
+ "routers_loss": 0.012254828587174416,
"skip_count": 1.0,
"step": 3328,
"text_loss": 0.24661913514137268
@@ -31633,13 +31633,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.0008202595734184694,
"loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 5371463.0,
"repeat_count": 0.0,
- "routers_loss": 0.004913534037768841,
+ "routers_loss": 0.005094083491712809,
"skip_count": 0.0,
"step": 3330,
"text_loss": 0.2525769770145416
@@ -31652,13 +31652,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.0008200218212761566,
- "loss": 0.0111,
+ "loss": 0.0108,
"macro_f1": 0.6666666865348816,
"num_tokens": 5374823.0,
"repeat_count": 1.0,
- "routers_loss": 0.0028079606126993895,
+ "routers_loss": 0.0025883198250085115,
"skip_count": 0.0,
"step": 3332,
"text_loss": 0.21849912405014038
@@ -31671,13 +31671,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031982421875,
+ "grad_norm": 0.030029296875,
"learning_rate": 0.000819783946500878,
"loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 5377640.0,
"repeat_count": 0.0,
- "routers_loss": 0.008404970169067383,
+ "routers_loss": 0.008240507915616035,
"skip_count": 0.0,
"step": 3334,
"text_loss": 0.2662734091281891
@@ -31690,13 +31690,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.048583984375,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.000819545949183788,
- "loss": 0.0101,
+ "loss": 0.01,
"macro_f1": 0.5934640765190125,
"num_tokens": 5380593.0,
"repeat_count": 0.0,
- "routers_loss": 0.040179044008255005,
+ "routers_loss": 0.038378193974494934,
"skip_count": 3.0,
"step": 3336,
"text_loss": 0.2431795746088028
@@ -31709,13 +31709,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.0008193078294160874,
- "loss": 0.0096,
+ "loss": 0.0097,
"macro_f1": 1.0,
"num_tokens": 5384487.0,
"repeat_count": 1.0,
- "routers_loss": 0.005122583359479904,
+ "routers_loss": 0.005926199723035097,
"skip_count": 1.0,
"step": 3338,
"text_loss": 0.5663705468177795
@@ -31728,13 +31728,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.0008190695872890242,
- "loss": 0.0056,
+ "loss": 0.0055,
"macro_f1": 0.6666666865348816,
"num_tokens": 5387511.0,
"repeat_count": 0.0,
- "routers_loss": 0.012232085689902306,
+ "routers_loss": 0.010842559859156609,
"skip_count": 2.0,
"step": 3340,
"text_loss": 0.11517292261123657
@@ -31747,13 +31747,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.029296875,
+ "grad_norm": 0.0283203125,
"learning_rate": 0.0008188312228938933,
- "loss": 0.009,
+ "loss": 0.0088,
"macro_f1": 0.3333333432674408,
"num_tokens": 5390698.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011168667115271091,
+ "routers_loss": 0.001304097007960081,
"skip_count": 0.0,
"step": 3342,
"text_loss": 0.4827076196670532
@@ -31766,13 +31766,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.03515625,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0008185927363220363,
- "loss": 0.0088,
+ "loss": 0.0087,
"macro_f1": 0.6666666865348816,
"num_tokens": 5393778.0,
"repeat_count": 1.0,
- "routers_loss": 0.005202370695769787,
+ "routers_loss": 0.005354117136448622,
"skip_count": 0.0,
"step": 3344,
"text_loss": 0.44467049837112427
@@ -31785,13 +31785,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.040771484375,
"learning_rate": 0.0008183541276648418,
- "loss": 0.0081,
+ "loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 5396925.0,
"repeat_count": 0.0,
- "routers_loss": 0.005000839475542307,
+ "routers_loss": 0.004800073802471161,
"skip_count": 2.0,
"step": 3346,
"text_loss": 0.2032834142446518
@@ -31804,13 +31804,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025634765625,
+ "grad_norm": 0.027587890625,
"learning_rate": 0.0008181153970137449,
- "loss": 0.0059,
+ "loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 5400522.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020684092305600643,
+ "routers_loss": 0.0021674633026123047,
"skip_count": 0.0,
"step": 3348,
"text_loss": 0.4507528841495514
@@ -31823,13 +31823,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0008178765444602278,
"loss": 0.0117,
"macro_f1": 0.8820862174034119,
"num_tokens": 5403526.0,
"repeat_count": 2.0,
- "routers_loss": 0.040753237903118134,
+ "routers_loss": 0.04263930395245552,
"skip_count": 2.0,
"step": 3350,
"text_loss": 0.3606615960597992
@@ -31842,13 +31842,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0008176375700958194,
- "loss": 0.0089,
+ "loss": 0.0087,
"macro_f1": 0.6666666865348816,
"num_tokens": 5407127.0,
"repeat_count": 1.0,
- "routers_loss": 0.007767915725708008,
+ "routers_loss": 0.006953123956918716,
"skip_count": 0.0,
"step": 3352,
"text_loss": 0.2290353775024414
@@ -31861,13 +31861,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0008173984740120948,
"loss": 0.0055,
"macro_f1": 0.3333333432674408,
"num_tokens": 5410829.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016073459992185235,
+ "routers_loss": 0.0014363783411681652,
"skip_count": 0.0,
"step": 3354,
"text_loss": 0.4220392405986786
@@ -31880,13 +31880,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02880859375,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0008171592563006762,
- "loss": 0.0078,
+ "loss": 0.0079,
"macro_f1": 0.6666666865348816,
"num_tokens": 5414152.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016132282325997949,
+ "routers_loss": 0.00202389364130795,
"skip_count": 1.0,
"step": 3356,
"text_loss": 0.37729766964912415
@@ -31899,13 +31899,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0008169199170532323,
- "loss": 0.007,
+ "loss": 0.0067,
"macro_f1": 0.6666666865348816,
"num_tokens": 5417312.0,
"repeat_count": 0.0,
- "routers_loss": 0.007077203597873449,
+ "routers_loss": 0.006253739818930626,
"skip_count": 2.0,
"step": 3358,
"text_loss": 0.1304289996623993
@@ -31918,13 +31918,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0008166804563614785,
- "loss": 0.0088,
+ "loss": 0.0084,
"macro_f1": 1.0,
"num_tokens": 5421227.0,
"repeat_count": 2.0,
- "routers_loss": 0.01628093235194683,
+ "routers_loss": 0.01622140221297741,
"skip_count": 2.0,
"step": 3360,
"text_loss": 0.298664391040802
@@ -31937,13 +31937,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0250244140625,
+ "grad_norm": 0.024169921875,
"learning_rate": 0.0008164408743171763,
- "loss": 0.0064,
+ "loss": 0.0062,
"macro_f1": 1.0,
"num_tokens": 5424646.0,
"repeat_count": 1.0,
- "routers_loss": 0.003795142285525799,
+ "routers_loss": 0.0037176944315433502,
"skip_count": 2.0,
"step": 3362,
"text_loss": 0.12147632241249084
@@ -31956,13 +31956,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037841796875,
+ "grad_norm": 0.046630859375,
"learning_rate": 0.0008162011710121339,
"loss": 0.0076,
"macro_f1": 0.6666666865348816,
"num_tokens": 5427897.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024164009373635054,
+ "routers_loss": 0.0020403533708304167,
"skip_count": 1.0,
"step": 3364,
"text_loss": 0.2656533420085907
@@ -31975,32 +31975,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0008159613465382066,
- "loss": 0.0071,
+ "loss": 0.007,
"macro_f1": 0.3333333432674408,
"num_tokens": 5430474.0,
"repeat_count": 0.0,
- "routers_loss": 0.002314126119017601,
+ "routers_loss": 0.0018634048756211996,
"skip_count": 0.0,
"step": 3366,
"text_loss": 0.9133086204528809
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 15.812444966245964,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.058837890625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0008157214009872951,
- "loss": 0.008,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 5433113.0,
"repeat_count": 0.0,
- "routers_loss": 0.014630996622145176,
+ "routers_loss": 0.012944488786160946,
"skip_count": 2.0,
"step": 3368,
"text_loss": 0.24352453649044037
@@ -32013,13 +32013,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.05712890625,
"learning_rate": 0.0008154813344513472,
- "loss": 0.0141,
+ "loss": 0.0143,
"macro_f1": 0.6666666865348816,
"num_tokens": 5436259.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023453824687749147,
+ "routers_loss": 0.002347963862121105,
"skip_count": 2.0,
"step": 3370,
"text_loss": 0.7601244449615479
@@ -32032,13 +32032,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0291748046875,
+ "grad_norm": 0.031494140625,
"learning_rate": 0.0008152411470223568,
- "loss": 0.0078,
+ "loss": 0.0077,
"macro_f1": 0.3333333432674408,
"num_tokens": 5439126.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015595925506204367,
+ "routers_loss": 0.0016609140438959002,
"skip_count": 0.0,
"step": 3372,
"text_loss": 0.5551947355270386
@@ -32051,13 +32051,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.04345703125,
"learning_rate": 0.0008150008387923643,
- "loss": 0.0067,
+ "loss": 0.0064,
"macro_f1": 0.3333333432674408,
"num_tokens": 5442739.0,
"repeat_count": 0.0,
- "routers_loss": 0.008187411352992058,
+ "routers_loss": 0.008321396075189114,
"skip_count": 0.0,
"step": 3374,
"text_loss": 0.25028282403945923
@@ -32070,13 +32070,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.08544921875,
"learning_rate": 0.000814760409853456,
- "loss": 0.0109,
+ "loss": 0.0105,
"macro_f1": 1.0,
"num_tokens": 5445247.0,
"repeat_count": 2.0,
- "routers_loss": 0.009705786593258381,
+ "routers_loss": 0.009738070890307426,
"skip_count": 1.0,
"step": 3376,
"text_loss": 0.37271201610565186
@@ -32089,13 +32089,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.042236328125,
"learning_rate": 0.0008145198602977651,
- "loss": 0.0084,
+ "loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 5449044.0,
"repeat_count": 0.0,
- "routers_loss": 0.003062802366912365,
+ "routers_loss": 0.0028421466704458,
"skip_count": 0.0,
"step": 3378,
"text_loss": 0.1458655595779419
@@ -32108,13 +32108,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0008142791902174701,
- "loss": 0.008,
+ "loss": 0.0081,
"macro_f1": 0.3333333432674408,
"num_tokens": 5453063.0,
"repeat_count": 0.0,
- "routers_loss": 0.001539172139018774,
+ "routers_loss": 0.0015170135302469134,
"skip_count": 0.0,
"step": 3380,
"text_loss": 0.5548722743988037
@@ -32127,13 +32127,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0008140383997047966,
- "loss": 0.0082,
+ "loss": 0.008,
"macro_f1": 0.6666666865348816,
"num_tokens": 5455814.0,
"repeat_count": 0.0,
- "routers_loss": 0.002227923832833767,
+ "routers_loss": 0.0022444510832428932,
"skip_count": 1.0,
"step": 3382,
"text_loss": 0.8034513592720032
@@ -32146,13 +32146,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.000813797488852016,
- "loss": 0.0063,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 5459392.0,
"repeat_count": 0.0,
- "routers_loss": 0.0003921810712199658,
+ "routers_loss": 0.00038578867679461837,
"skip_count": 0.0,
"step": 3384,
"text_loss": 0.6940088868141174
@@ -32165,13 +32165,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0517578125,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0008135564577514458,
- "loss": 0.0116,
+ "loss": 0.011,
"macro_f1": 0.3333333432674408,
"num_tokens": 5462413.0,
"repeat_count": 0.0,
- "routers_loss": 0.001971066929399967,
+ "routers_loss": 0.0019727381877601147,
"skip_count": 0.0,
"step": 3386,
"text_loss": 0.5124650597572327
@@ -32184,13 +32184,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0008133153064954495,
- "loss": 0.0108,
+ "loss": 0.0107,
"macro_f1": 0.3333333432674408,
"num_tokens": 5465552.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018206594977527857,
+ "routers_loss": 0.0019896167796105146,
"skip_count": 0.0,
"step": 3388,
"text_loss": 0.4292517900466919
@@ -32203,13 +32203,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0008130740351764367,
- "loss": 0.0068,
+ "loss": 0.007,
"macro_f1": 1.0,
"num_tokens": 5468573.0,
"repeat_count": 1.0,
- "routers_loss": 0.003323496552184224,
+ "routers_loss": 0.0030118159484118223,
"skip_count": 1.0,
"step": 3390,
"text_loss": 0.48903173208236694
@@ -32222,13 +32222,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.024658203125,
+ "grad_norm": 0.0216064453125,
"learning_rate": 0.000812832643886863,
- "loss": 0.0058,
+ "loss": 0.0057,
"macro_f1": 0.6666666865348816,
"num_tokens": 5471547.0,
"repeat_count": 0.0,
- "routers_loss": 0.006201856769621372,
+ "routers_loss": 0.005084246397018433,
"skip_count": 2.0,
"step": 3392,
"text_loss": 0.35789889097213745
@@ -32241,13 +32241,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.0390625,
"learning_rate": 0.0008125911327192299,
- "loss": 0.009,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 5474331.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009058464202098548,
+ "routers_loss": 0.0008874498889781535,
"skip_count": 0.0,
"step": 3394,
"text_loss": 0.6267408728599548
@@ -32260,13 +32260,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0008123495017660851,
- "loss": 0.0059,
+ "loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 5477633.0,
"repeat_count": 0.0,
- "routers_loss": 0.00202162005007267,
+ "routers_loss": 0.001794386887922883,
"skip_count": 0.0,
"step": 3396,
"text_loss": 0.3701885938644409
@@ -32279,13 +32279,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04296875,
+ "grad_norm": 0.042724609375,
"learning_rate": 0.0008121077511200221,
"loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 5481277.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022049983963370323,
+ "routers_loss": 0.002140481723472476,
"skip_count": 0.0,
"step": 3398,
"text_loss": 0.6362857818603516
@@ -32298,13 +32298,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05322265625,
+ "grad_norm": 0.0556640625,
"learning_rate": 0.00081186588087368,
- "loss": 0.0115,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 5484237.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008255304419435561,
+ "routers_loss": 0.000867189432028681,
"skip_count": 0.0,
"step": 3400,
"text_loss": 1.0847382545471191
@@ -32317,13 +32317,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.0296630859375,
"learning_rate": 0.0008116238911197442,
- "loss": 0.0067,
+ "loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 5487423.0,
"repeat_count": 0.0,
- "routers_loss": 0.0029532560147345066,
+ "routers_loss": 0.0029817656613886356,
"skip_count": 0.0,
"step": 3402,
"text_loss": 0.3813740313053131
@@ -32336,13 +32336,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.0008113817819509454,
"loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 5490155.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038054194301366806,
+ "routers_loss": 0.0035141287371516228,
"skip_count": 0.0,
"step": 3404,
"text_loss": 0.2113083451986313
@@ -32355,13 +32355,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042236328125,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0008111395534600603,
"loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 5493415.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034561967477202415,
+ "routers_loss": 0.003317659953609109,
"skip_count": 0.0,
"step": 3406,
"text_loss": 0.5869330167770386
@@ -32374,13 +32374,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.052001953125,
"learning_rate": 0.0008108972057399114,
- "loss": 0.0131,
+ "loss": 0.0123,
"macro_f1": 0.6666666865348816,
"num_tokens": 5496032.0,
"repeat_count": 0.0,
- "routers_loss": 0.0036799898371100426,
+ "routers_loss": 0.003833734430372715,
"skip_count": 2.0,
"step": 3408,
"text_loss": 0.2938928008079529
@@ -32393,13 +32393,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.11328125,
"learning_rate": 0.0008106547388833669,
- "loss": 0.006,
+ "loss": 0.0061,
"macro_f1": 0.6666666865348816,
"num_tokens": 5498890.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026391225401312113,
+ "routers_loss": 0.002622978063300252,
"skip_count": 1.0,
"step": 3410,
"text_loss": 0.3130980432033539
@@ -32412,13 +32412,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.03564453125,
"learning_rate": 0.0008104121529833402,
"loss": 0.0062,
"macro_f1": 0.6666666865348816,
"num_tokens": 5502010.0,
"repeat_count": 1.0,
- "routers_loss": 0.00991886481642723,
+ "routers_loss": 0.007447598036378622,
"skip_count": 0.0,
"step": 3412,
"text_loss": 0.4413072466850281
@@ -32431,13 +32431,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.000810169448132791,
- "loss": 0.0096,
+ "loss": 0.0093,
"macro_f1": 0.6666666865348816,
"num_tokens": 5505212.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031243201810866594,
+ "routers_loss": 0.0031087708193808794,
"skip_count": 1.0,
"step": 3414,
"text_loss": 0.2910428047180176
@@ -32450,13 +32450,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.04345703125,
"learning_rate": 0.0008099266244247243,
- "loss": 0.0083,
+ "loss": 0.0082,
"macro_f1": 0.3272727429866791,
"num_tokens": 5508755.0,
"repeat_count": 0.0,
- "routers_loss": 0.02572118304669857,
+ "routers_loss": 0.02510393038392067,
"skip_count": 1.0,
"step": 3416,
"text_loss": 0.33022749423980713
@@ -32469,13 +32469,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0306396484375,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0008096836819521903,
"loss": 0.0057,
"macro_f1": 0.6666666865348816,
"num_tokens": 5512034.0,
"repeat_count": 0.0,
- "routers_loss": 0.001839894917793572,
+ "routers_loss": 0.0020537273958325386,
"skip_count": 1.0,
"step": 3418,
"text_loss": 0.4731218218803406
@@ -32488,32 +32488,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0008094406208082853,
"loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 5515707.0,
"repeat_count": 0.0,
- "routers_loss": 0.0039922320283949375,
+ "routers_loss": 0.004218162503093481,
"skip_count": 2.0,
"step": 3420,
"text_loss": 0.23429590463638306
},
{
"acc_repeat": 1.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
"epoch": 16.065746991488112,
- "f1_execute": 1.0,
+ "f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
- "f1_skip": 1.0,
- "grad_norm": 0.0703125,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0008091974410861507,
- "loss": 0.0066,
- "macro_f1": 1.0,
+ "loss": 0.0069,
+ "macro_f1": 0.9265305995941162,
"num_tokens": 5518436.0,
"repeat_count": 1.0,
- "routers_loss": 0.012939191423356533,
+ "routers_loss": 0.013488355092704296,
"skip_count": 3.0,
"step": 3422,
"text_loss": 0.45768749713897705
@@ -32526,13 +32526,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0008089541428789733,
- "loss": 0.01,
+ "loss": 0.0097,
"macro_f1": 0.6666666865348816,
"num_tokens": 5522368.0,
"repeat_count": 0.0,
- "routers_loss": 0.001064157928340137,
+ "routers_loss": 0.0010335417464375496,
"skip_count": 1.0,
"step": 3424,
"text_loss": 0.43423423171043396
@@ -32545,13 +32545,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0299072265625,
+ "grad_norm": 0.0306396484375,
"learning_rate": 0.0008087107262799855,
- "loss": 0.0047,
+ "loss": 0.0046,
"macro_f1": 0.3333333432674408,
"num_tokens": 5526061.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024185231886804104,
+ "routers_loss": 0.002134323585778475,
"skip_count": 0.0,
"step": 3426,
"text_loss": 0.4031757414340973
@@ -32564,13 +32564,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.0008084671913824651,
"loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 5529284.0,
"repeat_count": 0.0,
- "routers_loss": 0.009645994752645493,
+ "routers_loss": 0.0097216060385108,
"skip_count": 2.0,
"step": 3428,
"text_loss": 0.2836039960384369
@@ -32583,13 +32583,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.022705078125,
+ "grad_norm": 0.0220947265625,
"learning_rate": 0.000808223538279735,
- "loss": 0.0051,
+ "loss": 0.0049,
"macro_f1": 0.3333333432674408,
"num_tokens": 5532159.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017972104251384735,
+ "routers_loss": 0.001684269867837429,
"skip_count": 0.0,
"step": 3430,
"text_loss": 0.5804527401924133
@@ -32602,13 +32602,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04248046875,
+ "grad_norm": 0.0390625,
"learning_rate": 0.0008079797670651637,
"loss": 0.008,
"macro_f1": 1.0,
"num_tokens": 5536050.0,
"repeat_count": 1.0,
- "routers_loss": 0.015138664282858372,
+ "routers_loss": 0.013918434269726276,
"skip_count": 1.0,
"step": 3432,
"text_loss": 0.31325826048851013
@@ -32621,13 +32621,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0008077358778321647,
- "loss": 0.0114,
+ "loss": 0.011,
"macro_f1": 0.3333333432674408,
"num_tokens": 5538885.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007666898309253156,
+ "routers_loss": 0.0007751787197776139,
"skip_count": 0.0,
"step": 3434,
"text_loss": 0.783108115196228
@@ -32640,13 +32640,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.033935546875,
"learning_rate": 0.0008074918706741966,
"loss": 0.0063,
"macro_f1": 0.9262410998344421,
"num_tokens": 5541909.0,
"repeat_count": 3.0,
- "routers_loss": 0.024132754653692245,
+ "routers_loss": 0.021819550544023514,
"skip_count": 2.0,
"step": 3436,
"text_loss": 0.6558083295822144
@@ -32659,13 +32659,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03173828125,
+ "grad_norm": 0.02880859375,
"learning_rate": 0.0008072477456847638,
- "loss": 0.0061,
+ "loss": 0.0057,
"macro_f1": 0.3272727429866791,
"num_tokens": 5545101.0,
"repeat_count": 1.0,
- "routers_loss": 0.03225114569067955,
+ "routers_loss": 0.03309348225593567,
"skip_count": 0.0,
"step": 3438,
"text_loss": 0.9877075552940369
@@ -32678,13 +32678,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.0008070035029574151,
- "loss": 0.0062,
+ "loss": 0.006,
"macro_f1": 1.0,
"num_tokens": 5548971.0,
"repeat_count": 1.0,
- "routers_loss": 0.008569693192839622,
+ "routers_loss": 0.008696741424500942,
"skip_count": 1.0,
"step": 3440,
"text_loss": 0.24766330420970917
@@ -32697,13 +32697,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.000806759142585745,
"loss": 0.0056,
"macro_f1": 0.6666666865348816,
"num_tokens": 5552174.0,
"repeat_count": 0.0,
- "routers_loss": 0.004438123665750027,
+ "routers_loss": 0.004240929149091244,
"skip_count": 3.0,
"step": 3442,
"text_loss": 0.37255001068115234
@@ -32716,13 +32716,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0615234375,
+ "grad_norm": 0.05322265625,
"learning_rate": 0.0008065146646633927,
- "loss": 0.0091,
+ "loss": 0.0088,
"macro_f1": 0.6666666865348816,
"num_tokens": 5555005.0,
"repeat_count": 0.0,
- "routers_loss": 0.013728363439440727,
+ "routers_loss": 0.014345484785735607,
"skip_count": 1.0,
"step": 3444,
"text_loss": 0.26157206296920776
@@ -32735,13 +32735,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.05810546875,
+ "grad_norm": 0.06005859375,
"learning_rate": 0.0008062700692840428,
"loss": 0.0083,
"macro_f1": 1.0,
"num_tokens": 5559127.0,
"repeat_count": 1.0,
- "routers_loss": 0.008383825421333313,
+ "routers_loss": 0.008315163664519787,
"skip_count": 2.0,
"step": 3446,
"text_loss": 0.21971040964126587
@@ -32754,13 +32754,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0008060253565414246,
"loss": 0.009,
"macro_f1": 0.5934640765190125,
"num_tokens": 5562254.0,
"repeat_count": 0.0,
- "routers_loss": 0.009948022663593292,
+ "routers_loss": 0.009582413360476494,
"skip_count": 3.0,
"step": 3448,
"text_loss": 0.6758295893669128
@@ -32773,13 +32773,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0361328125,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.0008057805265293124,
"loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 5565515.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025822422467172146,
+ "routers_loss": 0.002429503947496414,
"skip_count": 0.0,
"step": 3450,
"text_loss": 0.696592390537262
@@ -32792,13 +32792,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0008055355793415257,
- "loss": 0.0091,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 5568392.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008777108159847558,
+ "routers_loss": 0.0007724192109890282,
"skip_count": 0.0,
"step": 3452,
"text_loss": 0.7092870473861694
@@ -32811,13 +32811,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0008052905150719285,
- "loss": 0.01,
+ "loss": 0.0099,
"macro_f1": 0.3333333432674408,
"num_tokens": 5571090.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009592860005795956,
+ "routers_loss": 0.0010859938338398933,
"skip_count": 0.0,
"step": 3454,
"text_loss": 0.6593860387802124
@@ -32832,11 +32832,11 @@
"f1_skip": 1.0,
"grad_norm": 0.04150390625,
"learning_rate": 0.0008050453338144301,
- "loss": 0.0077,
+ "loss": 0.0072,
"macro_f1": 1.0,
"num_tokens": 5574552.0,
"repeat_count": 1.0,
- "routers_loss": 0.0029973683413118124,
+ "routers_loss": 0.0030258705373853445,
"skip_count": 1.0,
"step": 3456,
"text_loss": 0.3479384481906891
@@ -32849,13 +32849,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0008048000356629844,
- "loss": 0.0068,
+ "loss": 0.0066,
"macro_f1": 0.6666666865348816,
"num_tokens": 5577484.0,
"repeat_count": 0.0,
- "routers_loss": 0.005223365034908056,
+ "routers_loss": 0.005052885971963406,
"skip_count": 2.0,
"step": 3458,
"text_loss": 0.21858671307563782
@@ -32868,13 +32868,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.029541015625,
"learning_rate": 0.0008045546207115901,
- "loss": 0.0074,
+ "loss": 0.0068,
"macro_f1": 1.0,
"num_tokens": 5581605.0,
"repeat_count": 1.0,
- "routers_loss": 0.010660176165401936,
+ "routers_loss": 0.009976249188184738,
"skip_count": 3.0,
"step": 3460,
"text_loss": 0.16868001222610474
@@ -32887,13 +32887,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.0008043090890542904,
- "loss": 0.008,
+ "loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 5584994.0,
"repeat_count": 0.0,
- "routers_loss": 0.003038279013708234,
+ "routers_loss": 0.00270817126147449,
"skip_count": 0.0,
"step": 3462,
"text_loss": 0.785690426826477
@@ -32906,13 +32906,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03125,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0008040634407851739,
- "loss": 0.0057,
+ "loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 5588067.0,
"repeat_count": 0.0,
- "routers_loss": 0.001855011098086834,
+ "routers_loss": 0.0018436965765431523,
"skip_count": 0.0,
"step": 3464,
"text_loss": 0.5006644129753113
@@ -32925,13 +32925,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.028076171875,
"learning_rate": 0.0008038176759983731,
- "loss": 0.0064,
+ "loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 5590789.0,
"repeat_count": 0.0,
- "routers_loss": 0.008276397362351418,
+ "routers_loss": 0.008516279980540276,
"skip_count": 2.0,
"step": 3466,
"text_loss": 0.20963478088378906
@@ -32944,13 +32944,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04052734375,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0008035717947880659,
- "loss": 0.0092,
+ "loss": 0.0091,
"macro_f1": 0.3333333432674408,
"num_tokens": 5593472.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016371201490983367,
+ "routers_loss": 0.0016293043736368418,
"skip_count": 0.0,
"step": 3468,
"text_loss": 0.7376078963279724
@@ -32963,13 +32963,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.0008033257972484742,
- "loss": 0.0081,
+ "loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 5596108.0,
"repeat_count": 0.0,
- "routers_loss": 0.002605364890769124,
+ "routers_loss": 0.002364142332226038,
"skip_count": 0.0,
"step": 3470,
"text_loss": 0.5156455039978027
@@ -32982,13 +32982,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0008030796834738649,
- "loss": 0.0083,
+ "loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 5599103.0,
"repeat_count": 0.0,
- "routers_loss": 0.00892016664147377,
+ "routers_loss": 0.008872323669493198,
"skip_count": 0.0,
"step": 3472,
"text_loss": 0.2996419668197632
@@ -33001,13 +33001,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037841796875,
+ "grad_norm": 0.043701171875,
"learning_rate": 0.0008028334535585491,
- "loss": 0.0089,
+ "loss": 0.0087,
"macro_f1": 0.6666666865348816,
"num_tokens": 5602410.0,
"repeat_count": 0.0,
- "routers_loss": 0.01095602847635746,
+ "routers_loss": 0.011508257128298283,
"skip_count": 3.0,
"step": 3474,
"text_loss": 0.25438693165779114
@@ -33020,13 +33020,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.038330078125,
"learning_rate": 0.0008025871075968827,
- "loss": 0.0105,
+ "loss": 0.0106,
"macro_f1": 1.0,
"num_tokens": 5605424.0,
"repeat_count": 2.0,
- "routers_loss": 0.016052749007940292,
+ "routers_loss": 0.017225435003638268,
"skip_count": 2.0,
"step": 3476,
"text_loss": 0.2549574077129364
@@ -33039,13 +33039,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
- "grad_norm": 0.02880859375,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0008023406456832657,
- "loss": 0.0116,
+ "loss": 0.0111,
"macro_f1": 0.9262410998344421,
"num_tokens": 5608266.0,
"repeat_count": 3.0,
- "routers_loss": 0.04047509655356407,
+ "routers_loss": 0.039165645837783813,
"skip_count": 2.0,
"step": 3478,
"text_loss": 0.1797947734594345
@@ -33058,13 +33058,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0272216796875,
+ "grad_norm": 0.026123046875,
"learning_rate": 0.0008020940679121429,
- "loss": 0.0073,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 5611471.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010115962941199541,
+ "routers_loss": 0.0009718866203911602,
"skip_count": 0.0,
"step": 3480,
"text_loss": 0.8267702460289001
@@ -33077,13 +33077,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0008018473743780036,
- "loss": 0.0095,
+ "loss": 0.0093,
"macro_f1": 0.6666666865348816,
"num_tokens": 5615046.0,
"repeat_count": 0.0,
- "routers_loss": 0.006490753497928381,
+ "routers_loss": 0.006087122485041618,
"skip_count": 2.0,
"step": 3482,
"text_loss": 0.7267677187919617
@@ -33096,13 +33096,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.000801600565175381,
- "loss": 0.0088,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 5618350.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008378152851946652,
+ "routers_loss": 0.0007539413054473698,
"skip_count": 0.0,
"step": 3484,
"text_loss": 0.5910211801528931
@@ -33115,13 +33115,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048583984375,
+ "grad_norm": 0.046142578125,
"learning_rate": 0.0008013536403988529,
- "loss": 0.0087,
+ "loss": 0.0085,
"macro_f1": 0.3333333432674408,
"num_tokens": 5621381.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007683819276280701,
+ "routers_loss": 0.0008076327503658831,
"skip_count": 0.0,
"step": 3486,
"text_loss": 0.30616798996925354
@@ -33134,13 +33134,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.047607421875,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0008011066001430412,
"loss": 0.0086,
"macro_f1": 0.6122449040412903,
"num_tokens": 5624617.0,
"repeat_count": 0.0,
- "routers_loss": 0.02481125481426716,
+ "routers_loss": 0.023835813626646996,
"skip_count": 4.0,
"step": 3488,
"text_loss": 0.3376443088054657
@@ -33153,13 +33153,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0311279296875,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0008008594445026122,
- "loss": 0.0082,
+ "loss": 0.0083,
"macro_f1": 0.6666666865348816,
"num_tokens": 5627989.0,
"repeat_count": 0.0,
- "routers_loss": 0.005174005404114723,
+ "routers_loss": 0.004226419143378735,
"skip_count": 2.0,
"step": 3490,
"text_loss": 0.8185343146324158
@@ -33172,13 +33172,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0008006121735722767,
"loss": 0.0084,
"macro_f1": 0.32098764181137085,
"num_tokens": 5632286.0,
"repeat_count": 0.0,
- "routers_loss": 0.03602224588394165,
+ "routers_loss": 0.0366671048104763,
"skip_count": 2.0,
"step": 3492,
"text_loss": 0.2209547609090805
@@ -33191,13 +33191,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.03466796875,
"learning_rate": 0.0008003647874467892,
- "loss": 0.0087,
+ "loss": 0.0084,
"macro_f1": 0.6666666865348816,
"num_tokens": 5635368.0,
"repeat_count": 1.0,
- "routers_loss": 0.012145630083978176,
+ "routers_loss": 0.012956378981471062,
"skip_count": 0.0,
"step": 3494,
"text_loss": 0.20468664169311523
@@ -33210,13 +33210,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.057861328125,
+ "grad_norm": 0.059814453125,
"learning_rate": 0.0008001172862209485,
"loss": 0.0103,
"macro_f1": 0.6666666865348816,
"num_tokens": 5638440.0,
"repeat_count": 1.0,
- "routers_loss": 0.001456267898902297,
+ "routers_loss": 0.0017375422175973654,
"skip_count": 0.0,
"step": 3496,
"text_loss": 0.6647221446037292
@@ -33229,13 +33229,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0247802734375,
+ "grad_norm": 0.0244140625,
"learning_rate": 0.0007998696699895976,
- "loss": 0.0093,
+ "loss": 0.0091,
"macro_f1": 0.6592592597007751,
"num_tokens": 5641996.0,
"repeat_count": 1.0,
- "routers_loss": 0.028984347358345985,
+ "routers_loss": 0.025240756571292877,
"skip_count": 5.0,
"step": 3498,
"text_loss": 0.23892143368721008
@@ -33248,13 +33248,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02294921875,
+ "grad_norm": 0.021728515625,
"learning_rate": 0.0007996219388476236,
- "loss": 0.0077,
+ "loss": 0.0075,
"macro_f1": 0.6666666865348816,
"num_tokens": 5645071.0,
"repeat_count": 0.0,
- "routers_loss": 0.006859986111521721,
+ "routers_loss": 0.007436830550432205,
"skip_count": 1.0,
"step": 3500,
"text_loss": 0.7580804228782654
@@ -33267,13 +33267,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.024169921875,
+ "grad_norm": 0.0242919921875,
"learning_rate": 0.0007993740928899571,
- "loss": 0.0055,
+ "loss": 0.0054,
"macro_f1": 0.3333333432674408,
"num_tokens": 5648175.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011989293852820992,
+ "routers_loss": 0.001126602990552783,
"skip_count": 0.0,
"step": 3502,
"text_loss": 0.5281378626823425
@@ -33286,13 +33286,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031982421875,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0007991261322115737,
- "loss": 0.0056,
+ "loss": 0.0055,
"macro_f1": 0.3333333432674408,
"num_tokens": 5650973.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007974735926836729,
+ "routers_loss": 0.0007907263352535665,
"skip_count": 0.0,
"step": 3504,
"text_loss": 0.25220927596092224
@@ -33305,13 +33305,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.0262451171875,
"learning_rate": 0.000798878056907492,
- "loss": 0.0049,
+ "loss": 0.0048,
"macro_f1": 1.0,
"num_tokens": 5654252.0,
"repeat_count": 2.0,
- "routers_loss": 0.007121780421584845,
+ "routers_loss": 0.006263538729399443,
"skip_count": 2.0,
"step": 3506,
"text_loss": 0.46569153666496277
@@ -33324,13 +33324,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0007986298670727752,
- "loss": 0.0101,
+ "loss": 0.0098,
"macro_f1": 0.6666666865348816,
"num_tokens": 5657229.0,
"repeat_count": 0.0,
- "routers_loss": 0.00414140522480011,
+ "routers_loss": 0.004049144219607115,
"skip_count": 3.0,
"step": 3508,
"text_loss": 0.15174436569213867
@@ -33343,13 +33343,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.058837890625,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0007983815628025301,
- "loss": 0.0073,
+ "loss": 0.0074,
"macro_f1": 0.9262410998344421,
"num_tokens": 5659974.0,
"repeat_count": 2.0,
- "routers_loss": 0.04618353769183159,
+ "routers_loss": 0.0471976138651371,
"skip_count": 3.0,
"step": 3510,
"text_loss": 0.39072203636169434
@@ -33362,13 +33362,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.000798133144191907,
- "loss": 0.0084,
+ "loss": 0.0082,
"macro_f1": 0.3272727429866791,
"num_tokens": 5662893.0,
"repeat_count": 0.0,
- "routers_loss": 0.04054548963904381,
+ "routers_loss": 0.04030488431453705,
"skip_count": 1.0,
"step": 3512,
"text_loss": 0.3562147617340088
@@ -33381,13 +33381,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.0007978846113361009,
- "loss": 0.0067,
+ "loss": 0.0069,
"macro_f1": 0.6666666865348816,
"num_tokens": 5666476.0,
"repeat_count": 0.0,
- "routers_loss": 0.007785080466419458,
+ "routers_loss": 0.007475079502910376,
"skip_count": 1.0,
"step": 3514,
"text_loss": 0.26518192887306213
@@ -33400,13 +33400,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0400390625,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.0007976359643303497,
- "loss": 0.0128,
+ "loss": 0.013,
"macro_f1": 0.6666666865348816,
"num_tokens": 5669647.0,
"repeat_count": 0.0,
- "routers_loss": 0.0057366108521819115,
+ "routers_loss": 0.00558585487306118,
"skip_count": 2.0,
"step": 3516,
"text_loss": 0.29284560680389404
@@ -33419,13 +33419,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0458984375,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0007973872032699354,
- "loss": 0.0088,
+ "loss": 0.0082,
"macro_f1": 1.0,
"num_tokens": 5673491.0,
"repeat_count": 1.0,
- "routers_loss": 0.002753519220277667,
+ "routers_loss": 0.0026981087867170572,
"skip_count": 1.0,
"step": 3518,
"text_loss": 0.35089045763015747
@@ -33438,32 +33438,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.033203125,
"learning_rate": 0.000797138328250184,
"loss": 0.0058,
"macro_f1": 0.6666666865348816,
"num_tokens": 5676529.0,
"repeat_count": 1.0,
- "routers_loss": 0.0027982397004961967,
+ "routers_loss": 0.0027328627184033394,
"skip_count": 0.0,
"step": 3520,
"text_loss": 0.41077399253845215
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.800000011920929,
- "avg_layers": 24.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
"epoch": 16.535368359260346,
- "f1_execute": 0.95652174949646,
+ "f1_execute": 0.9777777791023254,
"f1_repeat": 0.0,
- "f1_skip": 0.888888955116272,
- "grad_norm": 0.055419921875,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0007968893393664646,
- "loss": 0.0105,
- "macro_f1": 0.6151369214057922,
+ "loss": 0.01,
+ "macro_f1": 0.6592592597007751,
"num_tokens": 5679987.0,
"repeat_count": 1.0,
- "routers_loss": 0.03294458985328674,
+ "routers_loss": 0.02695014327764511,
"skip_count": 5.0,
"step": 3522,
"text_loss": 0.44942837953567505
@@ -33476,13 +33476,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0007966402367141903,
- "loss": 0.0073,
+ "loss": 0.0072,
"macro_f1": 0.6666666865348816,
"num_tokens": 5683185.0,
"repeat_count": 0.0,
- "routers_loss": 0.007946476340293884,
+ "routers_loss": 0.00817026849836111,
"skip_count": 2.0,
"step": 3524,
"text_loss": 0.14528048038482666
@@ -33495,13 +33495,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.021240234375,
+ "grad_norm": 0.0216064453125,
"learning_rate": 0.0007963910203888176,
- "loss": 0.0043,
+ "loss": 0.0042,
"macro_f1": 0.3333333432674408,
"num_tokens": 5686544.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021326798014342785,
+ "routers_loss": 0.0021973433904349804,
"skip_count": 0.0,
"step": 3526,
"text_loss": 0.22358648478984833
@@ -33514,13 +33514,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0556640625,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0007961416904858469,
- "loss": 0.0079,
+ "loss": 0.0078,
"macro_f1": 0.3272727429866791,
"num_tokens": 5689579.0,
"repeat_count": 0.0,
- "routers_loss": 0.03373958170413971,
+ "routers_loss": 0.033712416887283325,
"skip_count": 1.0,
"step": 3528,
"text_loss": 0.3083649277687073
@@ -33533,13 +33533,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033203125,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0007958922471008217,
- "loss": 0.007,
+ "loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 5692869.0,
"repeat_count": 0.0,
- "routers_loss": 0.010963297449052334,
+ "routers_loss": 0.011182719841599464,
"skip_count": 2.0,
"step": 3530,
"text_loss": 0.21288011968135834
@@ -33552,13 +33552,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0286865234375,
+ "grad_norm": 0.0267333984375,
"learning_rate": 0.0007956426903293292,
"loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 5696007.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014243065379559994,
+ "routers_loss": 0.0015808293828740716,
"skip_count": 0.0,
"step": 3532,
"text_loss": 0.6068631410598755
@@ -33571,13 +33571,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.059326171875,
+ "grad_norm": 0.052734375,
"learning_rate": 0.0007953930202670001,
- "loss": 0.0066,
+ "loss": 0.0062,
"macro_f1": 0.5492662787437439,
"num_tokens": 5699474.0,
"repeat_count": 2.0,
- "routers_loss": 0.038375116884708405,
+ "routers_loss": 0.03205178305506706,
"skip_count": 0.0,
"step": 3534,
"text_loss": 0.4317135512828827
@@ -33590,13 +33590,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.062255859375,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0007951432370095084,
"loss": 0.0105,
"macro_f1": 0.3333333432674408,
"num_tokens": 5703483.0,
"repeat_count": 0.0,
- "routers_loss": 0.0041501945815980434,
+ "routers_loss": 0.003518853336572647,
"skip_count": 0.0,
"step": 3536,
"text_loss": 0.5432273149490356
@@ -33609,13 +33609,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.08349609375,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0007948933406525715,
"loss": 0.01,
"macro_f1": 1.0,
"num_tokens": 5707301.0,
"repeat_count": 1.0,
- "routers_loss": 0.00536845438182354,
+ "routers_loss": 0.004982157610356808,
"skip_count": 1.0,
"step": 3538,
"text_loss": 0.40061065554618835
@@ -33628,13 +33628,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0007946433312919502,
- "loss": 0.0076,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 5710847.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030090278014540672,
+ "routers_loss": 0.003067734418436885,
"skip_count": 0.0,
"step": 3540,
"text_loss": 0.5396234393119812
@@ -33647,13 +33647,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.055419921875,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0007943932090234486,
- "loss": 0.0098,
+ "loss": 0.0097,
"macro_f1": 0.5492662787437439,
"num_tokens": 5713683.0,
"repeat_count": 0.0,
- "routers_loss": 0.03756432980298996,
+ "routers_loss": 0.03728383034467697,
"skip_count": 2.0,
"step": 3542,
"text_loss": 0.18310914933681488
@@ -33666,13 +33666,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.027587890625,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0007941429739429138,
- "loss": 0.0037,
+ "loss": 0.0036,
"macro_f1": 0.6666666865348816,
"num_tokens": 5716397.0,
"repeat_count": 0.0,
- "routers_loss": 0.002606320893391967,
+ "routers_loss": 0.0025092530995607376,
"skip_count": 3.0,
"step": 3544,
"text_loss": 0.5806207060813904
@@ -33685,13 +33685,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0361328125,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.0007938926261462366,
- "loss": 0.007,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 5719984.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025650030001997948,
+ "routers_loss": 0.002493767999112606,
"skip_count": 0.0,
"step": 3546,
"text_loss": 0.38606807589530945
@@ -33704,13 +33704,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.05078125,
"learning_rate": 0.0007936421657293507,
"loss": 0.0094,
"macro_f1": 0.8823530077934265,
"num_tokens": 5723571.0,
"repeat_count": 1.0,
- "routers_loss": 0.013521218672394753,
+ "routers_loss": 0.014810923486948013,
"skip_count": 2.0,
"step": 3548,
"text_loss": 0.49558472633361816
@@ -33723,13 +33723,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0240478515625,
+ "grad_norm": 0.0284423828125,
"learning_rate": 0.0007933915927882327,
- "loss": 0.0071,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 5726405.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014581449795514345,
+ "routers_loss": 0.00152928801253438,
"skip_count": 0.0,
"step": 3550,
"text_loss": 0.8674797415733337
@@ -33742,13 +33742,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.0390625,
"learning_rate": 0.000793140907418903,
- "loss": 0.0077,
+ "loss": 0.0075,
"macro_f1": 0.6666666865348816,
"num_tokens": 5729955.0,
"repeat_count": 0.0,
- "routers_loss": 0.005775467026978731,
+ "routers_loss": 0.005522782914340496,
"skip_count": 2.0,
"step": 3552,
"text_loss": 0.3274473249912262
@@ -33761,13 +33761,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.0007928901097174248,
- "loss": 0.0083,
+ "loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 5733030.0,
"repeat_count": 0.0,
- "routers_loss": 0.008668854832649231,
+ "routers_loss": 0.009207013063132763,
"skip_count": 2.0,
"step": 3554,
"text_loss": 0.18237128853797913
@@ -33780,13 +33780,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056884765625,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0007926391997799039,
- "loss": 0.0068,
+ "loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 5735978.0,
"repeat_count": 0.0,
- "routers_loss": 0.007210119627416134,
+ "routers_loss": 0.00695531303063035,
"skip_count": 0.0,
"step": 3556,
"text_loss": 0.3266434967517853
@@ -33799,13 +33799,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048583984375,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0007923881777024898,
- "loss": 0.0065,
+ "loss": 0.0061,
"macro_f1": 0.6666666865348816,
"num_tokens": 5738901.0,
"repeat_count": 0.0,
- "routers_loss": 0.00165808224119246,
+ "routers_loss": 0.002743212040513754,
"skip_count": 1.0,
"step": 3558,
"text_loss": 0.4971913695335388
@@ -33818,13 +33818,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.049560546875,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.0007921370435813741,
- "loss": 0.0081,
+ "loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 5741946.0,
"repeat_count": 1.0,
- "routers_loss": 0.007618873380124569,
+ "routers_loss": 0.007037297356873751,
"skip_count": 0.0,
"step": 3560,
"text_loss": 0.5645473599433899
@@ -33837,13 +33837,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047607421875,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0007918857975127924,
"loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 5744987.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031584161333739758,
+ "routers_loss": 0.0030746585689485073,
"skip_count": 0.0,
"step": 3562,
"text_loss": 0.17717665433883667
@@ -33856,13 +33856,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0537109375,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0007916344395930224,
- "loss": 0.0079,
+ "loss": 0.0077,
"macro_f1": 0.3333333432674408,
"num_tokens": 5747837.0,
"repeat_count": 0.0,
- "routers_loss": 0.005207436624914408,
+ "routers_loss": 0.004522138275206089,
"skip_count": 0.0,
"step": 3564,
"text_loss": 0.7676118612289429
@@ -33875,13 +33875,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.000791382969918385,
- "loss": 0.0074,
+ "loss": 0.0075,
"macro_f1": 0.3333333432674408,
"num_tokens": 5750716.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023729163222014904,
+ "routers_loss": 0.0026240211445838213,
"skip_count": 0.0,
"step": 3566,
"text_loss": 0.4975173771381378
@@ -33894,13 +33894,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.000791131388585244,
- "loss": 0.0115,
+ "loss": 0.011,
"macro_f1": 0.8820862174034119,
"num_tokens": 5754368.0,
"repeat_count": 2.0,
- "routers_loss": 0.021537931635975838,
+ "routers_loss": 0.021831991150975227,
"skip_count": 2.0,
"step": 3568,
"text_loss": 0.9670342206954956
@@ -33913,13 +33913,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0007908796956900055,
"loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 5757076.0,
"repeat_count": 1.0,
- "routers_loss": 0.001752255018800497,
+ "routers_loss": 0.0017586691537871957,
"skip_count": 0.0,
"step": 3570,
"text_loss": 0.3057977259159088
@@ -33932,13 +33932,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.043701171875,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.000790627891329119,
- "loss": 0.006,
+ "loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 5760613.0,
"repeat_count": 0.0,
- "routers_loss": 0.00557586969807744,
+ "routers_loss": 0.005515786819159985,
"skip_count": 0.0,
"step": 3572,
"text_loss": 0.5860086679458618
@@ -33951,13 +33951,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.04296875,
"learning_rate": 0.0007903759755990763,
"loss": 0.0061,
"macro_f1": 0.3333333432674408,
"num_tokens": 5763557.0,
"repeat_count": 0.0,
- "routers_loss": 0.004236271139234304,
+ "routers_loss": 0.004096484277397394,
"skip_count": 0.0,
"step": 3574,
"text_loss": 0.17175781726837158
@@ -33970,13 +33970,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.000790123948596412,
"loss": 0.0119,
"macro_f1": 0.6666666865348816,
"num_tokens": 5767430.0,
"repeat_count": 1.0,
- "routers_loss": 0.003505093976855278,
+ "routers_loss": 0.005216122139245272,
"skip_count": 0.0,
"step": 3576,
"text_loss": 0.7520374059677124
@@ -33989,13 +33989,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0007898718104177031,
- "loss": 0.011,
+ "loss": 0.0108,
"macro_f1": 0.3333333432674408,
"num_tokens": 5770175.0,
"repeat_count": 0.0,
- "routers_loss": 0.0039036881644278765,
+ "routers_loss": 0.0037980107590556145,
"skip_count": 0.0,
"step": 3578,
"text_loss": 0.18117885291576385
@@ -34008,13 +34008,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.0007896195611595699,
"loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 5773032.0,
"repeat_count": 0.0,
- "routers_loss": 0.00450134975835681,
+ "routers_loss": 0.003672175807878375,
"skip_count": 2.0,
"step": 3580,
"text_loss": 0.7241058349609375
@@ -34027,13 +34027,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0007893672009186744,
- "loss": 0.0082,
+ "loss": 0.0083,
"macro_f1": 1.0,
"num_tokens": 5776077.0,
"repeat_count": 1.0,
- "routers_loss": 0.01287894882261753,
+ "routers_loss": 0.01229850109666586,
"skip_count": 3.0,
"step": 3582,
"text_loss": 0.29140418767929077
@@ -34046,13 +34046,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0284423828125,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0007891147297917216,
"loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 5779088.0,
"repeat_count": 1.0,
- "routers_loss": 0.003500303253531456,
+ "routers_loss": 0.0035251814406365156,
"skip_count": 0.0,
"step": 3584,
"text_loss": 0.1727485954761505
@@ -34065,13 +34065,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05615234375,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.000788862147875459,
- "loss": 0.0093,
+ "loss": 0.0094,
"macro_f1": 0.6666666865348816,
"num_tokens": 5782201.0,
"repeat_count": 0.0,
- "routers_loss": 0.0042770374566316605,
+ "routers_loss": 0.004725661128759384,
"skip_count": 2.0,
"step": 3586,
"text_loss": 0.43512848019599915
@@ -34084,13 +34084,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.057861328125,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0007886094552666765,
- "loss": 0.0107,
+ "loss": 0.0106,
"macro_f1": 0.3333333432674408,
"num_tokens": 5785039.0,
"repeat_count": 0.0,
- "routers_loss": 0.005349197890609503,
+ "routers_loss": 0.005632172804325819,
"skip_count": 0.0,
"step": 3588,
"text_loss": 0.3534786105155945
@@ -34103,13 +34103,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0654296875,
+ "grad_norm": 0.0556640625,
"learning_rate": 0.0007883566520622062,
- "loss": 0.0114,
+ "loss": 0.0109,
"macro_f1": 0.6666666865348816,
"num_tokens": 5788017.0,
"repeat_count": 0.0,
- "routers_loss": 0.008142824284732342,
+ "routers_loss": 0.006249965168535709,
"skip_count": 1.0,
"step": 3590,
"text_loss": 0.2089710384607315
@@ -34122,13 +34122,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0302734375,
+ "grad_norm": 0.02978515625,
"learning_rate": 0.0007881037383589229,
- "loss": 0.0071,
+ "loss": 0.0073,
"macro_f1": 0.3333333432674408,
"num_tokens": 5791168.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013415004359558225,
+ "routers_loss": 0.0013797614956274629,
"skip_count": 0.0,
"step": 3592,
"text_loss": 0.4349329471588135
@@ -34141,13 +34141,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0007878507142537436,
- "loss": 0.0089,
+ "loss": 0.0091,
"macro_f1": 0.6666666865348816,
"num_tokens": 5793927.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022349755745381117,
+ "routers_loss": 0.0019719740375876427,
"skip_count": 1.0,
"step": 3594,
"text_loss": 0.6087368726730347
@@ -34160,13 +34160,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.0007875975798436274,
- "loss": 0.0058,
+ "loss": 0.0059,
"macro_f1": 0.6666666865348816,
"num_tokens": 5797214.0,
"repeat_count": 1.0,
- "routers_loss": 0.0037436108104884624,
+ "routers_loss": 0.0037070370744913816,
"skip_count": 0.0,
"step": 3596,
"text_loss": 0.4258122444152832
@@ -34179,13 +34179,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0458984375,
+ "grad_norm": 0.048583984375,
"learning_rate": 0.0007873443352255764,
- "loss": 0.009,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 5800691.0,
"repeat_count": 0.0,
- "routers_loss": 0.008491694927215576,
+ "routers_loss": 0.008431311696767807,
"skip_count": 0.0,
"step": 3598,
"text_loss": 0.6006711721420288
@@ -34198,13 +34198,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052490234375,
+ "grad_norm": 0.055419921875,
"learning_rate": 0.0007870909804966337,
- "loss": 0.0075,
+ "loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 5804712.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020895113702863455,
+ "routers_loss": 0.0017720256000757217,
"skip_count": 0.0,
"step": 3600,
"text_loss": 0.6055042743682861
@@ -34217,13 +34217,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.053955078125,
+ "grad_norm": 0.0517578125,
"learning_rate": 0.0007868375157538861,
- "loss": 0.0086,
+ "loss": 0.0083,
"macro_f1": 0.3272727429866791,
"num_tokens": 5807670.0,
"repeat_count": 1.0,
- "routers_loss": 0.01193003449589014,
+ "routers_loss": 0.010697763413190842,
"skip_count": 0.0,
"step": 3602,
"text_loss": 0.8039056658744812
@@ -34236,13 +34236,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.04150390625,
"learning_rate": 0.0007865839410944611,
- "loss": 0.008,
+ "loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 5810880.0,
"repeat_count": 1.0,
- "routers_loss": 0.003107197815552354,
+ "routers_loss": 0.0030022128485143185,
"skip_count": 0.0,
"step": 3604,
"text_loss": 0.596110463142395
@@ -34255,13 +34255,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0007863302566155295,
- "loss": 0.0098,
+ "loss": 0.0093,
"macro_f1": 0.6666666865348816,
"num_tokens": 5814171.0,
"repeat_count": 0.0,
- "routers_loss": 0.0075443098321557045,
+ "routers_loss": 0.006257854867726564,
"skip_count": 2.0,
"step": 3606,
"text_loss": 0.5700319409370422
@@ -34274,13 +34274,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.0294189453125,
"learning_rate": 0.0007860764624143031,
- "loss": 0.0053,
+ "loss": 0.0054,
"macro_f1": 0.6666666865348816,
"num_tokens": 5817607.0,
"repeat_count": 1.0,
- "routers_loss": 0.005313992965966463,
+ "routers_loss": 0.004838473163545132,
"skip_count": 0.0,
"step": 3608,
"text_loss": 0.8319530487060547
@@ -34293,13 +34293,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.08154296875,
"learning_rate": 0.0007858225585880369,
- "loss": 0.0069,
+ "loss": 0.0067,
"macro_f1": 0.8823530077934265,
"num_tokens": 5821452.0,
"repeat_count": 1.0,
- "routers_loss": 0.020901991054415703,
+ "routers_loss": 0.02173662930727005,
"skip_count": 2.0,
"step": 3610,
"text_loss": 0.3738477826118469
@@ -34312,13 +34312,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0007855685452340269,
- "loss": 0.0078,
+ "loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 5824683.0,
"repeat_count": 0.0,
- "routers_loss": 0.002484811469912529,
+ "routers_loss": 0.0032719180453568697,
"skip_count": 0.0,
"step": 3612,
"text_loss": 0.4054839015007019
@@ -34331,13 +34331,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0007853144224496118,
- "loss": 0.0094,
+ "loss": 0.0093,
"macro_f1": 0.3272727429866791,
"num_tokens": 5827860.0,
"repeat_count": 1.0,
- "routers_loss": 0.032128892838954926,
+ "routers_loss": 0.032171256840229034,
"skip_count": 0.0,
"step": 3614,
"text_loss": 0.18112395703792572
@@ -34350,13 +34350,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05322265625,
+ "grad_norm": 0.0458984375,
"learning_rate": 0.0007850601903321716,
- "loss": 0.0062,
+ "loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 5831651.0,
"repeat_count": 0.0,
- "routers_loss": 0.0136244622990489,
+ "routers_loss": 0.013230946846306324,
"skip_count": 1.0,
"step": 3616,
"text_loss": 0.2698844075202942
@@ -34369,13 +34369,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.000784805848979129,
- "loss": 0.0057,
+ "loss": 0.0058,
"macro_f1": 0.3333333432674408,
"num_tokens": 5834369.0,
"repeat_count": 0.0,
- "routers_loss": 0.001705345930531621,
+ "routers_loss": 0.00162619655020535,
"skip_count": 0.0,
"step": 3618,
"text_loss": 0.2430931180715561
@@ -34388,13 +34388,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0546875,
+ "grad_norm": 0.0498046875,
"learning_rate": 0.0007845513984879477,
- "loss": 0.0066,
+ "loss": 0.0069,
"macro_f1": 0.6666666865348816,
"num_tokens": 5838102.0,
"repeat_count": 1.0,
- "routers_loss": 0.002594438148662448,
+ "routers_loss": 0.002781603019684553,
"skip_count": 0.0,
"step": 3620,
"text_loss": 0.4968300759792328
@@ -34407,13 +34407,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.031005859375,
"learning_rate": 0.0007842968389561337,
- "loss": 0.0049,
+ "loss": 0.0048,
"macro_f1": 0.3333333432674408,
"num_tokens": 5841029.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019142795354127884,
+ "routers_loss": 0.0023873315658420324,
"skip_count": 0.0,
"step": 3622,
"text_loss": 0.5842974781990051
@@ -34426,13 +34426,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.03955078125,
"learning_rate": 0.0007840421704812346,
- "loss": 0.0093,
+ "loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 5845158.0,
"repeat_count": 0.0,
- "routers_loss": 0.004223407246172428,
+ "routers_loss": 0.00400173757225275,
"skip_count": 1.0,
"step": 3624,
"text_loss": 0.8312450647354126
@@ -34445,13 +34445,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03466796875,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.00078378739316084,
- "loss": 0.0092,
+ "loss": 0.0094,
"macro_f1": 0.3333333432674408,
"num_tokens": 5849175.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005486982990987599,
+ "routers_loss": 0.0004974664188921452,
"skip_count": 0.0,
"step": 3626,
"text_loss": 0.48637253046035767
@@ -34464,13 +34464,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 1.0,
"f1_skip": 0.888888955116272,
- "grad_norm": 0.0654296875,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.000783532507092581,
- "loss": 0.0077,
+ "loss": 0.0079,
"macro_f1": 0.9555556178092957,
"num_tokens": 5852020.0,
"repeat_count": 1.0,
- "routers_loss": 0.025490080937743187,
+ "routers_loss": 0.02555239573121071,
"skip_count": 5.0,
"step": 3628,
"text_loss": 0.5407033562660217
@@ -34483,13 +34483,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.041259765625,
"learning_rate": 0.0007832775123741306,
- "loss": 0.0104,
+ "loss": 0.0106,
"macro_f1": 0.3333333432674408,
"num_tokens": 5854873.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026199028361588717,
+ "routers_loss": 0.0025962977670133114,
"skip_count": 0.0,
"step": 3630,
"text_loss": 0.618230938911438
@@ -34502,13 +34502,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0225830078125,
+ "grad_norm": 0.0234375,
"learning_rate": 0.000783022409103203,
"loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 5858086.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028729604091495275,
+ "routers_loss": 0.0029271875973790884,
"skip_count": 0.0,
"step": 3632,
"text_loss": 0.21259798109531403
@@ -34521,13 +34521,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05419921875,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0007827671973775542,
- "loss": 0.0069,
+ "loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 5860886.0,
"repeat_count": 0.0,
- "routers_loss": 0.004097428172826767,
+ "routers_loss": 0.004102068953216076,
"skip_count": 0.0,
"step": 3634,
"text_loss": 0.4991208016872406
@@ -34540,13 +34540,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0302734375,
+ "grad_norm": 0.033203125,
"learning_rate": 0.0007825118772949819,
"loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 5864291.0,
"repeat_count": 0.0,
- "routers_loss": 0.002142589772120118,
+ "routers_loss": 0.0023497689981013536,
"skip_count": 1.0,
"step": 3636,
"text_loss": 0.3878401517868042
@@ -34559,13 +34559,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0206298828125,
+ "grad_norm": 0.0216064453125,
"learning_rate": 0.0007822564489533255,
- "loss": 0.005,
+ "loss": 0.0051,
"macro_f1": 0.6666666865348816,
"num_tokens": 5867155.0,
"repeat_count": 0.0,
- "routers_loss": 0.006497112102806568,
+ "routers_loss": 0.007680345326662064,
"skip_count": 2.0,
"step": 3638,
"text_loss": 0.6132124066352844
@@ -34578,13 +34578,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.053466796875,
"learning_rate": 0.0007820009124504653,
- "loss": 0.0095,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 5870325.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008698388119228184,
+ "routers_loss": 0.0008242831099778414,
"skip_count": 0.0,
"step": 3640,
"text_loss": 0.3552473187446594
@@ -34597,13 +34597,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.04296875,
"learning_rate": 0.0007817452678843236,
- "loss": 0.0071,
+ "loss": 0.0073,
"macro_f1": 0.6601307392120361,
"num_tokens": 5873301.0,
"repeat_count": 1.0,
- "routers_loss": 0.022245829924941063,
+ "routers_loss": 0.023831043392419815,
"skip_count": 2.0,
"step": 3642,
"text_loss": 0.18363867700099945
@@ -34616,13 +34616,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.024658203125,
+ "grad_norm": 0.0260009765625,
"learning_rate": 0.0007814895153528635,
- "loss": 0.0071,
+ "loss": 0.007,
"macro_f1": 0.3333333432674408,
"num_tokens": 5876225.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020051905885338783,
+ "routers_loss": 0.001999989850446582,
"skip_count": 0.0,
"step": 3644,
"text_loss": 0.17581747472286224
@@ -34635,13 +34635,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025146484375,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0007812336549540903,
- "loss": 0.0071,
+ "loss": 0.007,
"macro_f1": 0.3333333432674408,
"num_tokens": 5879501.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014994015218690038,
+ "routers_loss": 0.001098626758903265,
"skip_count": 0.0,
"step": 3646,
"text_loss": 0.5040884613990784
@@ -34654,13 +34654,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0294189453125,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0007809776867860499,
- "loss": 0.0051,
+ "loss": 0.005,
"macro_f1": 0.3272727429866791,
"num_tokens": 5882608.0,
"repeat_count": 0.0,
- "routers_loss": 0.010847748257219791,
+ "routers_loss": 0.012210183776915073,
"skip_count": 1.0,
"step": 3648,
"text_loss": 0.27114811539649963
@@ -34673,13 +34673,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0262451171875,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.00078072161094683,
- "loss": 0.006,
+ "loss": 0.0059,
"macro_f1": 0.6666666865348816,
"num_tokens": 5886106.0,
"repeat_count": 0.0,
- "routers_loss": 0.005927151069045067,
+ "routers_loss": 0.005191771313548088,
"skip_count": 2.0,
"step": 3650,
"text_loss": 0.5167917609214783
@@ -34692,13 +34692,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.024169921875,
+ "grad_norm": 0.0235595703125,
"learning_rate": 0.0007804654275345591,
- "loss": 0.0061,
+ "loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 5889122.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019531139405444264,
+ "routers_loss": 0.0016411367105320096,
"skip_count": 1.0,
"step": 3652,
"text_loss": 0.7691274285316467
@@ -34711,13 +34711,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.0277099609375,
+ "grad_norm": 0.03515625,
"learning_rate": 0.0007802091366474074,
- "loss": 0.0052,
+ "loss": 0.005,
"macro_f1": 0.8823530077934265,
"num_tokens": 5892313.0,
"repeat_count": 2.0,
- "routers_loss": 0.015216727741062641,
+ "routers_loss": 0.015627093613147736,
"skip_count": 1.0,
"step": 3654,
"text_loss": 0.4646325409412384
@@ -34730,13 +34730,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0311279296875,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0007799527383835858,
- "loss": 0.0067,
+ "loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 5895577.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009810501942411065,
+ "routers_loss": 0.0009879748104140162,
"skip_count": 0.0,
"step": 3656,
"text_loss": 0.5587969422340393
@@ -34749,13 +34749,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0007796962328413469,
- "loss": 0.0093,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 5898546.0,
"repeat_count": 0.0,
- "routers_loss": 0.00458681071177125,
+ "routers_loss": 0.004864919930696487,
"skip_count": 0.0,
"step": 3658,
"text_loss": 0.6981375813484192
@@ -34768,13 +34768,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.029052734375,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0007794396201189839,
- "loss": 0.0076,
+ "loss": 0.0078,
"macro_f1": 1.0,
"num_tokens": 5901618.0,
"repeat_count": 1.0,
- "routers_loss": 0.006519644521176815,
+ "routers_loss": 0.006617432460188866,
"skip_count": 2.0,
"step": 3660,
"text_loss": 0.22521957755088806
@@ -34787,13 +34787,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.0007791829003148312,
- "loss": 0.0097,
+ "loss": 0.0098,
"macro_f1": 0.6601307392120361,
"num_tokens": 5904540.0,
"repeat_count": 1.0,
- "routers_loss": 0.0783558189868927,
+ "routers_loss": 0.0782252699136734,
"skip_count": 2.0,
"step": 3662,
"text_loss": 0.2649642825126648
@@ -34806,13 +34806,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.06494140625,
"learning_rate": 0.0007789260735272647,
- "loss": 0.0115,
+ "loss": 0.0114,
"macro_f1": 0.3333333432674408,
"num_tokens": 5907827.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012588179670274258,
+ "routers_loss": 0.0012057392159476876,
"skip_count": 0.0,
"step": 3664,
"text_loss": 0.6943771243095398
@@ -34825,13 +34825,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0177001953125,
+ "grad_norm": 0.018310546875,
"learning_rate": 0.0007786691398547005,
"loss": 0.0048,
"macro_f1": 0.6666666865348816,
"num_tokens": 5911163.0,
"repeat_count": 0.0,
- "routers_loss": 0.0075621698051691055,
+ "routers_loss": 0.007476957980543375,
"skip_count": 2.0,
"step": 3666,
"text_loss": 0.1502683162689209
@@ -34844,13 +34844,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0303955078125,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.0007784120993955962,
- "loss": 0.0056,
+ "loss": 0.0055,
"macro_f1": 0.6666666865348816,
"num_tokens": 5913948.0,
"repeat_count": 1.0,
- "routers_loss": 0.00408853217959404,
+ "routers_loss": 0.004082011990249157,
"skip_count": 0.0,
"step": 3668,
"text_loss": 0.4127517640590668
@@ -34863,13 +34863,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.041259765625,
"learning_rate": 0.0007781549522484503,
- "loss": 0.0067,
+ "loss": 0.0066,
"macro_f1": 0.9265305995941162,
"num_tokens": 5917360.0,
"repeat_count": 3.0,
- "routers_loss": 0.02851647138595581,
+ "routers_loss": 0.027505695819854736,
"skip_count": 1.0,
"step": 3670,
"text_loss": 0.23892618715763092
@@ -34882,13 +34882,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.031005859375,
+ "grad_norm": 0.0306396484375,
"learning_rate": 0.0007778976985118018,
- "loss": 0.0086,
+ "loss": 0.0083,
"macro_f1": 0.6666666865348816,
"num_tokens": 5920524.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030399872921407223,
+ "routers_loss": 0.0024977331049740314,
"skip_count": 2.0,
"step": 3672,
"text_loss": 0.5076471567153931
@@ -34901,13 +34901,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05908203125,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0007776403382842312,
- "loss": 0.0061,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 5923632.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014176326803863049,
+ "routers_loss": 0.0015700991498306394,
"skip_count": 0.0,
"step": 3674,
"text_loss": 0.6287924647331238
@@ -34920,13 +34920,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.05810546875,
"learning_rate": 0.0007773828716643591,
- "loss": 0.0084,
+ "loss": 0.0085,
"macro_f1": 0.3272727429866791,
"num_tokens": 5926438.0,
"repeat_count": 1.0,
- "routers_loss": 0.0505419559776783,
+ "routers_loss": 0.05108916014432907,
"skip_count": 0.0,
"step": 3676,
"text_loss": 0.26517006754875183
@@ -34939,13 +34939,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0007771252987508474,
"loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 5930081.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034831957891583443,
+ "routers_loss": 0.003439917229115963,
"skip_count": 0.0,
"step": 3678,
"text_loss": 0.5189079642295837
@@ -34958,13 +34958,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.056884765625,
"learning_rate": 0.0007768676196423984,
"loss": 0.0064,
"macro_f1": 1.0,
"num_tokens": 5933463.0,
"repeat_count": 1.0,
- "routers_loss": 0.0020620382856577635,
+ "routers_loss": 0.001935846172273159,
"skip_count": 1.0,
"step": 3680,
"text_loss": 0.6703575849533081
@@ -34972,18 +34972,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 17.286469034341064,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0007766098344377553,
- "loss": 0.0084,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0082,
+ "macro_f1": 0.31446540355682373,
"num_tokens": 5937098.0,
"repeat_count": 0.0,
- "routers_loss": 0.03850153833627701,
+ "routers_loss": 0.0384826585650444,
"skip_count": 2.0,
"step": 3682,
"text_loss": 0.6424444913864136
@@ -34996,13 +34996,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031982421875,
+ "grad_norm": 0.0301513671875,
"learning_rate": 0.0007763519432357018,
- "loss": 0.0065,
+ "loss": 0.0063,
"macro_f1": 0.3333333432674408,
"num_tokens": 5940436.0,
"repeat_count": 0.0,
- "routers_loss": 0.000853471748996526,
+ "routers_loss": 0.0008654671837575734,
"skip_count": 0.0,
"step": 3684,
"text_loss": 0.4189988672733307
@@ -35015,13 +35015,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05712890625,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0007760939461350623,
- "loss": 0.0107,
+ "loss": 0.0111,
"macro_f1": 0.6666666865348816,
"num_tokens": 5943731.0,
"repeat_count": 0.0,
- "routers_loss": 0.007630084175616503,
+ "routers_loss": 0.007468715775758028,
"skip_count": 2.0,
"step": 3686,
"text_loss": 0.2875453233718872
@@ -35036,11 +35036,11 @@
"f1_skip": 0.0,
"grad_norm": 0.041259765625,
"learning_rate": 0.0007758358432347019,
- "loss": 0.0061,
+ "loss": 0.0058,
"macro_f1": 0.3333333432674408,
"num_tokens": 5946707.0,
"repeat_count": 0.0,
- "routers_loss": 0.001303135184571147,
+ "routers_loss": 0.001252831774763763,
"skip_count": 0.0,
"step": 3688,
"text_loss": 0.5093055367469788
@@ -35053,13 +35053,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0007755776346335259,
- "loss": 0.0058,
+ "loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 5949833.0,
"repeat_count": 0.0,
- "routers_loss": 0.001894078915938735,
+ "routers_loss": 0.001680848654359579,
"skip_count": 0.0,
"step": 3690,
"text_loss": 0.4031114876270294
@@ -35072,13 +35072,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.025146484375,
+ "grad_norm": 0.0255126953125,
"learning_rate": 0.0007753193204304807,
- "loss": 0.0056,
+ "loss": 0.0058,
"macro_f1": 0.6666666865348816,
"num_tokens": 5953095.0,
"repeat_count": 0.0,
- "routers_loss": 0.005708714015781879,
+ "routers_loss": 0.0047258250415325165,
"skip_count": 2.0,
"step": 3692,
"text_loss": 0.17632785439491272
@@ -35091,13 +35091,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.0007750609007245524,
"loss": 0.0062,
"macro_f1": 1.0,
"num_tokens": 5955971.0,
"repeat_count": 2.0,
- "routers_loss": 0.0019924843218177557,
+ "routers_loss": 0.001980359200388193,
"skip_count": 4.0,
"step": 3694,
"text_loss": 0.3423727750778198
@@ -35110,13 +35110,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0255126953125,
+ "grad_norm": 0.0238037109375,
"learning_rate": 0.0007748023756147679,
- "loss": 0.007,
+ "loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 5958948.0,
"repeat_count": 0.0,
- "routers_loss": 0.005303190555423498,
+ "routers_loss": 0.00511702848598361,
"skip_count": 0.0,
"step": 3696,
"text_loss": 0.28279972076416016
@@ -35129,13 +35129,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0007745437452001949,
- "loss": 0.0063,
+ "loss": 0.0064,
"macro_f1": 0.3333333432674408,
"num_tokens": 5961819.0,
"repeat_count": 0.0,
- "routers_loss": 0.0004839526955038309,
+ "routers_loss": 0.0005220443126745522,
"skip_count": 0.0,
"step": 3698,
"text_loss": 0.4793325662612915
@@ -35148,13 +35148,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0007742850095799408,
- "loss": 0.0083,
+ "loss": 0.0084,
"macro_f1": 0.3272727429866791,
"num_tokens": 5964625.0,
"repeat_count": 1.0,
- "routers_loss": 0.06377380341291428,
+ "routers_loss": 0.06411020457744598,
"skip_count": 0.0,
"step": 3700,
"text_loss": 0.2825184464454651
@@ -35167,13 +35167,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0654296875,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0007740261688531536,
- "loss": 0.007,
+ "loss": 0.0068,
"macro_f1": 0.6666666865348816,
"num_tokens": 5967134.0,
"repeat_count": 0.0,
- "routers_loss": 0.00462002120912075,
+ "routers_loss": 0.004408109001815319,
"skip_count": 3.0,
"step": 3702,
"text_loss": 0.690429151058197
@@ -35186,13 +35186,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0228271484375,
+ "grad_norm": 0.0279541015625,
"learning_rate": 0.0007737672231190215,
- "loss": 0.0033,
+ "loss": 0.0034,
"macro_f1": 0.3333333432674408,
"num_tokens": 5969831.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006775400252081454,
+ "routers_loss": 0.0006747521692886949,
"skip_count": 0.0,
"step": 3704,
"text_loss": 0.32556024193763733
@@ -35205,13 +35205,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02783203125,
+ "grad_norm": 0.031005859375,
"learning_rate": 0.0007735081724767732,
- "loss": 0.0061,
+ "loss": 0.0059,
"macro_f1": 0.3333333432674408,
"num_tokens": 5973015.0,
"repeat_count": 0.0,
- "routers_loss": 0.001372992410324514,
+ "routers_loss": 0.0020414739847183228,
"skip_count": 0.0,
"step": 3706,
"text_loss": 0.5876469612121582
@@ -35224,13 +35224,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.072265625,
"learning_rate": 0.0007732490170256769,
"loss": 0.0071,
"macro_f1": 0.6666666865348816,
"num_tokens": 5975778.0,
"repeat_count": 1.0,
- "routers_loss": 0.005310074891895056,
+ "routers_loss": 0.005610425490885973,
"skip_count": 0.0,
"step": 3708,
"text_loss": 0.2968577444553375
@@ -35243,13 +35243,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05078125,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0007729897568650422,
- "loss": 0.01,
+ "loss": 0.0097,
"macro_f1": 0.3333333432674408,
"num_tokens": 5979115.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012178041506558657,
+ "routers_loss": 0.001248046406544745,
"skip_count": 0.0,
"step": 3710,
"text_loss": 0.626361608505249
@@ -35262,13 +35262,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0595703125,
+ "grad_norm": 0.06787109375,
"learning_rate": 0.0007727303920942176,
- "loss": 0.01,
+ "loss": 0.0102,
"macro_f1": 0.6666666865348816,
"num_tokens": 5982213.0,
"repeat_count": 0.0,
- "routers_loss": 0.004617640748620033,
+ "routers_loss": 0.005791695322841406,
"skip_count": 2.0,
"step": 3712,
"text_loss": 0.4133484661579132
@@ -35281,13 +35281,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.0007724709228125922,
- "loss": 0.0106,
+ "loss": 0.0105,
"macro_f1": 0.5492662787437439,
"num_tokens": 5984930.0,
"repeat_count": 0.0,
- "routers_loss": 0.020924020558595657,
+ "routers_loss": 0.02114664763212204,
"skip_count": 2.0,
"step": 3714,
"text_loss": 0.4646461308002472
@@ -35300,13 +35300,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.0007722113491195952,
- "loss": 0.0059,
+ "loss": 0.0058,
"macro_f1": 1.0,
"num_tokens": 5988017.0,
"repeat_count": 2.0,
- "routers_loss": 0.0053578754886984825,
+ "routers_loss": 0.005913930479437113,
"skip_count": 5.0,
"step": 3716,
"text_loss": 0.15474505722522736
@@ -35319,13 +35319,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.026123046875,
+ "grad_norm": 0.02685546875,
"learning_rate": 0.0007719516711146957,
- "loss": 0.0075,
+ "loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 5991562.0,
"repeat_count": 0.0,
- "routers_loss": 0.006991801783442497,
+ "routers_loss": 0.0075925313867628574,
"skip_count": 2.0,
"step": 3718,
"text_loss": 0.5293686985969543
@@ -35338,13 +35338,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031005859375,
+ "grad_norm": 0.037353515625,
"learning_rate": 0.000771691888897403,
- "loss": 0.0054,
+ "loss": 0.0051,
"macro_f1": 0.3333333432674408,
"num_tokens": 5994675.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011527709430083632,
+ "routers_loss": 0.0012335237115621567,
"skip_count": 0.0,
"step": 3720,
"text_loss": 0.5210637450218201
@@ -35357,13 +35357,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.0771484375,
"learning_rate": 0.0007714320025672657,
- "loss": 0.008,
+ "loss": 0.0077,
"macro_f1": 0.6666666865348816,
"num_tokens": 5999070.0,
"repeat_count": 0.0,
- "routers_loss": 0.011113573797047138,
+ "routers_loss": 0.010582062415778637,
"skip_count": 2.0,
"step": 3722,
"text_loss": 0.2783571779727936
@@ -35376,13 +35376,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03369140625,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.000771172012223873,
- "loss": 0.008,
+ "loss": 0.0078,
"macro_f1": 0.6598639488220215,
"num_tokens": 6002702.0,
"repeat_count": 1.0,
- "routers_loss": 0.014584671705961227,
+ "routers_loss": 0.015008784830570221,
"skip_count": 3.0,
"step": 3724,
"text_loss": 0.358705073595047
@@ -35395,13 +35395,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05419921875,
+ "grad_norm": 0.052734375,
"learning_rate": 0.0007709119179668538,
"loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 6005517.0,
"repeat_count": 0.0,
- "routers_loss": 0.001164636923931539,
+ "routers_loss": 0.00111615180503577,
"skip_count": 0.0,
"step": 3726,
"text_loss": 0.45202162861824036
@@ -35414,13 +35414,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.034912109375,
"learning_rate": 0.0007706517198958764,
- "loss": 0.0098,
+ "loss": 0.0096,
"macro_f1": 0.6595745086669922,
"num_tokens": 6009111.0,
"repeat_count": 1.0,
- "routers_loss": 0.05235295370221138,
+ "routers_loss": 0.05215252563357353,
"skip_count": 4.0,
"step": 3728,
"text_loss": 0.20360413193702698
@@ -35433,13 +35433,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0007703914181106497,
- "loss": 0.0077,
+ "loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 6012989.0,
"repeat_count": 0.0,
- "routers_loss": 0.01087163109332323,
+ "routers_loss": 0.010039499960839748,
"skip_count": 3.0,
"step": 3730,
"text_loss": 0.20334361493587494
@@ -35452,13 +35452,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07177734375,
+ "grad_norm": 0.08203125,
"learning_rate": 0.0007701310127109211,
- "loss": 0.0063,
+ "loss": 0.0062,
"macro_f1": 0.3272727429866791,
"num_tokens": 6016420.0,
"repeat_count": 0.0,
- "routers_loss": 0.010110805742442608,
+ "routers_loss": 0.01090205181390047,
"skip_count": 1.0,
"step": 3732,
"text_loss": 0.47959551215171814
@@ -35471,13 +35471,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.0,
"f1_skip": 0.888888955116272,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0007698705037964791,
- "loss": 0.0078,
+ "loss": 0.0076,
"macro_f1": 0.6225374937057495,
"num_tokens": 6019551.0,
"repeat_count": 0.0,
- "routers_loss": 0.026909299194812775,
+ "routers_loss": 0.02677762135863304,
"skip_count": 5.0,
"step": 3734,
"text_loss": 0.2621438801288605
@@ -35490,13 +35490,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.056640625,
"learning_rate": 0.000769609891467151,
- "loss": 0.0122,
+ "loss": 0.0119,
"macro_f1": 0.6666666865348816,
"num_tokens": 6022262.0,
"repeat_count": 1.0,
- "routers_loss": 0.003602684009820223,
+ "routers_loss": 0.00460716662928462,
"skip_count": 0.0,
"step": 3736,
"text_loss": 0.3433022201061249
@@ -35509,13 +35509,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.037109375,
"learning_rate": 0.0007693491758228037,
- "loss": 0.005,
+ "loss": 0.0052,
"macro_f1": 0.6666666865348816,
"num_tokens": 6025723.0,
"repeat_count": 0.0,
- "routers_loss": 0.00290105608291924,
+ "routers_loss": 0.0036111194640398026,
"skip_count": 2.0,
"step": 3738,
"text_loss": 0.38703784346580505
@@ -35528,13 +35528,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0007690883569633442,
"loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 6028652.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031469720415771008,
+ "routers_loss": 0.003299296135082841,
"skip_count": 0.0,
"step": 3740,
"text_loss": 0.24203069508075714
@@ -35547,13 +35547,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.027587890625,
+ "grad_norm": 0.0277099609375,
"learning_rate": 0.0007688274349887188,
- "loss": 0.0048,
+ "loss": 0.0047,
"macro_f1": 0.3333333432674408,
"num_tokens": 6032280.0,
"repeat_count": 0.0,
- "routers_loss": 0.0029467069543898106,
+ "routers_loss": 0.003173880511894822,
"skip_count": 0.0,
"step": 3742,
"text_loss": 0.2827291488647461
@@ -35566,13 +35566,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.0302734375,
"learning_rate": 0.0007685664099989131,
- "loss": 0.0074,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 6035111.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009511710377410054,
+ "routers_loss": 0.0008576177642680705,
"skip_count": 0.0,
"step": 3744,
"text_loss": 0.43613526225090027
@@ -35585,13 +35585,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0252685546875,
+ "grad_norm": 0.0274658203125,
"learning_rate": 0.0007683052820939524,
"loss": 0.0072,
"macro_f1": 0.6666666865348816,
"num_tokens": 6038428.0,
"repeat_count": 0.0,
- "routers_loss": 0.004079817794263363,
+ "routers_loss": 0.004335585981607437,
"skip_count": 2.0,
"step": 3746,
"text_loss": 1.0385624170303345
@@ -35604,13 +35604,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0007680440513739015,
"loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 6041185.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007996217464096844,
+ "routers_loss": 0.0008210531086660922,
"skip_count": 0.0,
"step": 3748,
"text_loss": 0.7070431709289551
@@ -35623,13 +35623,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.041015625,
+ "grad_norm": 0.056640625,
"learning_rate": 0.0007677827179388646,
- "loss": 0.0088,
+ "loss": 0.0089,
"macro_f1": 1.0,
"num_tokens": 6046333.0,
"repeat_count": 1.0,
- "routers_loss": 0.0047629233449697495,
+ "routers_loss": 0.003778942162171006,
"skip_count": 1.0,
"step": 3750,
"text_loss": 0.3682238757610321
@@ -35642,13 +35642,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.05908203125,
+ "grad_norm": 0.08984375,
"learning_rate": 0.000767521281888985,
- "loss": 0.0087,
+ "loss": 0.009,
"macro_f1": 1.0,
"num_tokens": 6049528.0,
"repeat_count": 1.0,
- "routers_loss": 0.0039178295992314816,
+ "routers_loss": 0.002767334459349513,
"skip_count": 1.0,
"step": 3752,
"text_loss": 0.7619418501853943
@@ -35661,13 +35661,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03662109375,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0007672597433244455,
- "loss": 0.0109,
+ "loss": 0.0108,
"macro_f1": 0.6666666865348816,
"num_tokens": 6053202.0,
"repeat_count": 0.0,
- "routers_loss": 0.004995788913220167,
+ "routers_loss": 0.004796457476913929,
"skip_count": 2.0,
"step": 3754,
"text_loss": 0.4157083034515381
@@ -35680,13 +35680,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.062255859375,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0007669981023454682,
- "loss": 0.0125,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 6056609.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012595724547281861,
+ "routers_loss": 0.0013067846884950995,
"skip_count": 0.0,
"step": 3756,
"text_loss": 0.4529118537902832
@@ -35699,13 +35699,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0284423828125,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0007667363590523142,
"loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 6060504.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012152433628216386,
+ "routers_loss": 0.0010285493917763233,
"skip_count": 0.0,
"step": 3758,
"text_loss": 0.8363246321678162
@@ -35718,13 +35718,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.055419921875,
"learning_rate": 0.0007664745135452844,
- "loss": 0.0093,
+ "loss": 0.0092,
"macro_f1": 0.6666666865348816,
"num_tokens": 6063526.0,
"repeat_count": 0.0,
- "routers_loss": 0.006478998344391584,
+ "routers_loss": 0.006289863493293524,
"skip_count": 3.0,
"step": 3760,
"text_loss": 0.5313657522201538
@@ -35737,13 +35737,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.05517578125,
"learning_rate": 0.0007662125659247183,
- "loss": 0.0096,
+ "loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 6067147.0,
"repeat_count": 0.0,
- "routers_loss": 0.003008047351613641,
+ "routers_loss": 0.0028537956532090902,
"skip_count": 0.0,
"step": 3762,
"text_loss": 0.5668109059333801
@@ -35756,13 +35756,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03369140625,
+ "grad_norm": 0.039794921875,
"learning_rate": 0.0007659505162909949,
"loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 6070350.0,
"repeat_count": 0.0,
- "routers_loss": 0.002841299632564187,
+ "routers_loss": 0.0026814753655344248,
"skip_count": 0.0,
"step": 3764,
"text_loss": 0.4983512759208679
@@ -35775,13 +35775,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.055419921875,
+ "grad_norm": 0.056884765625,
"learning_rate": 0.0007656883647445318,
- "loss": 0.01,
+ "loss": 0.0099,
"macro_f1": 0.6666666865348816,
"num_tokens": 6073091.0,
"repeat_count": 0.0,
- "routers_loss": 0.006070348434150219,
+ "routers_loss": 0.005981382913887501,
"skip_count": 1.0,
"step": 3766,
"text_loss": 0.30372318625450134
@@ -35794,13 +35794,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0289306640625,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0007654261113857863,
- "loss": 0.0073,
+ "loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 6076244.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008278369787149131,
+ "routers_loss": 0.000803640519734472,
"skip_count": 0.0,
"step": 3768,
"text_loss": 0.6100738048553467
@@ -35813,13 +35813,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02392578125,
+ "grad_norm": 0.027587890625,
"learning_rate": 0.0007651637563152539,
"loss": 0.0055,
"macro_f1": 0.3333333432674408,
"num_tokens": 6078936.0,
"repeat_count": 0.0,
- "routers_loss": 0.001354316365905106,
+ "routers_loss": 0.0013324898900464177,
"skip_count": 0.0,
"step": 3770,
"text_loss": 0.4733821153640747
@@ -35832,13 +35832,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0242919921875,
+ "grad_norm": 0.029541015625,
"learning_rate": 0.0007649012996334701,
- "loss": 0.0051,
+ "loss": 0.0054,
"macro_f1": 0.6666666865348816,
"num_tokens": 6081951.0,
"repeat_count": 1.0,
- "routers_loss": 0.0019684957806020975,
+ "routers_loss": 0.0021543330512940884,
"skip_count": 0.0,
"step": 3772,
"text_loss": 0.6794875860214233
@@ -35851,13 +35851,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.0007646387414410085,
- "loss": 0.0076,
+ "loss": 0.0073,
"macro_f1": 0.3333333432674408,
"num_tokens": 6085165.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005270782858133316,
+ "routers_loss": 0.0005426189745776355,
"skip_count": 0.0,
"step": 3774,
"text_loss": 0.5886107683181763
@@ -35870,13 +35870,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.028076171875,
+ "grad_norm": 0.0262451171875,
"learning_rate": 0.0007643760818384819,
"loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 6088370.0,
"repeat_count": 0.0,
- "routers_loss": 0.0029050554148852825,
+ "routers_loss": 0.002537576947361231,
"skip_count": 0.0,
"step": 3776,
"text_loss": 0.23591920733451843
@@ -35889,13 +35889,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.03564453125,
"learning_rate": 0.0007641133209265423,
- "loss": 0.0064,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 6092319.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026071348693221807,
+ "routers_loss": 0.002613696036860347,
"skip_count": 0.0,
"step": 3778,
"text_loss": 0.3217754662036896
@@ -35908,13 +35908,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051025390625,
+ "grad_norm": 0.052978515625,
"learning_rate": 0.0007638504588058796,
- "loss": 0.0101,
+ "loss": 0.0105,
"macro_f1": 0.3333333432674408,
"num_tokens": 6095799.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008351493743248284,
+ "routers_loss": 0.0007219464750960469,
"skip_count": 0.0,
"step": 3780,
"text_loss": 0.4276983141899109
@@ -35927,13 +35927,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.026611328125,
+ "grad_norm": 0.0263671875,
"learning_rate": 0.0007635874955772234,
- "loss": 0.007,
+ "loss": 0.0069,
"macro_f1": 0.6666666865348816,
"num_tokens": 6098789.0,
"repeat_count": 0.0,
- "routers_loss": 0.005872148554772139,
+ "routers_loss": 0.005965052172541618,
"skip_count": 3.0,
"step": 3782,
"text_loss": 0.30936646461486816
@@ -35946,13 +35946,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0703125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0007633244313413417,
"loss": 0.0077,
"macro_f1": 0.3333333432674408,
"num_tokens": 6101631.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007862916099838912,
+ "routers_loss": 0.0007469559786841273,
"skip_count": 0.0,
"step": 3784,
"text_loss": 0.44460123777389526
@@ -35965,13 +35965,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0007630612661990412,
- "loss": 0.0098,
+ "loss": 0.0097,
"macro_f1": 0.6666666865348816,
"num_tokens": 6105097.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037640000227838755,
+ "routers_loss": 0.004300760570913553,
"skip_count": 1.0,
"step": 3786,
"text_loss": 0.41950157284736633
@@ -35984,13 +35984,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0007627980002511672,
- "loss": 0.0068,
+ "loss": 0.0069,
"macro_f1": 0.6666666865348816,
"num_tokens": 6107847.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023107193410396576,
+ "routers_loss": 0.0023050960153341293,
"skip_count": 1.0,
"step": 3788,
"text_loss": 0.48561373353004456
@@ -36003,13 +36003,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03271484375,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.0007625346335986039,
- "loss": 0.0066,
+ "loss": 0.0063,
"macro_f1": 0.3333333432674408,
"num_tokens": 6110546.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017923865234479308,
+ "routers_loss": 0.0018124044872820377,
"skip_count": 0.0,
"step": 3790,
"text_loss": 0.20882295072078705
@@ -36022,13 +36022,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0007622711663422735,
"loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 6113600.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007700122077949345,
+ "routers_loss": 0.0007613401976414025,
"skip_count": 0.0,
"step": 3792,
"text_loss": 0.31751760840415955
@@ -36041,13 +36041,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04248046875,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0007620075985831375,
- "loss": 0.009,
+ "loss": 0.0092,
"macro_f1": 0.6666666865348816,
"num_tokens": 6116916.0,
"repeat_count": 0.0,
- "routers_loss": 0.004986821208149195,
+ "routers_loss": 0.005452962126582861,
"skip_count": 2.0,
"step": 3794,
"text_loss": 0.3246645927429199
@@ -36060,13 +36060,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0272216796875,
+ "grad_norm": 0.0306396484375,
"learning_rate": 0.0007617439304221956,
"loss": 0.0055,
"macro_f1": 0.6666666865348816,
"num_tokens": 6120056.0,
"repeat_count": 2.0,
- "routers_loss": 0.004177430644631386,
+ "routers_loss": 0.0043787881731987,
"skip_count": 0.0,
"step": 3796,
"text_loss": 0.4859195947647095
@@ -36079,13 +36079,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0228271484375,
+ "grad_norm": 0.02294921875,
"learning_rate": 0.0007614801619604856,
- "loss": 0.0065,
+ "loss": 0.0064,
"macro_f1": 0.3333333432674408,
"num_tokens": 6122668.0,
"repeat_count": 0.0,
- "routers_loss": 0.003494138829410076,
+ "routers_loss": 0.0033891722559928894,
"skip_count": 0.0,
"step": 3798,
"text_loss": 0.48194369673728943
@@ -36098,13 +36098,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0252685546875,
+ "grad_norm": 0.02587890625,
"learning_rate": 0.0007612162932990845,
- "loss": 0.0063,
+ "loss": 0.0061,
"macro_f1": 0.3333333432674408,
"num_tokens": 6126792.0,
"repeat_count": 0.0,
- "routers_loss": 0.001831608940847218,
+ "routers_loss": 0.001883238204754889,
"skip_count": 0.0,
"step": 3800,
"text_loss": 0.3740062117576599
@@ -36117,13 +36117,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0007609523245391068,
- "loss": 0.0078,
+ "loss": 0.0076,
"macro_f1": 0.6666666865348816,
"num_tokens": 6129801.0,
"repeat_count": 0.0,
- "routers_loss": 0.010433467105031013,
+ "routers_loss": 0.00882677361369133,
"skip_count": 2.0,
"step": 3802,
"text_loss": 0.5759486556053162
@@ -36136,13 +36136,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0007606882557817062,
- "loss": 0.0057,
+ "loss": 0.0058,
"macro_f1": 0.6666666865348816,
"num_tokens": 6133613.0,
"repeat_count": 0.0,
- "routers_loss": 0.009141471236944199,
+ "routers_loss": 0.009537030011415482,
"skip_count": 2.0,
"step": 3804,
"text_loss": 0.3217554986476898
@@ -36155,13 +36155,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0235595703125,
+ "grad_norm": 0.0220947265625,
"learning_rate": 0.0007604240871280742,
- "loss": 0.0055,
+ "loss": 0.0053,
"macro_f1": 0.3333333432674408,
"num_tokens": 6137784.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024337477516382933,
+ "routers_loss": 0.0023913346230983734,
"skip_count": 0.0,
"step": 3806,
"text_loss": 0.3718445599079132
@@ -36174,13 +36174,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0390625,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0007601598186794407,
- "loss": 0.0083,
+ "loss": 0.0081,
"macro_f1": 0.6603773832321167,
"num_tokens": 6141356.0,
"repeat_count": 1.0,
- "routers_loss": 0.03635421022772789,
+ "routers_loss": 0.033796411007642746,
"skip_count": 1.0,
"step": 3808,
"text_loss": 0.2717749774456024
@@ -36193,13 +36193,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.000759895450537074,
- "loss": 0.0101,
+ "loss": 0.01,
"macro_f1": 0.6666666865348816,
"num_tokens": 6144448.0,
"repeat_count": 0.0,
- "routers_loss": 0.002765925833955407,
+ "routers_loss": 0.0037919918540865183,
"skip_count": 2.0,
"step": 3810,
"text_loss": 0.5935076475143433
@@ -36212,13 +36212,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03369140625,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0007596309828022803,
- "loss": 0.0072,
+ "loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 6147526.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009747639996930957,
+ "routers_loss": 0.0008182782912626863,
"skip_count": 0.0,
"step": 3812,
"text_loss": 0.449336439371109
@@ -36231,13 +36231,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.03125,
"learning_rate": 0.0007593664155764044,
"loss": 0.0061,
"macro_f1": 0.6666666865348816,
"num_tokens": 6150620.0,
"repeat_count": 1.0,
- "routers_loss": 0.001395601429976523,
+ "routers_loss": 0.001734903547912836,
"skip_count": 0.0,
"step": 3814,
"text_loss": 0.6647221446037292
@@ -36250,13 +36250,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.037353515625,
"learning_rate": 0.0007591017489608286,
- "loss": 0.0092,
+ "loss": 0.0088,
"macro_f1": 0.3272727429866791,
"num_tokens": 6153714.0,
"repeat_count": 1.0,
- "routers_loss": 0.048050083220005035,
+ "routers_loss": 0.04721754416823387,
"skip_count": 0.0,
"step": 3816,
"text_loss": 0.25481200218200684
@@ -36269,13 +36269,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03662109375,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0007588369830569738,
- "loss": 0.0062,
+ "loss": 0.0061,
"macro_f1": 0.3333333432674408,
"num_tokens": 6156974.0,
"repeat_count": 0.0,
- "routers_loss": 0.00022119733330328017,
+ "routers_loss": 0.0002484306460246444,
"skip_count": 0.0,
"step": 3818,
"text_loss": 0.7195295691490173
@@ -36288,13 +36288,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02783203125,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0007585721179662988,
"loss": 0.0072,
"macro_f1": 0.6666666865348816,
"num_tokens": 6159660.0,
"repeat_count": 0.0,
- "routers_loss": 0.005448841955512762,
+ "routers_loss": 0.0051363613456487656,
"skip_count": 2.0,
"step": 3820,
"text_loss": 0.5073586702346802
@@ -36307,13 +36307,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0458984375,
+ "grad_norm": 0.052734375,
"learning_rate": 0.0007583071537903005,
- "loss": 0.0067,
+ "loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 6163146.0,
"repeat_count": 0.0,
- "routers_loss": 0.007093957159668207,
+ "routers_loss": 0.006719176657497883,
"skip_count": 0.0,
"step": 3822,
"text_loss": 0.6950558423995972
@@ -36326,13 +36326,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.025634765625,
+ "grad_norm": 0.0269775390625,
"learning_rate": 0.0007580420906305136,
- "loss": 0.007,
+ "loss": 0.0073,
"macro_f1": 1.0,
"num_tokens": 6166257.0,
"repeat_count": 1.0,
- "routers_loss": 0.008060536347329617,
+ "routers_loss": 0.00871267355978489,
"skip_count": 3.0,
"step": 3824,
"text_loss": 0.2549148201942444
@@ -36345,13 +36345,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025146484375,
+ "grad_norm": 0.022705078125,
"learning_rate": 0.0007577769285885109,
- "loss": 0.004,
+ "loss": 0.0039,
"macro_f1": 0.3333333432674408,
"num_tokens": 6169624.0,
"repeat_count": 0.0,
- "routers_loss": 0.001302229124121368,
+ "routers_loss": 0.0015642556827515364,
"skip_count": 0.0,
"step": 3826,
"text_loss": 0.3720305860042572
@@ -36364,13 +36364,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.038330078125,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0007575116677659029,
- "loss": 0.0076,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 6172673.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010101167717948556,
+ "routers_loss": 0.0011551049537956715,
"skip_count": 0.0,
"step": 3828,
"text_loss": 0.6819429397583008
@@ -36383,13 +36383,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.040771484375,
"learning_rate": 0.0007572463082643377,
- "loss": 0.0083,
+ "loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 6175414.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009081853204406798,
+ "routers_loss": 0.0008922060951590538,
"skip_count": 0.0,
"step": 3830,
"text_loss": 0.5424665212631226
@@ -36402,13 +36402,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03515625,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0007569808501855023,
"loss": 0.0044,
"macro_f1": 0.6666666865348816,
"num_tokens": 6178701.0,
"repeat_count": 0.0,
- "routers_loss": 0.0040206871926784515,
+ "routers_loss": 0.004167596809566021,
"skip_count": 1.0,
"step": 3832,
"text_loss": 0.4429764151573181
@@ -36421,13 +36421,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.00075671529363112,
"loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 6183036.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009683453245088458,
+ "routers_loss": 0.0008732969872653484,
"skip_count": 0.0,
"step": 3834,
"text_loss": 0.8015334010124207
@@ -36440,13 +36440,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0277099609375,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0007564496387029531,
- "loss": 0.0056,
+ "loss": 0.0055,
"macro_f1": 0.6666666865348816,
"num_tokens": 6186325.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021183546632528305,
+ "routers_loss": 0.0021374202333390713,
"skip_count": 1.0,
"step": 3836,
"text_loss": 0.4233771562576294
@@ -36459,13 +36459,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.000756183885502801,
- "loss": 0.0059,
+ "loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 6189919.0,
"repeat_count": 1.0,
- "routers_loss": 0.0034987039398401976,
+ "routers_loss": 0.004017227329313755,
"skip_count": 0.0,
"step": 3838,
"text_loss": 0.33691394329071045
@@ -36478,13 +36478,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.01953125,
+ "grad_norm": 0.018310546875,
"learning_rate": 0.0007559180341325005,
- "loss": 0.0048,
+ "loss": 0.0046,
"macro_f1": 0.3333333432674408,
"num_tokens": 6193412.0,
"repeat_count": 0.0,
- "routers_loss": 0.001348655903711915,
+ "routers_loss": 0.0013120946241542697,
"skip_count": 0.0,
"step": 3840,
"text_loss": 0.14970099925994873
@@ -36497,13 +36497,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.029541015625,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0007556520846939265,
"loss": 0.0061,
"macro_f1": 0.5492662787437439,
"num_tokens": 6196588.0,
"repeat_count": 0.0,
- "routers_loss": 0.011758741922676563,
+ "routers_loss": 0.011793316341936588,
"skip_count": 2.0,
"step": 3842,
"text_loss": 0.2714047133922577
@@ -36516,13 +36516,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.03466796875,
+ "grad_norm": 0.031494140625,
"learning_rate": 0.0007553860372889914,
- "loss": 0.0064,
+ "loss": 0.0062,
"macro_f1": 1.0,
"num_tokens": 6200841.0,
"repeat_count": 1.0,
- "routers_loss": 0.022454025223851204,
+ "routers_loss": 0.019968654960393906,
"skip_count": 4.0,
"step": 3844,
"text_loss": 0.23680976033210754
@@ -36535,13 +36535,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.049560546875,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0007551198920196452,
"loss": 0.0079,
"macro_f1": 0.5492662787437439,
"num_tokens": 6203797.0,
"repeat_count": 0.0,
- "routers_loss": 0.012088865973055363,
+ "routers_loss": 0.013615630567073822,
"skip_count": 2.0,
"step": 3846,
"text_loss": 0.25839608907699585
@@ -36554,13 +36554,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.057373046875,
+ "grad_norm": 0.0546875,
"learning_rate": 0.000754853648987875,
- "loss": 0.0073,
+ "loss": 0.0072,
"macro_f1": 0.6666666865348816,
"num_tokens": 6206790.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025066444650292397,
+ "routers_loss": 0.002420815173536539,
"skip_count": 1.0,
"step": 3848,
"text_loss": 0.5358025431632996
@@ -36573,13 +36573,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.0007545873082957057,
- "loss": 0.0073,
+ "loss": 0.0072,
"macro_f1": 0.9265305995941162,
"num_tokens": 6209791.0,
"repeat_count": 1.0,
- "routers_loss": 0.01811581663787365,
+ "routers_loss": 0.018236197531223297,
"skip_count": 3.0,
"step": 3850,
"text_loss": 0.1463700383901596
@@ -36592,13 +36592,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0390625,
+ "grad_norm": 0.034423828125,
"learning_rate": 0.0007543208700451998,
"loss": 0.0052,
"macro_f1": 0.6666666865348816,
"num_tokens": 6212792.0,
"repeat_count": 0.0,
- "routers_loss": 0.005889591295272112,
+ "routers_loss": 0.006242573726922274,
"skip_count": 3.0,
"step": 3852,
"text_loss": 0.9441591501235962
@@ -36611,13 +36611,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0299072265625,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0007540543343384565,
- "loss": 0.0064,
+ "loss": 0.0062,
"macro_f1": 0.3272727429866791,
"num_tokens": 6215747.0,
"repeat_count": 0.0,
- "routers_loss": 0.015324318781495094,
+ "routers_loss": 0.01451140083372593,
"skip_count": 1.0,
"step": 3854,
"text_loss": 0.41610902547836304
@@ -36630,13 +36630,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0007537877012776132,
"loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 6218593.0,
"repeat_count": 0.0,
- "routers_loss": 0.0003138817264698446,
+ "routers_loss": 0.00037674361374229193,
"skip_count": 0.0,
"step": 3856,
"text_loss": 0.6048852205276489
@@ -36649,13 +36649,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0269775390625,
+ "grad_norm": 0.0255126953125,
"learning_rate": 0.0007535209709648439,
- "loss": 0.0044,
+ "loss": 0.0045,
"macro_f1": 1.0,
"num_tokens": 6221315.0,
"repeat_count": 1.0,
- "routers_loss": 0.006152884569019079,
+ "routers_loss": 0.005776284262537956,
"skip_count": 3.0,
"step": 3858,
"text_loss": 0.35627537965774536
@@ -36668,13 +36668,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025634765625,
+ "grad_norm": 0.0245361328125,
"learning_rate": 0.0007532541435023605,
- "loss": 0.0048,
+ "loss": 0.0049,
"macro_f1": 0.3333333432674408,
"num_tokens": 6225012.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009145989897660911,
+ "routers_loss": 0.0009280376834794879,
"skip_count": 0.0,
"step": 3860,
"text_loss": 0.6440183520317078
@@ -36687,13 +36687,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025146484375,
+ "grad_norm": 0.0224609375,
"learning_rate": 0.0007529872189924114,
"loss": 0.0046,
"macro_f1": 0.3333333432674408,
"num_tokens": 6227650.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010246031451970339,
+ "routers_loss": 0.0009876530384644866,
"skip_count": 0.0,
"step": 3862,
"text_loss": 0.35507893562316895
@@ -36706,13 +36706,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0007527201975372827,
- "loss": 0.0046,
+ "loss": 0.0045,
"macro_f1": 0.6603773832321167,
"num_tokens": 6230557.0,
"repeat_count": 1.0,
- "routers_loss": 0.011913667432963848,
+ "routers_loss": 0.013780162669718266,
"skip_count": 1.0,
"step": 3864,
"text_loss": 0.38958442211151123
@@ -36725,13 +36725,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.0007524530792392977,
- "loss": 0.0111,
+ "loss": 0.011,
"macro_f1": 0.6666666865348816,
"num_tokens": 6233371.0,
"repeat_count": 0.0,
- "routers_loss": 0.0050127157010138035,
+ "routers_loss": 0.004849869292229414,
"skip_count": 3.0,
"step": 3866,
"text_loss": 0.3826720714569092
@@ -36744,13 +36744,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0228271484375,
+ "grad_norm": 0.0191650390625,
"learning_rate": 0.0007521858642008163,
- "loss": 0.0073,
+ "loss": 0.0072,
"macro_f1": 0.6666666865348816,
"num_tokens": 6236770.0,
"repeat_count": 0.0,
- "routers_loss": 0.008781078271567822,
+ "routers_loss": 0.008618295192718506,
"skip_count": 1.0,
"step": 3868,
"text_loss": 0.3596078157424927
@@ -36763,13 +36763,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03076171875,
+ "grad_norm": 0.029052734375,
"learning_rate": 0.0007519185525242363,
"loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 6239661.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014061459805816412,
+ "routers_loss": 0.0013421972980722785,
"skip_count": 0.0,
"step": 3870,
"text_loss": 0.5585550665855408
@@ -36782,13 +36782,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.027099609375,
+ "grad_norm": 0.026611328125,
"learning_rate": 0.0007516511443119916,
- "loss": 0.0056,
+ "loss": 0.0057,
"macro_f1": 0.6666666865348816,
"num_tokens": 6242459.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031452353578060865,
+ "routers_loss": 0.0038009448908269405,
"skip_count": 1.0,
"step": 3872,
"text_loss": 0.4418395757675171
@@ -36801,13 +36801,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.030517578125,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0007513836396665534,
"loss": 0.0061,
"macro_f1": 1.0,
"num_tokens": 6245489.0,
"repeat_count": 1.0,
- "routers_loss": 0.0028979210183024406,
+ "routers_loss": 0.002785376040264964,
"skip_count": 2.0,
"step": 3874,
"text_loss": 0.551510751247406
@@ -36820,13 +36820,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02294921875,
+ "grad_norm": 0.0234375,
"learning_rate": 0.0007511160386904305,
- "loss": 0.0051,
+ "loss": 0.005,
"macro_f1": 0.6666666865348816,
"num_tokens": 6249014.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021069799549877644,
+ "routers_loss": 0.0021424589212983847,
"skip_count": 1.0,
"step": 3876,
"text_loss": 1.0502676963806152
@@ -36839,13 +36839,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.034423828125,
"learning_rate": 0.0007508483414861679,
- "loss": 0.0083,
+ "loss": 0.0084,
"macro_f1": 0.6666666865348816,
"num_tokens": 6252357.0,
"repeat_count": 0.0,
- "routers_loss": 0.0073753902688622475,
+ "routers_loss": 0.0085759861394763,
"skip_count": 1.0,
"step": 3878,
"text_loss": 0.49212515354156494
@@ -36858,13 +36858,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0390625,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0007505805481563477,
- "loss": 0.0094,
+ "loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 6254975.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010532810119912028,
+ "routers_loss": 0.0010723904706537724,
"skip_count": 0.0,
"step": 3880,
"text_loss": 0.7022985816001892
@@ -36877,13 +36877,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.05078125,
"learning_rate": 0.0007503126588035887,
- "loss": 0.0086,
+ "loss": 0.0081,
"macro_f1": 1.0,
"num_tokens": 6258001.0,
"repeat_count": 1.0,
- "routers_loss": 0.012617395259439945,
+ "routers_loss": 0.012809890322387218,
"skip_count": 2.0,
"step": 3882,
"text_loss": 0.1829151213169098
@@ -36896,13 +36896,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0007500446735305466,
"loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 6261795.0,
"repeat_count": 0.0,
- "routers_loss": 0.002872605575248599,
+ "routers_loss": 0.0026790346018970013,
"skip_count": 1.0,
"step": 3884,
"text_loss": 0.20436066389083862
@@ -36915,13 +36915,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.02978515625,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.000749776592439914,
"loss": 0.007,
"macro_f1": 1.0,
"num_tokens": 6265585.0,
"repeat_count": 1.0,
- "routers_loss": 0.0047233253717422485,
+ "routers_loss": 0.005243788007646799,
"skip_count": 2.0,
"step": 3886,
"text_loss": 0.4479229748249054
@@ -36934,13 +36934,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02294921875,
+ "grad_norm": 0.024658203125,
"learning_rate": 0.00074950841563442,
- "loss": 0.0052,
+ "loss": 0.0051,
"macro_f1": 0.6666666865348816,
"num_tokens": 6269039.0,
"repeat_count": 0.0,
- "routers_loss": 0.007303252816200256,
+ "routers_loss": 0.007998534478247166,
"skip_count": 1.0,
"step": 3888,
"text_loss": 0.2154676914215088
@@ -36953,13 +36953,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0242919921875,
+ "grad_norm": 0.0238037109375,
"learning_rate": 0.0007492401432168303,
"loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 6272315.0,
"repeat_count": 0.0,
- "routers_loss": 0.005679785739630461,
+ "routers_loss": 0.004648822825402021,
"skip_count": 1.0,
"step": 3890,
"text_loss": 0.3375042676925659
@@ -36972,13 +36972,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0007489717752899477,
- "loss": 0.0097,
+ "loss": 0.0094,
"macro_f1": 0.3272727429866791,
"num_tokens": 6275342.0,
"repeat_count": 0.0,
- "routers_loss": 0.013875136151909828,
+ "routers_loss": 0.012154200114309788,
"skip_count": 1.0,
"step": 3892,
"text_loss": 0.1964082419872284
@@ -36991,13 +36991,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0247802734375,
+ "grad_norm": 0.0267333984375,
"learning_rate": 0.000748703311956611,
"loss": 0.0058,
"macro_f1": 1.0,
"num_tokens": 6278700.0,
"repeat_count": 1.0,
- "routers_loss": 0.004874289035797119,
+ "routers_loss": 0.004610476549714804,
"skip_count": 2.0,
"step": 3894,
"text_loss": 0.26545581221580505
@@ -37010,13 +37010,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.06201171875,
"learning_rate": 0.0007484347533196961,
"loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 6281864.0,
"repeat_count": 0.0,
- "routers_loss": 0.008282547816634178,
+ "routers_loss": 0.0075586591847240925,
"skip_count": 2.0,
"step": 3896,
"text_loss": 0.3106999397277832
@@ -37029,13 +37029,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0208740234375,
+ "grad_norm": 0.02099609375,
"learning_rate": 0.0007481660994821151,
- "loss": 0.007,
+ "loss": 0.0068,
"macro_f1": 0.6666666865348816,
"num_tokens": 6284676.0,
"repeat_count": 0.0,
- "routers_loss": 0.00792533066123724,
+ "routers_loss": 0.007845268584787846,
"skip_count": 1.0,
"step": 3898,
"text_loss": 0.4094304144382477
@@ -37048,13 +37048,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0007478973505468165,
- "loss": 0.0086,
+ "loss": 0.0081,
"macro_f1": 1.0,
"num_tokens": 6287470.0,
"repeat_count": 1.0,
- "routers_loss": 0.012142898514866829,
+ "routers_loss": 0.011116391979157925,
"skip_count": 2.0,
"step": 3900,
"text_loss": 0.1838909536600113
@@ -37067,13 +37067,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.03515625,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0007476285066167857,
- "loss": 0.0062,
+ "loss": 0.0059,
"macro_f1": 0.6666666865348816,
"num_tokens": 6290432.0,
"repeat_count": 1.0,
- "routers_loss": 0.004634121898561716,
+ "routers_loss": 0.004599364474415779,
"skip_count": 0.0,
"step": 3902,
"text_loss": 0.25872838497161865
@@ -37086,13 +37086,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.046142578125,
"learning_rate": 0.0007473595677950439,
"loss": 0.0109,
"macro_f1": 0.6666666865348816,
"num_tokens": 6293557.0,
"repeat_count": 0.0,
- "routers_loss": 0.001632143510505557,
+ "routers_loss": 0.0016367282951250672,
"skip_count": 1.0,
"step": 3904,
"text_loss": 0.5272360444068909
@@ -37105,13 +37105,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.0007470905341846492,
- "loss": 0.0053,
+ "loss": 0.0052,
"macro_f1": 0.3333333432674408,
"num_tokens": 6295979.0,
"repeat_count": 0.0,
- "routers_loss": 0.0004961033118888736,
+ "routers_loss": 0.0004760588926728815,
"skip_count": 0.0,
"step": 3906,
"text_loss": 0.666959822177887
@@ -37124,13 +37124,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0007468214058886956,
- "loss": 0.0074,
+ "loss": 0.0072,
"macro_f1": 0.3333333432674408,
"num_tokens": 6299215.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007425977964885533,
+ "routers_loss": 0.000524883100297302,
"skip_count": 0.0,
"step": 3908,
"text_loss": 0.5144801139831543
@@ -37143,13 +37143,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0007465521830103137,
- "loss": 0.0081,
+ "loss": 0.0077,
"macro_f1": 0.3333333432674408,
"num_tokens": 6302320.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015668199630454183,
+ "routers_loss": 0.0016085522947832942,
"skip_count": 0.0,
"step": 3910,
"text_loss": 0.14342890679836273
@@ -37162,13 +37162,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0007462828656526702,
- "loss": 0.0065,
+ "loss": 0.0064,
"macro_f1": 0.6666666865348816,
"num_tokens": 6305212.0,
"repeat_count": 0.0,
- "routers_loss": 0.003138904692605138,
+ "routers_loss": 0.002720315707847476,
"skip_count": 2.0,
"step": 3912,
"text_loss": 0.31109121441841125
@@ -37181,13 +37181,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.052001953125,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0007460134539189681,
- "loss": 0.0117,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 6308964.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012123063206672668,
+ "routers_loss": 0.0010418406454846263,
"skip_count": 1.0,
"step": 3914,
"text_loss": 0.5662030577659607
@@ -37200,13 +37200,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.052001953125,
"learning_rate": 0.0007457439479124459,
"loss": 0.0134,
"macro_f1": 0.3333333432674408,
"num_tokens": 6313195.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017939694225788116,
+ "routers_loss": 0.0020303844939917326,
"skip_count": 0.0,
"step": 3916,
"text_loss": 0.6358339190483093
@@ -37219,13 +37219,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.0289306640625,
"learning_rate": 0.0007454743477363797,
"loss": 0.0054,
"macro_f1": 0.3333333432674408,
"num_tokens": 6315949.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006735047209076583,
+ "routers_loss": 0.0006592223653569818,
"skip_count": 0.0,
"step": 3918,
"text_loss": 0.35648423433303833
@@ -37238,13 +37238,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.027099609375,
+ "grad_norm": 0.0262451171875,
"learning_rate": 0.0007452046534940803,
- "loss": 0.0078,
+ "loss": 0.0075,
"macro_f1": 0.6603773832321167,
"num_tokens": 6319024.0,
"repeat_count": 1.0,
- "routers_loss": 0.025279851630330086,
+ "routers_loss": 0.024555351585149765,
"skip_count": 1.0,
"step": 3920,
"text_loss": 0.21955153346061707
@@ -37257,13 +37257,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033203125,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.0007449348652888952,
- "loss": 0.007,
+ "loss": 0.0068,
"macro_f1": 0.6666666865348816,
"num_tokens": 6321633.0,
"repeat_count": 0.0,
- "routers_loss": 0.002887458074837923,
+ "routers_loss": 0.003606822807341814,
"skip_count": 1.0,
"step": 3922,
"text_loss": 0.6079489588737488
@@ -37276,13 +37276,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0007446649832242075,
"loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 6325209.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034941197372972965,
+ "routers_loss": 0.0035831446293741465,
"skip_count": 1.0,
"step": 3924,
"text_loss": 0.2774808406829834
@@ -37295,13 +37295,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03173828125,
+ "grad_norm": 0.0311279296875,
"learning_rate": 0.0007443950074034368,
- "loss": 0.0067,
+ "loss": 0.0064,
"macro_f1": 0.6666666865348816,
"num_tokens": 6327822.0,
"repeat_count": 0.0,
- "routers_loss": 0.006862608715891838,
+ "routers_loss": 0.006809544749557972,
"skip_count": 2.0,
"step": 3926,
"text_loss": 0.48236769437789917
@@ -37314,13 +37314,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0007441249379300381,
- "loss": 0.0072,
+ "loss": 0.007,
"macro_f1": 0.6601307392120361,
"num_tokens": 6331662.0,
"repeat_count": 1.0,
- "routers_loss": 0.02176409214735031,
+ "routers_loss": 0.023832591250538826,
"skip_count": 2.0,
"step": 3928,
"text_loss": 0.7287537455558777
@@ -37333,13 +37333,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.04296875,
"learning_rate": 0.0007438547749075028,
- "loss": 0.0064,
+ "loss": 0.0061,
"macro_f1": 1.0,
"num_tokens": 6335801.0,
"repeat_count": 1.0,
- "routers_loss": 0.013603253290057182,
+ "routers_loss": 0.011755098588764668,
"skip_count": 3.0,
"step": 3930,
"text_loss": 0.17253030836582184
@@ -37352,13 +37352,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0267333984375,
+ "grad_norm": 0.02685546875,
"learning_rate": 0.0007435845184393577,
- "loss": 0.0052,
+ "loss": 0.005,
"macro_f1": 0.6666666865348816,
"num_tokens": 6338747.0,
"repeat_count": 1.0,
- "routers_loss": 0.006635789293795824,
+ "routers_loss": 0.005972472485154867,
"skip_count": 0.0,
"step": 3932,
"text_loss": 0.6400216817855835
@@ -37371,13 +37371,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0361328125,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0007433141686291657,
- "loss": 0.0077,
+ "loss": 0.0075,
"macro_f1": 0.6666666865348816,
"num_tokens": 6342772.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032724342308938503,
+ "routers_loss": 0.0030393085908144712,
"skip_count": 1.0,
"step": 3934,
"text_loss": 0.6865074038505554
@@ -37390,13 +37390,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0213623046875,
+ "grad_norm": 0.020263671875,
"learning_rate": 0.0007430437255805252,
- "loss": 0.007,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 6345957.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007380369352176785,
+ "routers_loss": 0.0006984061910770833,
"skip_count": 0.0,
"step": 3936,
"text_loss": 0.40398702025413513
@@ -37409,13 +37409,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0007427731893970706,
"loss": 0.007,
"macro_f1": 0.6666666865348816,
"num_tokens": 6349162.0,
"repeat_count": 1.0,
- "routers_loss": 0.004635625518858433,
+ "routers_loss": 0.005219762213528156,
"skip_count": 0.0,
"step": 3938,
"text_loss": 0.5951031446456909
@@ -37428,13 +37428,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.043701171875,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.0007425025601824717,
- "loss": 0.0085,
+ "loss": 0.0083,
"macro_f1": 0.6666666865348816,
"num_tokens": 6352655.0,
"repeat_count": 0.0,
- "routers_loss": 0.014994140714406967,
+ "routers_loss": 0.015575960278511047,
"skip_count": 3.0,
"step": 3940,
"text_loss": 0.26689088344573975
@@ -37447,13 +37447,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0007422318380404346,
- "loss": 0.0067,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 6355890.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011694672284647822,
+ "routers_loss": 0.0012208883417770267,
"skip_count": 0.0,
"step": 3942,
"text_loss": 0.570725679397583
@@ -37466,13 +37466,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.02587890625,
+ "grad_norm": 0.0235595703125,
"learning_rate": 0.0007419610230746999,
"loss": 0.0056,
"macro_f1": 0.6666666865348816,
"num_tokens": 6358891.0,
"repeat_count": 1.0,
- "routers_loss": 0.003442608518525958,
+ "routers_loss": 0.0029412026051431894,
"skip_count": 0.0,
"step": 3944,
"text_loss": 0.5521301031112671
@@ -37485,13 +37485,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0007416901153890448,
"loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 6361586.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009970148093998432,
+ "routers_loss": 0.0010283910669386387,
"skip_count": 0.0,
"step": 3946,
"text_loss": 0.4046417772769928
@@ -37504,13 +37504,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.03955078125,
"learning_rate": 0.0007414191150872818,
- "loss": 0.0078,
+ "loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 6364954.0,
"repeat_count": 0.0,
- "routers_loss": 0.009517154656350613,
+ "routers_loss": 0.008222512900829315,
"skip_count": 2.0,
"step": 3948,
"text_loss": 0.2803446352481842
@@ -37523,13 +37523,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.03564453125,
"learning_rate": 0.0007411480222732583,
- "loss": 0.0091,
+ "loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 6367660.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012908667558804154,
+ "routers_loss": 0.001304348581470549,
"skip_count": 0.0,
"step": 3950,
"text_loss": 0.45553359389305115
@@ -37542,13 +37542,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.03759765625,
"learning_rate": 0.0007408768370508576,
- "loss": 0.0076,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 6371585.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015499353175982833,
+ "routers_loss": 0.0016345062758773565,
"skip_count": 0.0,
"step": 3952,
"text_loss": 0.25424402952194214
@@ -37561,13 +37561,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.0007406055595239986,
- "loss": 0.007,
+ "loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 6374365.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005612325621768832,
+ "routers_loss": 0.0005097290268167853,
"skip_count": 0.0,
"step": 3954,
"text_loss": 0.5856026411056519
@@ -37580,13 +37580,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07373046875,
+ "grad_norm": 0.060546875,
"learning_rate": 0.0007403341897966356,
- "loss": 0.0063,
+ "loss": 0.0061,
"macro_f1": 0.6666666865348816,
"num_tokens": 6377335.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024961072485893965,
+ "routers_loss": 0.002482263371348381,
"skip_count": 1.0,
"step": 3956,
"text_loss": 0.5145615339279175
@@ -37599,32 +37599,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0230712890625,
+ "grad_norm": 0.0245361328125,
"learning_rate": 0.0007400627279727574,
"loss": 0.0041,
"macro_f1": 0.3333333432674408,
"num_tokens": 6380799.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013171056052669883,
+ "routers_loss": 0.0011743451468646526,
"skip_count": 0.0,
"step": 3958,
"text_loss": 0.31868961453437805
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 18.591722923393014,
- "f1_execute": 0.9818181991577148,
- "f1_repeat": 0.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.027099609375,
+ "grad_norm": 0.0286865234375,
"learning_rate": 0.0007397911741563892,
- "loss": 0.0054,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 6383963.0,
"repeat_count": 1.0,
- "routers_loss": 0.012845510616898537,
+ "routers_loss": 0.009861881844699383,
"skip_count": 0.0,
"step": 3960,
"text_loss": 0.21192194521427155
@@ -37637,13 +37637,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0390625,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0007395195284515905,
- "loss": 0.0099,
+ "loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 6387410.0,
"repeat_count": 1.0,
- "routers_loss": 0.003112874459475279,
+ "routers_loss": 0.004189098719507456,
"skip_count": 0.0,
"step": 3962,
"text_loss": 0.5809708833694458
@@ -37656,13 +37656,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.0007392477909624567,
- "loss": 0.0058,
+ "loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 6390670.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019742189906537533,
+ "routers_loss": 0.001853612600825727,
"skip_count": 0.0,
"step": 3964,
"text_loss": 0.48985618352890015
@@ -37675,13 +37675,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.028076171875,
+ "grad_norm": 0.0308837890625,
"learning_rate": 0.0007389759617931182,
- "loss": 0.0066,
+ "loss": 0.0067,
"macro_f1": 0.6666666865348816,
"num_tokens": 6393609.0,
"repeat_count": 1.0,
- "routers_loss": 0.003850853070616722,
+ "routers_loss": 0.003303771372884512,
"skip_count": 0.0,
"step": 3966,
"text_loss": 0.28729453682899475
@@ -37694,13 +37694,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0007387040410477404,
- "loss": 0.0057,
+ "loss": 0.0058,
"macro_f1": 0.9452888369560242,
"num_tokens": 6396608.0,
"repeat_count": 1.0,
- "routers_loss": 0.020281648263335228,
+ "routers_loss": 0.01791577786207199,
"skip_count": 4.0,
"step": 3968,
"text_loss": 0.30386820435523987
@@ -37713,13 +37713,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0284423828125,
+ "grad_norm": 0.029541015625,
"learning_rate": 0.0007384320288305235,
- "loss": 0.0093,
+ "loss": 0.0091,
"macro_f1": 0.3333333432674408,
"num_tokens": 6399793.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005419629742391407,
+ "routers_loss": 0.0005771282012574375,
"skip_count": 0.0,
"step": 3970,
"text_loss": 0.47285011410713196
@@ -37732,13 +37732,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0291748046875,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.0007381599252457037,
- "loss": 0.0061,
+ "loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 6403365.0,
"repeat_count": 0.0,
- "routers_loss": 0.003040255280211568,
+ "routers_loss": 0.003010645741596818,
"skip_count": 0.0,
"step": 3972,
"text_loss": 0.5313063859939575
@@ -37751,32 +37751,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.000737887730397551,
"loss": 0.0061,
"macro_f1": 0.6666666865348816,
"num_tokens": 6406205.0,
"repeat_count": 1.0,
- "routers_loss": 0.006762589327991009,
+ "routers_loss": 0.006457438692450523,
"skip_count": 0.0,
"step": 3974,
"text_loss": 0.2323843240737915
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 18.666862342236573,
- "f1_execute": 0.9818181991577148,
- "f1_repeat": 0.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.0007376154443903713,
- "loss": 0.0086,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 6409552.0,
"repeat_count": 1.0,
- "routers_loss": 0.01173968706279993,
+ "routers_loss": 0.010693981312215328,
"skip_count": 0.0,
"step": 3976,
"text_loss": 0.6304101943969727
@@ -37789,13 +37789,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0007373430673285051,
"loss": 0.008,
"macro_f1": 0.3272727429866791,
"num_tokens": 6412386.0,
"repeat_count": 1.0,
- "routers_loss": 0.028297962620854378,
+ "routers_loss": 0.03116440214216709,
"skip_count": 0.0,
"step": 3978,
"text_loss": 0.23448467254638672
@@ -37808,13 +37808,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08447265625,
+ "grad_norm": 0.10009765625,
"learning_rate": 0.0007370705993163278,
- "loss": 0.011,
+ "loss": 0.0111,
"macro_f1": 0.3272727429866791,
"num_tokens": 6416054.0,
"repeat_count": 1.0,
- "routers_loss": 0.010761309415102005,
+ "routers_loss": 0.011973714455962181,
"skip_count": 0.0,
"step": 3980,
"text_loss": 0.6371755599975586
@@ -37827,13 +37827,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0458984375,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0007367980404582497,
"loss": 0.0105,
"macro_f1": 1.0,
"num_tokens": 6419238.0,
"repeat_count": 1.0,
- "routers_loss": 0.0057355971075594425,
+ "routers_loss": 0.005117347463965416,
"skip_count": 2.0,
"step": 3982,
"text_loss": 0.19822923839092255
@@ -37846,13 +37846,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0267333984375,
+ "grad_norm": 0.0296630859375,
"learning_rate": 0.0007365253908587158,
- "loss": 0.005,
+ "loss": 0.0049,
"macro_f1": 0.3333333432674408,
"num_tokens": 6422122.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011142889270558953,
+ "routers_loss": 0.0010648667812347412,
"skip_count": 0.0,
"step": 3984,
"text_loss": 0.566700279712677
@@ -37865,13 +37865,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0263671875,
+ "grad_norm": 0.025146484375,
"learning_rate": 0.0007362526506222058,
- "loss": 0.0045,
+ "loss": 0.0044,
"macro_f1": 0.3333333432674408,
"num_tokens": 6425313.0,
"repeat_count": 0.0,
- "routers_loss": 0.005405326373875141,
+ "routers_loss": 0.005726494826376438,
"skip_count": 0.0,
"step": 3986,
"text_loss": 0.6568437814712524
@@ -37884,13 +37884,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0289306640625,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0007359798198532343,
- "loss": 0.0043,
+ "loss": 0.0042,
"macro_f1": 0.6666666865348816,
"num_tokens": 6428422.0,
"repeat_count": 1.0,
- "routers_loss": 0.005449058022350073,
+ "routers_loss": 0.004504100419580936,
"skip_count": 0.0,
"step": 3988,
"text_loss": 0.598754346370697
@@ -37903,13 +37903,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.0306396484375,
"learning_rate": 0.0007357068986563509,
- "loss": 0.0083,
+ "loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 6431512.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020256424322724342,
+ "routers_loss": 0.0019837068393826485,
"skip_count": 1.0,
"step": 3990,
"text_loss": 0.7152895927429199
@@ -37922,13 +37922,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.0007354338871361393,
- "loss": 0.0084,
+ "loss": 0.0079,
"macro_f1": 0.6666666865348816,
"num_tokens": 6434358.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027240889612585306,
+ "routers_loss": 0.0026031541638076305,
"skip_count": 1.0,
"step": 3992,
"text_loss": 0.4986513555049896
@@ -37941,13 +37941,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.000735160785397218,
- "loss": 0.0061,
+ "loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 6438175.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026689881924539804,
+ "routers_loss": 0.0024831905029714108,
"skip_count": 2.0,
"step": 3994,
"text_loss": 0.4406205713748932
@@ -37960,13 +37960,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0007348875935442401,
- "loss": 0.0067,
+ "loss": 0.0065,
"macro_f1": 0.3333333432674408,
"num_tokens": 6441228.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010014307918027043,
+ "routers_loss": 0.0008635876583866775,
"skip_count": 0.0,
"step": 3996,
"text_loss": 0.48884135484695435
@@ -37979,13 +37979,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040283203125,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0007346143116818932,
- "loss": 0.0046,
+ "loss": 0.0044,
"macro_f1": 0.3333333432674408,
"num_tokens": 6444318.0,
"repeat_count": 0.0,
- "routers_loss": 0.004282998852431774,
+ "routers_loss": 0.004007008858025074,
"skip_count": 0.0,
"step": 3998,
"text_loss": 0.6669428944587708
@@ -37998,13 +37998,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.08203125,
"learning_rate": 0.0007343409399148994,
- "loss": 0.0092,
+ "loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 6448317.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031171543523669243,
+ "routers_loss": 0.0031380734872072935,
"skip_count": 0.0,
"step": 4000,
"text_loss": 0.6468493938446045
diff --git a/checkpoint-4000/training_args.bin b/checkpoint-4000/training_args.bin
index deeea733277b4031781a5b299881dd8e675e7606..a3d3ae372faf14539639f54454aa52b6ee730c4a 100644
--- a/checkpoint-4000/training_args.bin
+++ b/checkpoint-4000/training_args.bin
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:0b3f5975f57762b552c7ee29776bf32a4dbb125781a0658488d3884fb25c5296
+oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8
size 5880
diff --git a/checkpoint-5000/model-00002-of-00002.safetensors b/checkpoint-5000/model-00002-of-00002.safetensors
index e5a18c77927f4cd1f054dbebaaef463f9874696a..4fd8e45d2e43ad1cbdd82dc191486dde3af7361c 100644
--- a/checkpoint-5000/model-00002-of-00002.safetensors
+++ b/checkpoint-5000/model-00002-of-00002.safetensors
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:ae38e7cc34de9f085dfca6da945b517d6b7201575652933d0a76b280ef98f026
+oid sha256:b1d3406d823631d0e3b7578d45263f036789513b216f560f8fe554a64de4a525
size 1481790520
diff --git a/checkpoint-5000/optimizer.pt b/checkpoint-5000/optimizer.pt
index d2981fc2869c3e6703d3563af98c953ca08d3dc3..30db85d1e2490cf65b2bb12085834ba01ef021de 100644
--- a/checkpoint-5000/optimizer.pt
+++ b/checkpoint-5000/optimizer.pt
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:3af43e11f60ae0f6ec6be0f47432d6dc5a652d6babd15a6e7d17e30412a5def2
+oid sha256:523375eef8a0adbf6e87d5f4658cef29f76781043100ad34e1f31232264003f7
size 44191162
diff --git a/checkpoint-5000/trainer_state.json b/checkpoint-5000/trainer_state.json
index 825058617d82e0beb9ce3322b1a8231a1ec1418f..d29ad111e10aba5f6b374584df732a732758afa1 100644
--- a/checkpoint-5000/trainer_state.json
+++ b/checkpoint-5000/trainer_state.json
@@ -12,18 +12,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 31.0,
+ "avg_layers": 25.0,
"epoch": 0.009392427355444672,
- "f1_execute": 0.4864864945411682,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.40625,
+ "grad_norm": 2.25,
"learning_rate": 2e-06,
- "loss": 0.5484,
- "macro_f1": 0.1621621698141098,
+ "loss": 0.4974,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 3175.0,
"repeat_count": 0.0,
- "routers_loss": 0.503563642501831,
+ "routers_loss": 0.4339469373226166,
"skip_count": 0.0,
"step": 2,
"text_loss": 0.3330848515033722
@@ -31,18 +31,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 23.0,
"epoch": 0.018784854710889344,
- "f1_execute": 0.4864864945411682,
+ "f1_execute": 0.7272726893424988,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.9140625,
+ "grad_norm": 1.8359375,
"learning_rate": 6e-06,
- "loss": 0.536,
- "macro_f1": 0.1621621698141098,
+ "loss": 0.4988,
+ "macro_f1": 0.24242423474788666,
"num_tokens": 5816.0,
"repeat_count": 0.0,
- "routers_loss": 0.4589468538761139,
+ "routers_loss": 0.4511934816837311,
"skip_count": 1.0,
"step": 4,
"text_loss": 0.4571273922920227
@@ -50,37 +50,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 32.0,
+ "avg_layers": 28.0,
"epoch": 0.02817728206633402,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.6666666865348816,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.375,
+ "grad_norm": 2.234375,
"learning_rate": 1e-05,
- "loss": 0.5469,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.5113,
+ "macro_f1": 0.222222238779068,
"num_tokens": 9739.0,
"repeat_count": 0.0,
- "routers_loss": 0.5736724138259888,
+ "routers_loss": 0.49306994676589966,
"skip_count": 0.0,
"step": 6,
"text_loss": 0.41060560941696167
},
{
- "acc_repeat": 1.0,
- "acc_skip": 0.5,
- "avg_layers": 33.0,
+ "acc_repeat": 0.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 0.03756970942177869,
- "f1_execute": 0.47058823704719543,
- "f1_repeat": 0.1538461595773697,
- "f1_skip": 0.222222238779068,
- "grad_norm": 1.8515625,
+ "f1_execute": 0.5641025900840759,
+ "f1_repeat": 0.0,
+ "f1_skip": 0.0,
+ "grad_norm": 1.7265625,
"learning_rate": 1.4e-05,
- "loss": 0.5291,
- "macro_f1": 0.28221890330314636,
+ "loss": 0.4766,
+ "macro_f1": 0.18803420662879944,
"num_tokens": 12869.0,
"repeat_count": 1.0,
- "routers_loss": 0.49970296025276184,
+ "routers_loss": 0.48872503638267517,
"skip_count": 2.0,
"step": 8,
"text_loss": 0.36678561568260193
@@ -88,37 +88,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 27.0,
"epoch": 0.046962136777223364,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.953125,
+ "grad_norm": 1.78125,
"learning_rate": 1.8e-05,
- "loss": 0.5316,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4806,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 15845.0,
"repeat_count": 0.0,
- "routers_loss": 0.5153562426567078,
+ "routers_loss": 0.45077216625213623,
"skip_count": 0.0,
"step": 10,
"text_loss": 0.5597779154777527
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 0.5,
"acc_skip": 0.3333333432674408,
- "avg_layers": 34.0,
+ "avg_layers": 26.0,
"epoch": 0.05635456413266804,
- "f1_execute": 0.5714285373687744,
- "f1_repeat": 0.0,
- "f1_skip": 0.25,
- "grad_norm": 1.6328125,
+ "f1_execute": 0.7179487347602844,
+ "f1_repeat": 0.2857142984867096,
+ "f1_skip": 0.20000000298023224,
+ "grad_norm": 1.5390625,
"learning_rate": 2.2e-05,
- "loss": 0.5051,
- "macro_f1": 0.2738095223903656,
+ "loss": 0.4557,
+ "macro_f1": 0.40122103691101074,
"num_tokens": 19353.0,
"repeat_count": 2.0,
- "routers_loss": 0.46214747428894043,
+ "routers_loss": 0.4130440056324005,
"skip_count": 3.0,
"step": 12,
"text_loss": 0.2056603729724884
@@ -126,37 +126,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 27.0,
"epoch": 0.06574699148811271,
- "f1_execute": 0.5263157486915588,
+ "f1_execute": 0.6976743936538696,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 2.671875,
+ "grad_norm": 2.4375,
"learning_rate": 2.6e-05,
- "loss": 0.5653,
- "macro_f1": 0.17543858289718628,
+ "loss": 0.5129,
+ "macro_f1": 0.23255813121795654,
"num_tokens": 22675.0,
"repeat_count": 0.0,
- "routers_loss": 0.5300976634025574,
+ "routers_loss": 0.4582902193069458,
"skip_count": 0.0,
"step": 14,
"text_loss": 0.32989829778671265
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 34.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 0.07513941884355738,
- "f1_execute": 0.6153846383094788,
+ "f1_execute": 0.6829268336296082,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 1.8828125,
+ "f1_skip": 0.2222222238779068,
+ "grad_norm": 1.7421875,
"learning_rate": 3e-05,
- "loss": 0.5225,
- "macro_f1": 0.20512822270393372,
+ "loss": 0.4729,
+ "macro_f1": 0.3017163574695587,
"num_tokens": 26022.0,
"repeat_count": 0.0,
- "routers_loss": 0.473240464925766,
+ "routers_loss": 0.42910993099212646,
"skip_count": 1.0,
"step": 16,
"text_loss": 0.1353905349969864
@@ -164,18 +164,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 38.0,
+ "avg_layers": 27.0,
"epoch": 0.08453184619900206,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.7555555105209351,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.6015625,
+ "grad_norm": 1.4765625,
"learning_rate": 3.4000000000000007e-05,
- "loss": 0.4867,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4274,
+ "macro_f1": 0.2518518567085266,
"num_tokens": 29251.0,
"repeat_count": 0.0,
- "routers_loss": 0.4795944094657898,
+ "routers_loss": 0.3990713059902191,
"skip_count": 0.0,
"step": 18,
"text_loss": 0.3806765377521515
@@ -183,18 +183,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 36.0,
+ "avg_layers": 26.0,
"epoch": 0.09392427355444673,
- "f1_execute": 0.6153846383094788,
- "f1_repeat": 0.1538461595773697,
+ "f1_execute": 0.6829268336296082,
+ "f1_repeat": 0.2857142984867096,
"f1_skip": 0.0,
- "grad_norm": 1.3984375,
+ "grad_norm": 1.3125,
"learning_rate": 3.8e-05,
- "loss": 0.4718,
- "macro_f1": 0.25641027092933655,
+ "loss": 0.4261,
+ "macro_f1": 0.3228803873062134,
"num_tokens": 32545.0,
"repeat_count": 1.0,
- "routers_loss": 0.41872408986091614,
+ "routers_loss": 0.40146592259407043,
"skip_count": 0.0,
"step": 20,
"text_loss": 0.25648367404937744
@@ -202,18 +202,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 26.0,
"epoch": 0.1033167009098914,
- "f1_execute": 0.6341463327407837,
+ "f1_execute": 0.7272727489471436,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.7734375,
+ "grad_norm": 1.625,
"learning_rate": 4.2000000000000004e-05,
- "loss": 0.4472,
- "macro_f1": 0.21138212084770203,
+ "loss": 0.404,
+ "macro_f1": 0.24242424964904785,
"num_tokens": 36560.0,
"repeat_count": 0.0,
- "routers_loss": 0.4152105450630188,
+ "routers_loss": 0.372715026140213,
"skip_count": 0.0,
"step": 22,
"text_loss": 0.2799522578716278
@@ -221,18 +221,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 32.0,
+ "avg_layers": 27.0,
"epoch": 0.11270912826533608,
- "f1_execute": 0.5999999642372131,
+ "f1_execute": 0.7555555105209351,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.8046875,
+ "grad_norm": 1.6328125,
"learning_rate": 4.6e-05,
- "loss": 0.4554,
- "macro_f1": 0.19999998807907104,
+ "loss": 0.4218,
+ "macro_f1": 0.2518518567085266,
"num_tokens": 39597.0,
"repeat_count": 0.0,
- "routers_loss": 0.47541096806526184,
+ "routers_loss": 0.4504941403865814,
"skip_count": 0.0,
"step": 24,
"text_loss": 0.6635695695877075
@@ -240,18 +240,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 34.0,
+ "avg_layers": 27.0,
"epoch": 0.12210155562078075,
- "f1_execute": 0.7826087474822998,
+ "f1_execute": 0.8085106015205383,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.875,
+ "grad_norm": 1.7109375,
"learning_rate": 5e-05,
- "loss": 0.4182,
- "macro_f1": 0.2608695924282074,
+ "loss": 0.3886,
+ "macro_f1": 0.26950353384017944,
"num_tokens": 43080.0,
"repeat_count": 0.0,
- "routers_loss": 0.37319275736808777,
+ "routers_loss": 0.3498791456222534,
"skip_count": 0.0,
"step": 26,
"text_loss": 0.7035041451454163
@@ -259,18 +259,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 0.13149398297622542,
- "f1_execute": 0.7826087474822998,
+ "f1_execute": 0.8085106015205383,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.4375,
+ "grad_norm": 1.34375,
"learning_rate": 5.4e-05,
- "loss": 0.3991,
- "macro_f1": 0.2608695924282074,
+ "loss": 0.3724,
+ "macro_f1": 0.26950353384017944,
"num_tokens": 46406.0,
"repeat_count": 0.0,
- "routers_loss": 0.3604123294353485,
+ "routers_loss": 0.31265875697135925,
"skip_count": 0.0,
"step": 28,
"text_loss": 0.6388277411460876
@@ -280,16 +280,16 @@
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.1408864103316701,
- "f1_execute": 0.8979591727256775,
+ "f1_execute": 0.8571428060531616,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.421875,
+ "grad_norm": 1.2578125,
"learning_rate": 5.800000000000001e-05,
- "loss": 0.3827,
- "macro_f1": 0.2993197441101074,
+ "loss": 0.341,
+ "macro_f1": 0.2857142686843872,
"num_tokens": 49966.0,
"repeat_count": 0.0,
- "routers_loss": 0.35880225896835327,
+ "routers_loss": 0.3200918138027191,
"skip_count": 2.0,
"step": 30,
"text_loss": 0.17372547090053558
@@ -297,18 +297,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 24.0,
+ "avg_layers": 25.0,
"epoch": 0.15027883768711475,
- "f1_execute": 0.9200000166893005,
+ "f1_execute": 0.8571428060531616,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.4609375,
+ "grad_norm": 1.4140625,
"learning_rate": 6.2e-05,
- "loss": 0.3452,
- "macro_f1": 0.30666667222976685,
+ "loss": 0.3207,
+ "macro_f1": 0.2857142686843872,
"num_tokens": 53378.0,
"repeat_count": 1.0,
- "routers_loss": 0.31086465716362,
+ "routers_loss": 0.32304447889328003,
"skip_count": 1.0,
"step": 32,
"text_loss": 0.18196581304073334
@@ -316,18 +316,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 25.0,
"epoch": 0.15967126504255943,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.3671875,
+ "grad_norm": 1.46875,
"learning_rate": 6.6e-05,
- "loss": 0.3283,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.3304,
+ "macro_f1": 0.3006536364555359,
"num_tokens": 56933.0,
"repeat_count": 0.0,
- "routers_loss": 0.2674171030521393,
+ "routers_loss": 0.24814388155937195,
"skip_count": 0.0,
"step": 34,
"text_loss": 0.28823015093803406
@@ -335,18 +335,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.16906369239800412,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.1015625,
+ "grad_norm": 1.1171875,
"learning_rate": 7.000000000000001e-05,
- "loss": 0.2849,
- "macro_f1": 0.3205128312110901,
+ "loss": 0.2778,
+ "macro_f1": 0.3006536066532135,
"num_tokens": 60744.0,
"repeat_count": 1.0,
- "routers_loss": 0.24587315320968628,
+ "routers_loss": 0.22411039471626282,
"skip_count": 0.0,
"step": 36,
"text_loss": 0.5260357856750488
@@ -354,18 +354,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 31.0,
+ "avg_layers": 27.0,
"epoch": 0.17845611975344877,
- "f1_execute": 0.8085106015205383,
+ "f1_execute": 0.8571428656578064,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.3046875,
+ "grad_norm": 1.484375,
"learning_rate": 7.4e-05,
- "loss": 0.2616,
- "macro_f1": 0.26950353384017944,
+ "loss": 0.2738,
+ "macro_f1": 0.2857142984867096,
"num_tokens": 64900.0,
"repeat_count": 0.0,
- "routers_loss": 0.32050269842147827,
+ "routers_loss": 0.44355395436286926,
"skip_count": 0.0,
"step": 38,
"text_loss": 0.5382097363471985
@@ -373,18 +373,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.18784854710889345,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 1.1796875,
+ "grad_norm": 1.3828125,
"learning_rate": 7.8e-05,
- "loss": 0.2084,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.2137,
+ "macro_f1": 0.3076923191547394,
"num_tokens": 68000.0,
"repeat_count": 0.0,
- "routers_loss": 0.15196125209331512,
+ "routers_loss": 0.202330082654953,
"skip_count": 0.0,
"step": 40,
"text_loss": 0.5946118831634521
@@ -392,18 +392,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 25.0,
"epoch": 0.19724097446433814,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.61328125,
+ "grad_norm": 0.78125,
"learning_rate": 8.2e-05,
- "loss": 0.1947,
+ "loss": 0.21,
"macro_f1": 0.3144654333591461,
"num_tokens": 70529.0,
"repeat_count": 0.0,
- "routers_loss": 0.14121046662330627,
+ "routers_loss": 0.18023855984210968,
"skip_count": 0.0,
"step": 42,
"text_loss": 0.5550904273986816
@@ -416,13 +416,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.50390625,
+ "grad_norm": 0.609375,
"learning_rate": 8.599999999999999e-05,
- "loss": 0.1884,
+ "loss": 0.1918,
"macro_f1": 0.32098764181137085,
"num_tokens": 73427.0,
"repeat_count": 2.0,
- "routers_loss": 0.21312278509140015,
+ "routers_loss": 0.2101590931415558,
"skip_count": 0.0,
"step": 44,
"text_loss": 0.4636923372745514
@@ -435,13 +435,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.45703125,
+ "grad_norm": 0.53125,
"learning_rate": 8.999999999999999e-05,
- "loss": 0.166,
+ "loss": 0.1881,
"macro_f1": 0.3333333432674408,
"num_tokens": 76472.0,
"repeat_count": 0.0,
- "routers_loss": 0.1184137836098671,
+ "routers_loss": 0.11800424009561539,
"skip_count": 0.0,
"step": 46,
"text_loss": 0.4187001883983612
@@ -454,13 +454,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.62890625,
+ "grad_norm": 0.953125,
"learning_rate": 9.400000000000001e-05,
- "loss": 0.1313,
+ "loss": 0.1446,
"macro_f1": 0.3272727429866791,
"num_tokens": 79124.0,
"repeat_count": 1.0,
- "routers_loss": 0.10897563397884369,
+ "routers_loss": 0.11632519960403442,
"skip_count": 0.0,
"step": 48,
"text_loss": 0.2253919243812561
@@ -468,18 +468,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 0.2348106838861168,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.4375,
+ "grad_norm": 0.58984375,
"learning_rate": 9.800000000000001e-05,
- "loss": 0.1531,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.1543,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 81980.0,
"repeat_count": 1.0,
- "routers_loss": 0.09979952871799469,
+ "routers_loss": 0.09669367223978043,
"skip_count": 0.0,
"step": 50,
"text_loss": 0.6053179502487183
@@ -487,18 +487,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 0.2442031112415615,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.515625,
+ "grad_norm": 0.8515625,
"learning_rate": 0.000102,
- "loss": 0.1265,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.1393,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 85236.0,
"repeat_count": 0.0,
- "routers_loss": 0.05543195456266403,
+ "routers_loss": 0.12471720576286316,
"skip_count": 0.0,
"step": 52,
"text_loss": 0.6027331948280334
@@ -511,13 +511,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.328125,
+ "grad_norm": 0.421875,
"learning_rate": 0.000106,
- "loss": 0.1436,
+ "loss": 0.1473,
"macro_f1": 0.32098764181137085,
"num_tokens": 88238.0,
"repeat_count": 0.0,
- "routers_loss": 0.15049344301223755,
+ "routers_loss": 0.1376056969165802,
"skip_count": 2.0,
"step": 54,
"text_loss": 0.2861751616001129
@@ -530,13 +530,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.263671875,
+ "grad_norm": 0.35546875,
"learning_rate": 0.00011,
- "loss": 0.1021,
+ "loss": 0.1082,
"macro_f1": 0.3333333432674408,
"num_tokens": 91056.0,
"repeat_count": 0.0,
- "routers_loss": 0.07367338240146637,
+ "routers_loss": 0.07449393719434738,
"skip_count": 0.0,
"step": 56,
"text_loss": 0.48106974363327026
@@ -544,18 +544,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 26.0,
"epoch": 0.2723803933078955,
- "f1_execute": 1.0,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000114,
- "loss": 0.114,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.1123,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 94987.0,
"repeat_count": 0.0,
- "routers_loss": 0.03782692551612854,
+ "routers_loss": 0.07064720243215561,
"skip_count": 0.0,
"step": 58,
"text_loss": 0.3554874658584595
@@ -568,13 +568,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.333984375,
+ "grad_norm": 0.5390625,
"learning_rate": 0.000118,
- "loss": 0.1197,
+ "loss": 0.1234,
"macro_f1": 0.32098764181137085,
"num_tokens": 97909.0,
"repeat_count": 0.0,
- "routers_loss": 0.14074955880641937,
+ "routers_loss": 0.16835889220237732,
"skip_count": 2.0,
"step": 60,
"text_loss": 0.5475804805755615
@@ -587,13 +587,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000122,
- "loss": 0.1174,
+ "loss": 0.1224,
"macro_f1": 0.3333333432674408,
"num_tokens": 101043.0,
"repeat_count": 0.0,
- "routers_loss": 0.058013737201690674,
+ "routers_loss": 0.06127442046999931,
"skip_count": 0.0,
"step": 62,
"text_loss": 0.5966938734054565
@@ -606,13 +606,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000126,
- "loss": 0.0911,
+ "loss": 0.0931,
"macro_f1": 0.3333333432674408,
"num_tokens": 104103.0,
"repeat_count": 0.0,
- "routers_loss": 0.04936821386218071,
+ "routers_loss": 0.047825805842876434,
"skip_count": 0.0,
"step": 64,
"text_loss": 0.5480486750602722
@@ -625,13 +625,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.220703125,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.00013000000000000002,
- "loss": 0.1107,
+ "loss": 0.1088,
"macro_f1": 0.3006536364555359,
"num_tokens": 107009.0,
"repeat_count": 1.0,
- "routers_loss": 0.2628525495529175,
+ "routers_loss": 0.275174081325531,
"skip_count": 4.0,
"step": 66,
"text_loss": 0.41714492440223694
@@ -644,13 +644,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.000134,
- "loss": 0.1109,
+ "loss": 0.1123,
"macro_f1": 0.3333333432674408,
"num_tokens": 110486.0,
"repeat_count": 0.0,
- "routers_loss": 0.02859785594046116,
+ "routers_loss": 0.029025178402662277,
"skip_count": 0.0,
"step": 68,
"text_loss": 0.6775627732276917
@@ -663,13 +663,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.298828125,
+ "grad_norm": 0.314453125,
"learning_rate": 0.00013800000000000002,
- "loss": 0.1067,
+ "loss": 0.1049,
"macro_f1": 0.3272727429866791,
"num_tokens": 113878.0,
"repeat_count": 0.0,
- "routers_loss": 0.10459086298942566,
+ "routers_loss": 0.10141710191965103,
"skip_count": 1.0,
"step": 70,
"text_loss": 0.6678873896598816
@@ -682,13 +682,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2109375,
+ "grad_norm": 0.248046875,
"learning_rate": 0.00014199999999999998,
- "loss": 0.1166,
+ "loss": 0.1119,
"macro_f1": 0.3272727429866791,
"num_tokens": 116989.0,
"repeat_count": 0.0,
- "routers_loss": 0.0718551054596901,
+ "routers_loss": 0.08002066612243652,
"skip_count": 1.0,
"step": 72,
"text_loss": 0.405692994594574
@@ -701,13 +701,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1787109375,
"learning_rate": 0.000146,
- "loss": 0.1007,
+ "loss": 0.0944,
"macro_f1": 0.3144654333591461,
"num_tokens": 119883.0,
"repeat_count": 0.0,
- "routers_loss": 0.1850946843624115,
+ "routers_loss": 0.1867009848356247,
"skip_count": 3.0,
"step": 74,
"text_loss": 0.44616150856018066
@@ -720,13 +720,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.34375,
+ "grad_norm": 0.333984375,
"learning_rate": 0.00015,
- "loss": 0.1019,
+ "loss": 0.1003,
"macro_f1": 0.32098764181137085,
"num_tokens": 123325.0,
"repeat_count": 0.0,
- "routers_loss": 0.09809529036283493,
+ "routers_loss": 0.07042168825864792,
"skip_count": 2.0,
"step": 76,
"text_loss": 0.11340200901031494
@@ -739,13 +739,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.259765625,
+ "grad_norm": 0.26171875,
"learning_rate": 0.000154,
- "loss": 0.1088,
+ "loss": 0.1066,
"macro_f1": 0.32098764181137085,
"num_tokens": 126131.0,
"repeat_count": 0.0,
- "routers_loss": 0.11277207732200623,
+ "routers_loss": 0.11535373330116272,
"skip_count": 2.0,
"step": 78,
"text_loss": 0.3269135355949402
@@ -758,13 +758,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "grad_norm": 0.255859375,
"learning_rate": 0.000158,
- "loss": 0.0866,
+ "loss": 0.0891,
"macro_f1": 0.3272727429866791,
"num_tokens": 130349.0,
"repeat_count": 0.0,
- "routers_loss": 0.09079254418611526,
+ "routers_loss": 0.09497501701116562,
"skip_count": 1.0,
"step": 80,
"text_loss": 0.15273472666740417
@@ -777,13 +777,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000162,
- "loss": 0.0928,
+ "loss": 0.0929,
"macro_f1": 0.3333333432674408,
"num_tokens": 133607.0,
"repeat_count": 0.0,
- "routers_loss": 0.02900076098740101,
+ "routers_loss": 0.030639523640275,
"skip_count": 0.0,
"step": 82,
"text_loss": 0.282884806394577
@@ -796,13 +796,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.00016600000000000002,
- "loss": 0.1251,
+ "loss": 0.1254,
"macro_f1": 0.3272727429866791,
"num_tokens": 136694.0,
"repeat_count": 0.0,
- "routers_loss": 0.0763339251279831,
+ "routers_loss": 0.07906441390514374,
"skip_count": 1.0,
"step": 84,
"text_loss": 0.459094375371933
@@ -817,11 +817,11 @@
"f1_skip": 0.0,
"grad_norm": 0.212890625,
"learning_rate": 0.00017,
- "loss": 0.1064,
+ "loss": 0.1071,
"macro_f1": 0.3144654333591461,
"num_tokens": 139966.0,
"repeat_count": 1.0,
- "routers_loss": 0.13191410899162292,
+ "routers_loss": 0.1124570444226265,
"skip_count": 2.0,
"step": 86,
"text_loss": 0.29985448718070984
@@ -834,13 +834,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.25390625,
"learning_rate": 0.000174,
- "loss": 0.1055,
+ "loss": 0.1031,
"macro_f1": 0.32098764181137085,
"num_tokens": 142788.0,
"repeat_count": 2.0,
- "routers_loss": 0.21200031042099,
+ "routers_loss": 0.1966402679681778,
"skip_count": 0.0,
"step": 88,
"text_loss": 0.6435291767120361
@@ -853,13 +853,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.318359375,
+ "grad_norm": 0.349609375,
"learning_rate": 0.000178,
- "loss": 0.0971,
+ "loss": 0.0963,
"macro_f1": 0.3333333432674408,
"num_tokens": 146192.0,
"repeat_count": 0.0,
- "routers_loss": 0.031911369413137436,
+ "routers_loss": 0.0325632207095623,
"skip_count": 0.0,
"step": 90,
"text_loss": 0.35170626640319824
@@ -872,13 +872,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.2265625,
"learning_rate": 0.000182,
- "loss": 0.1056,
+ "loss": 0.1073,
"macro_f1": 0.32098764181137085,
"num_tokens": 149792.0,
"repeat_count": 1.0,
- "routers_loss": 0.14131835103034973,
+ "routers_loss": 0.15115146338939667,
"skip_count": 1.0,
"step": 92,
"text_loss": 0.83159339427948
@@ -891,13 +891,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.205078125,
"learning_rate": 0.000186,
- "loss": 0.1059,
+ "loss": 0.1073,
"macro_f1": 0.3333333432674408,
"num_tokens": 152766.0,
"repeat_count": 0.0,
- "routers_loss": 0.04137955233454704,
+ "routers_loss": 0.043313540518283844,
"skip_count": 0.0,
"step": 94,
"text_loss": 0.49707934260368347
@@ -910,13 +910,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.00019,
- "loss": 0.0934,
+ "loss": 0.0947,
"macro_f1": 0.3333333432674408,
"num_tokens": 156112.0,
"repeat_count": 0.0,
- "routers_loss": 0.03163003921508789,
+ "routers_loss": 0.032021280378103256,
"skip_count": 0.0,
"step": 96,
"text_loss": 0.27608928084373474
@@ -929,13 +929,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.2099609375,
"learning_rate": 0.000194,
- "loss": 0.0847,
+ "loss": 0.0846,
"macro_f1": 0.3076923191547394,
"num_tokens": 159454.0,
"repeat_count": 2.0,
- "routers_loss": 0.2567490339279175,
+ "routers_loss": 0.24473154544830322,
"skip_count": 2.0,
"step": 98,
"text_loss": 0.6026689410209656
@@ -948,13 +948,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.30859375,
+ "grad_norm": 0.271484375,
"learning_rate": 0.00019800000000000002,
- "loss": 0.1077,
+ "loss": 0.1028,
"macro_f1": 0.32098764181137085,
"num_tokens": 163661.0,
"repeat_count": 0.0,
- "routers_loss": 0.11468870937824249,
+ "routers_loss": 0.11468276381492615,
"skip_count": 2.0,
"step": 100,
"text_loss": 0.46733155846595764
@@ -967,13 +967,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.17578125,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.000202,
- "loss": 0.1131,
+ "loss": 0.1089,
"macro_f1": 0.3333333432674408,
"num_tokens": 167134.0,
"repeat_count": 0.0,
- "routers_loss": 0.02124219387769699,
+ "routers_loss": 0.021144939586520195,
"skip_count": 0.0,
"step": 102,
"text_loss": 0.6362994909286499
@@ -986,13 +986,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000206,
- "loss": 0.0624,
+ "loss": 0.0621,
"macro_f1": 0.3272727429866791,
"num_tokens": 170433.0,
"repeat_count": 0.0,
- "routers_loss": 0.06983796507120132,
+ "routers_loss": 0.06594710797071457,
"skip_count": 1.0,
"step": 104,
"text_loss": 0.4515477120876312
@@ -1005,13 +1005,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.00021,
- "loss": 0.0951,
+ "loss": 0.0929,
"macro_f1": 0.3333333432674408,
"num_tokens": 173387.0,
"repeat_count": 0.0,
- "routers_loss": 0.03467355668544769,
+ "routers_loss": 0.032923027873039246,
"skip_count": 0.0,
"step": 106,
"text_loss": 0.6638453006744385
@@ -1024,13 +1024,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.240234375,
"learning_rate": 0.000214,
- "loss": 0.0881,
+ "loss": 0.0883,
"macro_f1": 0.3272727429866791,
"num_tokens": 176170.0,
"repeat_count": 1.0,
- "routers_loss": 0.08142061531543732,
+ "routers_loss": 0.08034781366586685,
"skip_count": 0.0,
"step": 108,
"text_loss": 1.186936855316162
@@ -1043,13 +1043,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.267578125,
"learning_rate": 0.000218,
- "loss": 0.0795,
+ "loss": 0.0794,
"macro_f1": 0.3272727429866791,
"num_tokens": 179877.0,
"repeat_count": 0.0,
- "routers_loss": 0.08327355235815048,
+ "routers_loss": 0.07814185321331024,
"skip_count": 1.0,
"step": 110,
"text_loss": 0.5488709211349487
@@ -1062,13 +1062,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000222,
- "loss": 0.0943,
+ "loss": 0.0946,
"macro_f1": 0.3333333432674408,
"num_tokens": 182726.0,
"repeat_count": 0.0,
- "routers_loss": 0.019890006631612778,
+ "routers_loss": 0.01884695515036583,
"skip_count": 0.0,
"step": 112,
"text_loss": 0.5195863842964172
@@ -1081,13 +1081,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2001953125,
+ "grad_norm": 0.19921875,
"learning_rate": 0.00022600000000000002,
- "loss": 0.0933,
+ "loss": 0.0974,
"macro_f1": 0.32098764181137085,
"num_tokens": 185624.0,
"repeat_count": 0.0,
- "routers_loss": 0.09992363303899765,
+ "routers_loss": 0.09657823294401169,
"skip_count": 2.0,
"step": 114,
"text_loss": 0.43858134746551514
@@ -1100,13 +1100,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.3046875,
"learning_rate": 0.00023,
- "loss": 0.0762,
+ "loss": 0.0753,
"macro_f1": 0.3333333432674408,
"num_tokens": 188155.0,
"repeat_count": 0.0,
- "routers_loss": 0.014119029976427555,
+ "routers_loss": 0.01463601179420948,
"skip_count": 0.0,
"step": 116,
"text_loss": 0.392981618642807
@@ -1119,13 +1119,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.423828125,
+ "grad_norm": 0.439453125,
"learning_rate": 0.00023400000000000002,
- "loss": 0.0842,
+ "loss": 0.0843,
"macro_f1": 0.3333333432674408,
"num_tokens": 190970.0,
"repeat_count": 0.0,
- "routers_loss": 0.03976766765117645,
+ "routers_loss": 0.03859659656882286,
"skip_count": 0.0,
"step": 118,
"text_loss": 0.309179425239563
@@ -1138,13 +1138,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.2255859375,
"learning_rate": 0.00023799999999999998,
- "loss": 0.0517,
+ "loss": 0.053,
"macro_f1": 0.3333333432674408,
"num_tokens": 193988.0,
"repeat_count": 0.0,
- "routers_loss": 0.017428619787096977,
+ "routers_loss": 0.019092386588454247,
"skip_count": 0.0,
"step": 120,
"text_loss": 0.48543134331703186
@@ -1157,13 +1157,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.296875,
+ "grad_norm": 0.35546875,
"learning_rate": 0.000242,
- "loss": 0.1134,
+ "loss": 0.1203,
"macro_f1": 0.3272727429866791,
"num_tokens": 196475.0,
"repeat_count": 0.0,
- "routers_loss": 0.06965513527393341,
+ "routers_loss": 0.0619138665497303,
"skip_count": 1.0,
"step": 122,
"text_loss": 0.4615364074707031
@@ -1176,13 +1176,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1875,
"learning_rate": 0.000246,
- "loss": 0.0984,
+ "loss": 0.1002,
"macro_f1": 0.3272727429866791,
"num_tokens": 200045.0,
"repeat_count": 1.0,
- "routers_loss": 0.10476501286029816,
+ "routers_loss": 0.09752107411623001,
"skip_count": 0.0,
"step": 124,
"text_loss": 0.15802054107189178
@@ -1195,13 +1195,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.00025,
- "loss": 0.0771,
+ "loss": 0.0773,
"macro_f1": 0.3333333432674408,
"num_tokens": 203214.0,
"repeat_count": 0.0,
- "routers_loss": 0.028317544609308243,
+ "routers_loss": 0.02896115928888321,
"skip_count": 0.0,
"step": 126,
"text_loss": 0.4543360471725464
@@ -1214,13 +1214,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.390625,
+ "grad_norm": 0.4296875,
"learning_rate": 0.000254,
- "loss": 0.0933,
+ "loss": 0.0973,
"macro_f1": 0.3333333432674408,
"num_tokens": 206168.0,
"repeat_count": 0.0,
- "routers_loss": 0.012766432017087936,
+ "routers_loss": 0.011423567309975624,
"skip_count": 0.0,
"step": 128,
"text_loss": 0.4730179011821747
@@ -1233,13 +1233,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.365234375,
"learning_rate": 0.00025800000000000004,
- "loss": 0.0989,
+ "loss": 0.099,
"macro_f1": 0.3333333432674408,
"num_tokens": 209907.0,
"repeat_count": 0.0,
- "routers_loss": 0.021400077268481255,
+ "routers_loss": 0.01957600563764572,
"skip_count": 0.0,
"step": 130,
"text_loss": 0.45122358202934265
@@ -1252,13 +1252,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.2060546875,
"learning_rate": 0.000262,
- "loss": 0.0873,
+ "loss": 0.0868,
"macro_f1": 0.3272727429866791,
"num_tokens": 213521.0,
"repeat_count": 0.0,
- "routers_loss": 0.05025051161646843,
+ "routers_loss": 0.04882373288273811,
"skip_count": 1.0,
"step": 132,
"text_loss": 0.4341491758823395
@@ -1271,13 +1271,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.000266,
- "loss": 0.085,
+ "loss": 0.0834,
"macro_f1": 0.3333333432674408,
"num_tokens": 216484.0,
"repeat_count": 0.0,
- "routers_loss": 0.017420046031475067,
+ "routers_loss": 0.016083380207419395,
"skip_count": 0.0,
"step": 134,
"text_loss": 0.46990111470222473
@@ -1290,13 +1290,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2041015625,
+ "grad_norm": 0.220703125,
"learning_rate": 0.00027,
- "loss": 0.086,
+ "loss": 0.0863,
"macro_f1": 0.3333333432674408,
"num_tokens": 219398.0,
"repeat_count": 0.0,
- "routers_loss": 0.018217921257019043,
+ "routers_loss": 0.01733536459505558,
"skip_count": 0.0,
"step": 136,
"text_loss": 0.4455361068248749
@@ -1309,13 +1309,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.00027400000000000005,
- "loss": 0.0985,
+ "loss": 0.0997,
"macro_f1": 0.3333333432674408,
"num_tokens": 222430.0,
"repeat_count": 0.0,
- "routers_loss": 0.012350660748779774,
+ "routers_loss": 0.01332803163677454,
"skip_count": 0.0,
"step": 138,
"text_loss": 0.47699397802352905
@@ -1328,13 +1328,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.302734375,
+ "grad_norm": 0.333984375,
"learning_rate": 0.00027800000000000004,
"loss": 0.0922,
"macro_f1": 0.3144654333591461,
"num_tokens": 225458.0,
"repeat_count": 1.0,
- "routers_loss": 0.14993029832839966,
+ "routers_loss": 0.14924728870391846,
"skip_count": 2.0,
"step": 140,
"text_loss": 0.5858222842216492
@@ -1347,13 +1347,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.251953125,
+ "grad_norm": 0.25,
"learning_rate": 0.00028199999999999997,
- "loss": 0.0791,
+ "loss": 0.0798,
"macro_f1": 0.3144654333591461,
"num_tokens": 229365.0,
"repeat_count": 1.0,
- "routers_loss": 0.17921413481235504,
+ "routers_loss": 0.1860177218914032,
"skip_count": 2.0,
"step": 142,
"text_loss": 0.5003137588500977
@@ -1366,13 +1366,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.00028599999999999996,
- "loss": 0.0535,
+ "loss": 0.054,
"macro_f1": 0.32098764181137085,
"num_tokens": 231787.0,
"repeat_count": 1.0,
- "routers_loss": 0.1420905590057373,
+ "routers_loss": 0.16498211026191711,
"skip_count": 1.0,
"step": 144,
"text_loss": 0.5026470422744751
@@ -1385,13 +1385,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.29296875,
+ "grad_norm": 0.306640625,
"learning_rate": 0.00029,
- "loss": 0.0956,
+ "loss": 0.0936,
"macro_f1": 0.32098764181137085,
"num_tokens": 235014.0,
"repeat_count": 1.0,
- "routers_loss": 0.12468750029802322,
+ "routers_loss": 0.11801310628652573,
"skip_count": 1.0,
"step": 146,
"text_loss": 0.611888587474823
@@ -1404,13 +1404,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.000294,
- "loss": 0.0879,
+ "loss": 0.0878,
"macro_f1": 0.3333333432674408,
"num_tokens": 238210.0,
"repeat_count": 0.0,
- "routers_loss": 0.024295611307024956,
+ "routers_loss": 0.02422776259481907,
"skip_count": 0.0,
"step": 148,
"text_loss": 0.2876914143562317
@@ -1423,13 +1423,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.000298,
- "loss": 0.087,
+ "loss": 0.0858,
"macro_f1": 0.32098764181137085,
"num_tokens": 241582.0,
"repeat_count": 0.0,
- "routers_loss": 0.07016433775424957,
+ "routers_loss": 0.07282499223947525,
"skip_count": 2.0,
"step": 150,
"text_loss": 0.3919292390346527
@@ -1442,13 +1442,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3828125,
+ "grad_norm": 0.37890625,
"learning_rate": 0.000302,
- "loss": 0.0782,
+ "loss": 0.0797,
"macro_f1": 0.32098764181137085,
"num_tokens": 244621.0,
"repeat_count": 1.0,
- "routers_loss": 0.18942493200302124,
+ "routers_loss": 0.20659038424491882,
"skip_count": 1.0,
"step": 152,
"text_loss": 0.4294498860836029
@@ -1461,13 +1461,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1787109375,
"learning_rate": 0.000306,
- "loss": 0.0713,
+ "loss": 0.072,
"macro_f1": 0.3333333432674408,
"num_tokens": 247833.0,
"repeat_count": 0.0,
- "routers_loss": 0.02319060079753399,
+ "routers_loss": 0.02428400330245495,
"skip_count": 0.0,
"step": 154,
"text_loss": 0.5930765867233276
@@ -1480,13 +1480,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15234375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.00031,
- "loss": 0.0778,
+ "loss": 0.0772,
"macro_f1": 0.3333333432674408,
"num_tokens": 251349.0,
"repeat_count": 0.0,
- "routers_loss": 0.01764747127890587,
+ "routers_loss": 0.0167869683355093,
"skip_count": 0.0,
"step": 156,
"text_loss": 0.41063904762268066
@@ -1499,13 +1499,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.000314,
- "loss": 0.0829,
+ "loss": 0.0821,
"macro_f1": 0.3333333432674408,
"num_tokens": 254886.0,
"repeat_count": 0.0,
- "routers_loss": 0.02268100716173649,
+ "routers_loss": 0.02531604655086994,
"skip_count": 0.0,
"step": 158,
"text_loss": 0.6739020347595215
@@ -1518,13 +1518,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.201171875,
"learning_rate": 0.00031800000000000003,
- "loss": 0.0889,
+ "loss": 0.09,
"macro_f1": 0.3333333432674408,
"num_tokens": 258260.0,
"repeat_count": 0.0,
- "routers_loss": 0.016952091827988625,
+ "routers_loss": 0.017772775143384933,
"skip_count": 0.0,
"step": 160,
"text_loss": 0.46873849630355835
@@ -1537,13 +1537,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2216796875,
+ "grad_norm": 0.224609375,
"learning_rate": 0.000322,
- "loss": 0.0923,
+ "loss": 0.0893,
"macro_f1": 0.3272727429866791,
"num_tokens": 261846.0,
"repeat_count": 0.0,
- "routers_loss": 0.03669808804988861,
+ "routers_loss": 0.034902360290288925,
"skip_count": 1.0,
"step": 162,
"text_loss": 0.3727971017360687
@@ -1556,13 +1556,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000326,
- "loss": 0.0769,
+ "loss": 0.076,
"macro_f1": 0.3333333432674408,
"num_tokens": 264348.0,
"repeat_count": 0.0,
- "routers_loss": 0.012101447209715843,
+ "routers_loss": 0.013553355820477009,
"skip_count": 0.0,
"step": 164,
"text_loss": 0.5798237323760986
@@ -1575,13 +1575,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.37109375,
+ "grad_norm": 0.408203125,
"learning_rate": 0.00033,
- "loss": 0.0897,
+ "loss": 0.0926,
"macro_f1": 0.32098764181137085,
"num_tokens": 267479.0,
"repeat_count": 1.0,
- "routers_loss": 0.1562056541442871,
+ "routers_loss": 0.13571743667125702,
"skip_count": 1.0,
"step": 166,
"text_loss": 0.8084776997566223
@@ -1594,13 +1594,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2431640625,
"learning_rate": 0.00033400000000000004,
- "loss": 0.0829,
+ "loss": 0.0817,
"macro_f1": 0.32098764181137085,
"num_tokens": 270268.0,
"repeat_count": 2.0,
- "routers_loss": 0.20807914435863495,
+ "routers_loss": 0.19884146749973297,
"skip_count": 0.0,
"step": 168,
"text_loss": 0.7366134524345398
@@ -1613,13 +1613,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.267578125,
"learning_rate": 0.00033800000000000003,
- "loss": 0.0987,
+ "loss": 0.1022,
"macro_f1": 0.32098764181137085,
"num_tokens": 273518.0,
"repeat_count": 1.0,
- "routers_loss": 0.1530539095401764,
+ "routers_loss": 0.15469175577163696,
"skip_count": 1.0,
"step": 170,
"text_loss": 0.27204006910324097
@@ -1632,13 +1632,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000342,
- "loss": 0.087,
+ "loss": 0.0865,
"macro_f1": 0.32098764181137085,
"num_tokens": 277210.0,
"repeat_count": 0.0,
- "routers_loss": 0.08004544675350189,
+ "routers_loss": 0.08603330701589584,
"skip_count": 2.0,
"step": 172,
"text_loss": 0.7137667536735535
@@ -1651,13 +1651,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1767578125,
+ "grad_norm": 0.189453125,
"learning_rate": 0.000346,
- "loss": 0.0916,
+ "loss": 0.0902,
"macro_f1": 0.3076923191547394,
"num_tokens": 280389.0,
"repeat_count": 0.0,
- "routers_loss": 0.19228078424930573,
+ "routers_loss": 0.17851492762565613,
"skip_count": 4.0,
"step": 174,
"text_loss": 0.5148105621337891
@@ -1670,13 +1670,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.00035,
- "loss": 0.0863,
+ "loss": 0.0853,
"macro_f1": 0.3333333432674408,
"num_tokens": 283501.0,
"repeat_count": 0.0,
- "routers_loss": 0.024507170543074608,
+ "routers_loss": 0.021331604570150375,
"skip_count": 0.0,
"step": 176,
"text_loss": 0.301013320684433
@@ -1689,13 +1689,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000354,
- "loss": 0.0898,
+ "loss": 0.0911,
"macro_f1": 0.32098764181137085,
"num_tokens": 287154.0,
"repeat_count": 0.0,
- "routers_loss": 0.05055495724081993,
+ "routers_loss": 0.057273946702480316,
"skip_count": 2.0,
"step": 178,
"text_loss": 0.4740981459617615
@@ -1708,13 +1708,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.240234375,
"learning_rate": 0.000358,
- "loss": 0.0865,
+ "loss": 0.0904,
"macro_f1": 0.3272727429866791,
"num_tokens": 289929.0,
"repeat_count": 0.0,
- "routers_loss": 0.03999815881252289,
+ "routers_loss": 0.04116598889231682,
"skip_count": 1.0,
"step": 180,
"text_loss": 0.4838573932647705
@@ -1727,13 +1727,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.14453125,
"learning_rate": 0.000362,
- "loss": 0.0983,
+ "loss": 0.0991,
"macro_f1": 0.3333333432674408,
"num_tokens": 294293.0,
"repeat_count": 0.0,
- "routers_loss": 0.025158070027828217,
+ "routers_loss": 0.027111956849694252,
"skip_count": 0.0,
"step": 182,
"text_loss": 0.7495553493499756
@@ -1746,32 +1746,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.158203125,
"learning_rate": 0.000366,
- "loss": 0.1015,
+ "loss": 0.1038,
"macro_f1": 0.3333333432674408,
"num_tokens": 297730.0,
"repeat_count": 0.0,
- "routers_loss": 0.01825365424156189,
+ "routers_loss": 0.019166452810168266,
"skip_count": 0.0,
"step": 184,
"text_loss": 0.534831166267395
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 0.8734957440563546,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2236328125,
"learning_rate": 0.00037,
- "loss": 0.0736,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0784,
+ "macro_f1": 0.5427350401878357,
"num_tokens": 300593.0,
"repeat_count": 1.0,
- "routers_loss": 0.22729666531085968,
+ "routers_loss": 0.2349659502506256,
"skip_count": 2.0,
"step": 186,
"text_loss": 0.3549048602581024
@@ -1784,13 +1784,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.2041015625,
"learning_rate": 0.000374,
- "loss": 0.0838,
+ "loss": 0.0827,
"macro_f1": 0.3076923191547394,
"num_tokens": 303456.0,
"repeat_count": 2.0,
- "routers_loss": 0.24516475200653076,
+ "routers_loss": 0.22502389550209045,
"skip_count": 2.0,
"step": 188,
"text_loss": 0.8837642073631287
@@ -1803,13 +1803,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2470703125,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000378,
- "loss": 0.1056,
+ "loss": 0.1085,
"macro_f1": 0.3272727429866791,
"num_tokens": 306241.0,
"repeat_count": 1.0,
- "routers_loss": 0.1307530701160431,
+ "routers_loss": 0.12291611731052399,
"skip_count": 0.0,
"step": 190,
"text_loss": 0.73353511095047
@@ -1822,13 +1822,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.15625,
"learning_rate": 0.000382,
- "loss": 0.0961,
+ "loss": 0.0969,
"macro_f1": 0.3272727429866791,
"num_tokens": 310606.0,
"repeat_count": 0.0,
- "routers_loss": 0.06541688740253448,
+ "routers_loss": 0.055988848209381104,
"skip_count": 1.0,
"step": 192,
"text_loss": 0.6261917352676392
@@ -1841,13 +1841,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.333984375,
+ "grad_norm": 0.34375,
"learning_rate": 0.000386,
- "loss": 0.1058,
+ "loss": 0.1055,
"macro_f1": 0.3144654333591461,
"num_tokens": 313564.0,
"repeat_count": 0.0,
- "routers_loss": 0.12492545694112778,
+ "routers_loss": 0.12363404780626297,
"skip_count": 3.0,
"step": 194,
"text_loss": 0.2790874242782593
@@ -1860,13 +1860,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28515625,
+ "grad_norm": 0.27734375,
"learning_rate": 0.00039000000000000005,
- "loss": 0.0966,
+ "loss": 0.0964,
"macro_f1": 0.3076923191547394,
"num_tokens": 316958.0,
"repeat_count": 2.0,
- "routers_loss": 0.2838033139705658,
+ "routers_loss": 0.2718356251716614,
"skip_count": 2.0,
"step": 196,
"text_loss": 0.14428086578845978
@@ -1881,11 +1881,11 @@
"f1_skip": 0.0,
"grad_norm": 0.2021484375,
"learning_rate": 0.00039400000000000004,
- "loss": 0.0929,
+ "loss": 0.0917,
"macro_f1": 0.32098764181137085,
"num_tokens": 320103.0,
"repeat_count": 0.0,
- "routers_loss": 0.07692629098892212,
+ "routers_loss": 0.07188102602958679,
"skip_count": 2.0,
"step": 198,
"text_loss": 0.27155816555023193
@@ -1898,13 +1898,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.201171875,
"learning_rate": 0.000398,
"loss": 0.0809,
"macro_f1": 0.32098764181137085,
"num_tokens": 323566.0,
"repeat_count": 1.0,
- "routers_loss": 0.18504399061203003,
+ "routers_loss": 0.18038256466388702,
"skip_count": 1.0,
"step": 200,
"text_loss": 0.8453494310379028
@@ -1917,13 +1917,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.000402,
- "loss": 0.078,
+ "loss": 0.0801,
"macro_f1": 0.3333333432674408,
"num_tokens": 326385.0,
"repeat_count": 0.0,
- "routers_loss": 0.014647359028458595,
+ "routers_loss": 0.014639763161540031,
"skip_count": 0.0,
"step": 202,
"text_loss": 0.5733131766319275
@@ -1936,13 +1936,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2041015625,
+ "grad_norm": 0.21875,
"learning_rate": 0.00040600000000000006,
- "loss": 0.1028,
+ "loss": 0.104,
"macro_f1": 0.3333333432674408,
"num_tokens": 329266.0,
"repeat_count": 0.0,
- "routers_loss": 0.017848484218120575,
+ "routers_loss": 0.015269627794623375,
"skip_count": 0.0,
"step": 204,
"text_loss": 0.7355639934539795
@@ -1955,13 +1955,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.27734375,
"learning_rate": 0.00041,
- "loss": 0.0832,
+ "loss": 0.0833,
"macro_f1": 0.3333333432674408,
"num_tokens": 332984.0,
"repeat_count": 0.0,
- "routers_loss": 0.01900508813560009,
+ "routers_loss": 0.018046971410512924,
"skip_count": 0.0,
"step": 206,
"text_loss": 0.587641179561615
@@ -1974,13 +1974,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.166015625,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000414,
"loss": 0.0588,
"macro_f1": 0.3272727429866791,
"num_tokens": 335739.0,
"repeat_count": 1.0,
- "routers_loss": 0.13018715381622314,
+ "routers_loss": 0.12791286408901215,
"skip_count": 0.0,
"step": 208,
"text_loss": 0.6538406610488892
@@ -1993,13 +1993,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.24609375,
"learning_rate": 0.00041799999999999997,
- "loss": 0.0697,
+ "loss": 0.0732,
"macro_f1": 0.3272727429866791,
"num_tokens": 338966.0,
"repeat_count": 0.0,
- "routers_loss": 0.055288366973400116,
+ "routers_loss": 0.050490595400333405,
"skip_count": 1.0,
"step": 210,
"text_loss": 0.4188295602798462
@@ -2012,13 +2012,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.271484375,
"learning_rate": 0.000422,
- "loss": 0.0576,
+ "loss": 0.0588,
"macro_f1": 0.3144654333591461,
"num_tokens": 342063.0,
"repeat_count": 0.0,
- "routers_loss": 0.10952572524547577,
+ "routers_loss": 0.11652113497257233,
"skip_count": 3.0,
"step": 212,
"text_loss": 0.21822240948677063
@@ -2031,13 +2031,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.197265625,
+ "grad_norm": 0.2060546875,
"learning_rate": 0.000426,
- "loss": 0.062,
+ "loss": 0.0621,
"macro_f1": 0.3333333432674408,
"num_tokens": 344887.0,
"repeat_count": 0.0,
- "routers_loss": 0.02415696159005165,
+ "routers_loss": 0.023898238316178322,
"skip_count": 0.0,
"step": 214,
"text_loss": 0.24692800641059875
@@ -2050,13 +2050,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.3671875,
"learning_rate": 0.00043,
- "loss": 0.1011,
+ "loss": 0.1005,
"macro_f1": 0.3272727429866791,
"num_tokens": 348700.0,
"repeat_count": 1.0,
- "routers_loss": 0.06956391036510468,
+ "routers_loss": 0.06414655596017838,
"skip_count": 0.0,
"step": 216,
"text_loss": 0.4744548797607422
@@ -2069,13 +2069,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.00043400000000000003,
- "loss": 0.076,
+ "loss": 0.0753,
"macro_f1": 0.32098764181137085,
"num_tokens": 351507.0,
"repeat_count": 1.0,
- "routers_loss": 0.1140352189540863,
+ "routers_loss": 0.11702914535999298,
"skip_count": 1.0,
"step": 218,
"text_loss": 0.5614864826202393
@@ -2090,11 +2090,11 @@
"f1_skip": 0.0,
"grad_norm": 0.189453125,
"learning_rate": 0.000438,
- "loss": 0.0788,
+ "loss": 0.0792,
"macro_f1": 0.3333333432674408,
"num_tokens": 354484.0,
"repeat_count": 0.0,
- "routers_loss": 0.011621571145951748,
+ "routers_loss": 0.014991643838584423,
"skip_count": 0.0,
"step": 220,
"text_loss": 0.47209832072257996
@@ -2107,13 +2107,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.251953125,
"learning_rate": 0.000442,
"loss": 0.106,
"macro_f1": 0.3272727429866791,
"num_tokens": 357954.0,
"repeat_count": 0.0,
- "routers_loss": 0.05813701078295708,
+ "routers_loss": 0.04747112840414047,
"skip_count": 1.0,
"step": 222,
"text_loss": 0.2968728244304657
@@ -2126,13 +2126,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.357421875,
+ "grad_norm": 0.40234375,
"learning_rate": 0.000446,
- "loss": 0.0827,
+ "loss": 0.0853,
"macro_f1": 0.32098764181137085,
"num_tokens": 360547.0,
"repeat_count": 0.0,
- "routers_loss": 0.0646885335445404,
+ "routers_loss": 0.06754162162542343,
"skip_count": 2.0,
"step": 224,
"text_loss": 0.2364148646593094
@@ -2145,13 +2145,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.244140625,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.00045000000000000004,
- "loss": 0.1011,
+ "loss": 0.1016,
"macro_f1": 0.3272727429866791,
"num_tokens": 364529.0,
"repeat_count": 0.0,
- "routers_loss": 0.07224348932504654,
+ "routers_loss": 0.07830183953046799,
"skip_count": 1.0,
"step": 226,
"text_loss": 0.4787476360797882
@@ -2164,13 +2164,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.1953125,
"learning_rate": 0.00045400000000000003,
- "loss": 0.0781,
+ "loss": 0.0792,
"macro_f1": 0.3333333432674408,
"num_tokens": 367683.0,
"repeat_count": 0.0,
- "routers_loss": 0.015971746295690536,
+ "routers_loss": 0.015735948458313942,
"skip_count": 0.0,
"step": 228,
"text_loss": 0.37148505449295044
@@ -2183,13 +2183,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.25,
"learning_rate": 0.000458,
- "loss": 0.099,
+ "loss": 0.0995,
"macro_f1": 0.3333333432674408,
"num_tokens": 371402.0,
"repeat_count": 0.0,
- "routers_loss": 0.017818331718444824,
+ "routers_loss": 0.013354359194636345,
"skip_count": 0.0,
"step": 230,
"text_loss": 0.7464763522148132
@@ -2202,13 +2202,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.000462,
- "loss": 0.0757,
+ "loss": 0.0731,
"macro_f1": 0.3333333432674408,
"num_tokens": 374587.0,
"repeat_count": 0.0,
- "routers_loss": 0.01582280732691288,
+ "routers_loss": 0.013763721100986004,
"skip_count": 0.0,
"step": 232,
"text_loss": 0.8754443526268005
@@ -2221,13 +2221,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.42578125,
+ "grad_norm": 0.3984375,
"learning_rate": 0.00046600000000000005,
- "loss": 0.0876,
+ "loss": 0.0861,
"macro_f1": 0.3333333432674408,
"num_tokens": 377513.0,
"repeat_count": 0.0,
- "routers_loss": 0.011417915113270283,
+ "routers_loss": 0.010075435042381287,
"skip_count": 0.0,
"step": 234,
"text_loss": 0.31534913182258606
@@ -2240,13 +2240,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.17578125,
"learning_rate": 0.00047,
- "loss": 0.0801,
+ "loss": 0.0791,
"macro_f1": 0.3272727429866791,
"num_tokens": 380736.0,
"repeat_count": 0.0,
- "routers_loss": 0.05787832289934158,
+ "routers_loss": 0.059825167059898376,
"skip_count": 1.0,
"step": 236,
"text_loss": 0.5936337113380432
@@ -2259,13 +2259,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.236328125,
+ "grad_norm": 0.267578125,
"learning_rate": 0.000474,
- "loss": 0.0508,
+ "loss": 0.0514,
"macro_f1": 0.32098764181137085,
"num_tokens": 383236.0,
"repeat_count": 0.0,
- "routers_loss": 0.09476690739393234,
+ "routers_loss": 0.09134846180677414,
"skip_count": 2.0,
"step": 238,
"text_loss": 0.5976157784461975
@@ -2278,13 +2278,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.208984375,
"learning_rate": 0.00047799999999999996,
- "loss": 0.0833,
+ "loss": 0.0858,
"macro_f1": 0.32098764181137085,
"num_tokens": 385778.0,
"repeat_count": 1.0,
- "routers_loss": 0.1099705696105957,
+ "routers_loss": 0.11989791691303253,
"skip_count": 1.0,
"step": 240,
"text_loss": 0.3554210960865021
@@ -2297,13 +2297,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.171875,
"learning_rate": 0.000482,
- "loss": 0.0745,
+ "loss": 0.0734,
"macro_f1": 0.3333333432674408,
"num_tokens": 388777.0,
"repeat_count": 0.0,
- "routers_loss": 0.01269970741122961,
+ "routers_loss": 0.013591105118393898,
"skip_count": 0.0,
"step": 242,
"text_loss": 0.4829460382461548
@@ -2316,13 +2316,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11962890625,
+ "grad_norm": 0.12060546875,
"learning_rate": 0.000486,
- "loss": 0.061,
+ "loss": 0.0625,
"macro_f1": 0.32098764181137085,
"num_tokens": 391797.0,
"repeat_count": 0.0,
- "routers_loss": 0.08505752682685852,
+ "routers_loss": 0.0920003354549408,
"skip_count": 2.0,
"step": 244,
"text_loss": 0.3085818886756897
@@ -2335,13 +2335,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.00049,
- "loss": 0.0504,
+ "loss": 0.0501,
"macro_f1": 0.3333333432674408,
"num_tokens": 396485.0,
"repeat_count": 0.0,
- "routers_loss": 0.012750142253935337,
+ "routers_loss": 0.0129330949857831,
"skip_count": 0.0,
"step": 246,
"text_loss": 0.42803969979286194
@@ -2354,13 +2354,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.291015625,
+ "grad_norm": 0.296875,
"learning_rate": 0.000494,
- "loss": 0.0962,
+ "loss": 0.0945,
"macro_f1": 0.3144654333591461,
"num_tokens": 399923.0,
"repeat_count": 0.0,
- "routers_loss": 0.11287309974431992,
+ "routers_loss": 0.10677755624055862,
"skip_count": 3.0,
"step": 248,
"text_loss": 0.2908555567264557
@@ -2373,32 +2373,32 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.203125,
"learning_rate": 0.000498,
- "loss": 0.0821,
+ "loss": 0.0812,
"macro_f1": 0.3144654333591461,
"num_tokens": 403647.0,
"repeat_count": 0.0,
- "routers_loss": 0.1486474722623825,
+ "routers_loss": 0.1504337340593338,
"skip_count": 3.0,
"step": 250,
"text_loss": 0.333095908164978
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 1.183152333431171,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
+ "f1_skip": 0.0,
"grad_norm": 0.22265625,
"learning_rate": 0.0005020000000000001,
- "loss": 0.0832,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0828,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 409147.0,
"repeat_count": 0.0,
- "routers_loss": 0.06636594980955124,
+ "routers_loss": 0.06503184884786606,
"skip_count": 2.0,
"step": 252,
"text_loss": 0.16117942333221436
@@ -2411,13 +2411,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.267578125,
+ "grad_norm": 0.287109375,
"learning_rate": 0.000506,
- "loss": 0.1,
+ "loss": 0.0995,
"macro_f1": 0.3333333432674408,
"num_tokens": 412072.0,
"repeat_count": 0.0,
- "routers_loss": 0.015062150545418262,
+ "routers_loss": 0.016280122101306915,
"skip_count": 0.0,
"step": 254,
"text_loss": 0.4217492640018463
@@ -2430,13 +2430,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2138671875,
+ "grad_norm": 0.21484375,
"learning_rate": 0.00051,
- "loss": 0.0808,
+ "loss": 0.0803,
"macro_f1": 0.3144654333591461,
"num_tokens": 415052.0,
"repeat_count": 2.0,
- "routers_loss": 0.2051105946302414,
+ "routers_loss": 0.2117508500814438,
"skip_count": 1.0,
"step": 256,
"text_loss": 0.5795308947563171
@@ -2449,13 +2449,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "grad_norm": 0.2421875,
"learning_rate": 0.000514,
- "loss": 0.068,
+ "loss": 0.0668,
"macro_f1": 0.3272727429866791,
"num_tokens": 418099.0,
"repeat_count": 1.0,
- "routers_loss": 0.1467045396566391,
+ "routers_loss": 0.15002092719078064,
"skip_count": 0.0,
"step": 258,
"text_loss": 0.4840938448905945
@@ -2468,13 +2468,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.000518,
- "loss": 0.0543,
+ "loss": 0.0538,
"macro_f1": 0.3333333432674408,
"num_tokens": 422526.0,
"repeat_count": 0.0,
- "routers_loss": 0.013022038154304028,
+ "routers_loss": 0.012834074907004833,
"skip_count": 0.0,
"step": 260,
"text_loss": 0.36141225695610046
@@ -2487,13 +2487,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.000522,
- "loss": 0.0848,
+ "loss": 0.085,
"macro_f1": 0.3076923191547394,
"num_tokens": 425765.0,
"repeat_count": 2.0,
- "routers_loss": 0.2575930058956146,
+ "routers_loss": 0.23808011412620544,
"skip_count": 2.0,
"step": 262,
"text_loss": 0.27572691440582275
@@ -2506,13 +2506,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000526,
- "loss": 0.07,
+ "loss": 0.0708,
"macro_f1": 0.3272727429866791,
"num_tokens": 429048.0,
"repeat_count": 0.0,
- "routers_loss": 0.0558602549135685,
+ "routers_loss": 0.055687375366687775,
"skip_count": 1.0,
"step": 264,
"text_loss": 0.37020301818847656
@@ -2525,13 +2525,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.0005300000000000001,
- "loss": 0.082,
+ "loss": 0.0839,
"macro_f1": 0.3272727429866791,
"num_tokens": 431784.0,
"repeat_count": 0.0,
- "routers_loss": 0.09126655012369156,
+ "routers_loss": 0.0872957780957222,
"skip_count": 1.0,
"step": 266,
"text_loss": 0.5937283039093018
@@ -2544,13 +2544,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.263671875,
"learning_rate": 0.0005340000000000001,
- "loss": 0.0764,
+ "loss": 0.0733,
"macro_f1": 0.32098764181137085,
"num_tokens": 434297.0,
"repeat_count": 2.0,
- "routers_loss": 0.24805288016796112,
+ "routers_loss": 0.23507654666900635,
"skip_count": 0.0,
"step": 268,
"text_loss": 0.3367372453212738
@@ -2563,13 +2563,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.2431640625,
"learning_rate": 0.0005380000000000001,
- "loss": 0.0686,
+ "loss": 0.0708,
"macro_f1": 0.32098764181137085,
"num_tokens": 437586.0,
"repeat_count": 0.0,
- "routers_loss": 0.13135533034801483,
+ "routers_loss": 0.12860390543937683,
"skip_count": 2.0,
"step": 270,
"text_loss": 0.7149854302406311
@@ -2582,13 +2582,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.2451171875,
"learning_rate": 0.0005420000000000001,
- "loss": 0.1083,
+ "loss": 0.1072,
"macro_f1": 0.3272727429866791,
"num_tokens": 440649.0,
"repeat_count": 0.0,
- "routers_loss": 0.04991440102458,
+ "routers_loss": 0.044308312237262726,
"skip_count": 1.0,
"step": 272,
"text_loss": 0.26778292655944824
@@ -2601,13 +2601,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.455078125,
+ "grad_norm": 0.44921875,
"learning_rate": 0.000546,
- "loss": 0.0991,
+ "loss": 0.0938,
"macro_f1": 0.3144654333591461,
"num_tokens": 443907.0,
"repeat_count": 0.0,
- "routers_loss": 0.12236632406711578,
+ "routers_loss": 0.11514109373092651,
"skip_count": 3.0,
"step": 274,
"text_loss": 0.23578761518001556
@@ -2620,13 +2620,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.25,
+ "grad_norm": 0.2578125,
"learning_rate": 0.00055,
- "loss": 0.0936,
+ "loss": 0.0932,
"macro_f1": 0.5492662787437439,
"num_tokens": 447147.0,
"repeat_count": 0.0,
- "routers_loss": 0.053506772965192795,
+ "routers_loss": 0.055705297738313675,
"skip_count": 2.0,
"step": 276,
"text_loss": 0.2513524889945984
@@ -2639,13 +2639,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.265625,
+ "grad_norm": 0.29296875,
"learning_rate": 0.000554,
- "loss": 0.066,
+ "loss": 0.0667,
"macro_f1": 0.32098764181137085,
"num_tokens": 450032.0,
"repeat_count": 0.0,
- "routers_loss": 0.13446088135242462,
+ "routers_loss": 0.13778971135616302,
"skip_count": 2.0,
"step": 278,
"text_loss": 0.4857243597507477
@@ -2658,32 +2658,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000558,
- "loss": 0.0682,
+ "loss": 0.0672,
"macro_f1": 0.3272727429866791,
"num_tokens": 453195.0,
"repeat_count": 1.0,
- "routers_loss": 0.07270720601081848,
+ "routers_loss": 0.0700262188911438,
"skip_count": 0.0,
"step": 280,
"text_loss": 0.7589789628982544
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 1.3240387437628411,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.28125,
+ "f1_skip": 0.0,
+ "grad_norm": 0.25,
"learning_rate": 0.0005620000000000001,
- "loss": 0.0648,
- "macro_f1": 0.5427350401878357,
+ "loss": 0.0603,
+ "macro_f1": 0.3144654333591461,
"num_tokens": 455942.0,
"repeat_count": 1.0,
- "routers_loss": 0.13866399228572845,
+ "routers_loss": 0.11706235259771347,
"skip_count": 2.0,
"step": 282,
"text_loss": 0.4783432185649872
@@ -2696,13 +2696,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.236328125,
+ "grad_norm": 0.265625,
"learning_rate": 0.000566,
- "loss": 0.0782,
+ "loss": 0.0793,
"macro_f1": 0.3272727429866791,
"num_tokens": 458932.0,
"repeat_count": 0.0,
- "routers_loss": 0.0645354762673378,
+ "routers_loss": 0.07073967158794403,
"skip_count": 1.0,
"step": 284,
"text_loss": 0.7117193937301636
@@ -2715,13 +2715,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.00057,
- "loss": 0.0892,
+ "loss": 0.0915,
"macro_f1": 0.3272727429866791,
"num_tokens": 462650.0,
"repeat_count": 0.0,
- "routers_loss": 0.05967628210783005,
+ "routers_loss": 0.05301115661859512,
"skip_count": 1.0,
"step": 286,
"text_loss": 0.4175460636615753
@@ -2734,13 +2734,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23828125,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000574,
- "loss": 0.0676,
+ "loss": 0.0675,
"macro_f1": 0.3272727429866791,
"num_tokens": 466290.0,
"repeat_count": 0.0,
- "routers_loss": 0.06438407301902771,
+ "routers_loss": 0.06356479972600937,
"skip_count": 1.0,
"step": 288,
"text_loss": 0.5832946300506592
@@ -2753,13 +2753,13 @@
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.28515625,
"learning_rate": 0.000578,
- "loss": 0.0781,
+ "loss": 0.0805,
"macro_f1": 0.3006536066532135,
"num_tokens": 469296.0,
"repeat_count": 1.0,
- "routers_loss": 0.21225209534168243,
+ "routers_loss": 0.21032999455928802,
"skip_count": 3.0,
"step": 290,
"text_loss": 0.36023473739624023
@@ -2772,13 +2772,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.244140625,
+ "grad_norm": 0.27734375,
"learning_rate": 0.0005819999999999999,
- "loss": 0.0664,
+ "loss": 0.0685,
"macro_f1": 0.32098764181137085,
"num_tokens": 472272.0,
"repeat_count": 1.0,
- "routers_loss": 0.08085516840219498,
+ "routers_loss": 0.08062280714511871,
"skip_count": 1.0,
"step": 292,
"text_loss": 0.37197956442832947
@@ -2791,13 +2791,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.28125,
"learning_rate": 0.0005859999999999999,
- "loss": 0.0874,
+ "loss": 0.0878,
"macro_f1": 0.32098764181137085,
"num_tokens": 475864.0,
"repeat_count": 0.0,
- "routers_loss": 0.05378658324480057,
+ "routers_loss": 0.05023600533604622,
"skip_count": 2.0,
"step": 294,
"text_loss": 0.4765273630619049
@@ -2810,13 +2810,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.00059,
- "loss": 0.0715,
+ "loss": 0.0728,
"macro_f1": 0.3333333432674408,
"num_tokens": 478916.0,
"repeat_count": 0.0,
- "routers_loss": 0.01145261898636818,
+ "routers_loss": 0.011689410544931889,
"skip_count": 0.0,
"step": 296,
"text_loss": 0.5878773927688599
@@ -2831,11 +2831,11 @@
"f1_skip": 0.0,
"grad_norm": 0.15625,
"learning_rate": 0.000594,
- "loss": 0.0737,
+ "loss": 0.0727,
"macro_f1": 0.3333333432674408,
"num_tokens": 482369.0,
"repeat_count": 0.0,
- "routers_loss": 0.009397956542670727,
+ "routers_loss": 0.010772093199193478,
"skip_count": 0.0,
"step": 298,
"text_loss": 0.4424116313457489
@@ -2848,13 +2848,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.181640625,
"learning_rate": 0.000598,
- "loss": 0.0802,
+ "loss": 0.0787,
"macro_f1": 0.3076923191547394,
"num_tokens": 486049.0,
"repeat_count": 2.0,
- "routers_loss": 0.2389357089996338,
+ "routers_loss": 0.23482851684093475,
"skip_count": 2.0,
"step": 300,
"text_loss": 0.21217775344848633
@@ -2862,18 +2862,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 1.417963017317288,
- "f1_execute": 0.9019607901573181,
+ "f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.000602,
- "loss": 0.0745,
- "macro_f1": 0.3006536066532135,
+ "loss": 0.073,
+ "macro_f1": 0.3076923191547394,
"num_tokens": 488683.0,
"repeat_count": 1.0,
- "routers_loss": 0.18252353370189667,
+ "routers_loss": 0.18843084573745728,
"skip_count": 3.0,
"step": 302,
"text_loss": 0.2109498232603073
@@ -2886,13 +2886,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.27734375,
+ "grad_norm": 0.279296875,
"learning_rate": 0.000606,
- "loss": 0.0935,
+ "loss": 0.0945,
"macro_f1": 0.3144654333591461,
"num_tokens": 492010.0,
"repeat_count": 0.0,
- "routers_loss": 0.18185268342494965,
+ "routers_loss": 0.17861786484718323,
"skip_count": 3.0,
"step": 304,
"text_loss": 0.8446305394172668
@@ -2905,13 +2905,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.00061,
- "loss": 0.0853,
+ "loss": 0.0827,
"macro_f1": 0.3333333432674408,
"num_tokens": 494764.0,
"repeat_count": 0.0,
- "routers_loss": 0.013210167177021503,
+ "routers_loss": 0.014124520123004913,
"skip_count": 0.0,
"step": 306,
"text_loss": 0.742735743522644
@@ -2924,13 +2924,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.26953125,
"learning_rate": 0.000614,
- "loss": 0.1089,
+ "loss": 0.1071,
"macro_f1": 0.3333333432674408,
"num_tokens": 497820.0,
"repeat_count": 0.0,
- "routers_loss": 0.016936838626861572,
+ "routers_loss": 0.017968112602829933,
"skip_count": 0.0,
"step": 308,
"text_loss": 0.28305482864379883
@@ -2943,13 +2943,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.0006180000000000001,
- "loss": 0.077,
+ "loss": 0.0775,
"macro_f1": 0.32098764181137085,
"num_tokens": 500694.0,
"repeat_count": 0.0,
- "routers_loss": 0.08630389720201492,
+ "routers_loss": 0.08593655377626419,
"skip_count": 2.0,
"step": 310,
"text_loss": 0.3496848940849304
@@ -2962,13 +2962,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.19140625,
"learning_rate": 0.000622,
- "loss": 0.0602,
+ "loss": 0.061,
"macro_f1": 0.3333333432674408,
"num_tokens": 503871.0,
"repeat_count": 0.0,
- "routers_loss": 0.013665963895618916,
+ "routers_loss": 0.016449492424726486,
"skip_count": 0.0,
"step": 312,
"text_loss": 0.6691372990608215
@@ -2981,13 +2981,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.205078125,
"learning_rate": 0.000626,
- "loss": 0.0794,
+ "loss": 0.0815,
"macro_f1": 0.3333333432674408,
"num_tokens": 506730.0,
"repeat_count": 0.0,
- "routers_loss": 0.01584783010184765,
+ "routers_loss": 0.014532964676618576,
"skip_count": 0.0,
"step": 314,
"text_loss": 0.6118118166923523
@@ -3000,13 +3000,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.224609375,
+ "grad_norm": 0.2216796875,
"learning_rate": 0.00063,
- "loss": 0.0762,
+ "loss": 0.0742,
"macro_f1": 0.3333333432674408,
"num_tokens": 510323.0,
"repeat_count": 0.0,
- "routers_loss": 0.01368923019617796,
+ "routers_loss": 0.013093139044940472,
"skip_count": 0.0,
"step": 316,
"text_loss": 0.38126271963119507
@@ -3019,13 +3019,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.388671875,
+ "grad_norm": 0.400390625,
"learning_rate": 0.000634,
- "loss": 0.0908,
+ "loss": 0.0915,
"macro_f1": 0.3333333432674408,
"num_tokens": 514075.0,
"repeat_count": 0.0,
- "routers_loss": 0.009135022759437561,
+ "routers_loss": 0.008627045899629593,
"skip_count": 0.0,
"step": 318,
"text_loss": 0.5983037948608398
@@ -3038,13 +3038,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000638,
- "loss": 0.0949,
+ "loss": 0.1008,
"macro_f1": 0.3272727429866791,
"num_tokens": 517418.0,
"repeat_count": 0.0,
- "routers_loss": 0.046641621738672256,
+ "routers_loss": 0.04561378434300423,
"skip_count": 1.0,
"step": 320,
"text_loss": 0.767257034778595
@@ -3052,18 +3052,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.5118872908717347,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23046875,
+ "grad_norm": 0.259765625,
"learning_rate": 0.000642,
- "loss": 0.0925,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0926,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 520443.0,
"repeat_count": 0.0,
- "routers_loss": 0.020637936890125275,
+ "routers_loss": 0.024372953921556473,
"skip_count": 0.0,
"step": 322,
"text_loss": 0.6572105884552002
@@ -3076,13 +3076,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26953125,
+ "grad_norm": 0.30078125,
"learning_rate": 0.000646,
"loss": 0.0822,
"macro_f1": 0.3272727429866791,
"num_tokens": 523317.0,
"repeat_count": 1.0,
- "routers_loss": 0.08289298415184021,
+ "routers_loss": 0.08099937438964844,
"skip_count": 0.0,
"step": 324,
"text_loss": 0.205499529838562
@@ -3090,18 +3090,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.530672145582624,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23828125,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.0006500000000000001,
- "loss": 0.0823,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0809,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 526355.0,
"repeat_count": 0.0,
- "routers_loss": 0.06960040330886841,
+ "routers_loss": 0.0657225176692009,
"skip_count": 1.0,
"step": 326,
"text_loss": 0.2587239742279053
@@ -3114,13 +3114,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1162109375,
+ "grad_norm": 0.111328125,
"learning_rate": 0.0006540000000000001,
- "loss": 0.0799,
+ "loss": 0.0779,
"macro_f1": 0.3333333432674408,
"num_tokens": 529689.0,
"repeat_count": 0.0,
- "routers_loss": 0.02087482251226902,
+ "routers_loss": 0.01849208027124405,
"skip_count": 0.0,
"step": 328,
"text_loss": 0.2172023057937622
@@ -3133,13 +3133,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.1845703125,
"learning_rate": 0.0006580000000000001,
- "loss": 0.0757,
+ "loss": 0.0758,
"macro_f1": 0.3333333432674408,
"num_tokens": 532603.0,
"repeat_count": 0.0,
- "routers_loss": 0.016592051833868027,
+ "routers_loss": 0.016184113919734955,
"skip_count": 0.0,
"step": 330,
"text_loss": 0.5980568528175354
@@ -3152,32 +3152,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.220703125,
"learning_rate": 0.000662,
- "loss": 0.0438,
+ "loss": 0.0439,
"macro_f1": 0.3333333432674408,
"num_tokens": 536056.0,
"repeat_count": 0.0,
- "routers_loss": 0.012950568459928036,
+ "routers_loss": 0.01303898449987173,
"skip_count": 0.0,
"step": 332,
"text_loss": 0.5421966314315796
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 1.5682418550044028,
- "f1_execute": 0.8799999952316284,
+ "f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.310546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.296875,
"learning_rate": 0.000666,
- "loss": 0.0964,
- "macro_f1": 0.29333335161209106,
+ "loss": 0.0963,
+ "macro_f1": 0.465986430644989,
"num_tokens": 539231.0,
"repeat_count": 3.0,
- "routers_loss": 0.3373340964317322,
+ "routers_loss": 0.3075675964355469,
"skip_count": 3.0,
"step": 334,
"text_loss": 0.19719554483890533
@@ -3190,13 +3190,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.171875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.00067,
"loss": 0.0706,
"macro_f1": 0.3333333432674408,
"num_tokens": 542038.0,
"repeat_count": 0.0,
- "routers_loss": 0.008110735565423965,
+ "routers_loss": 0.009116224013268948,
"skip_count": 0.0,
"step": 336,
"text_loss": 0.3407036066055298
@@ -3209,13 +3209,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.248046875,
+ "grad_norm": 0.2421875,
"learning_rate": 0.000674,
- "loss": 0.0771,
+ "loss": 0.0768,
"macro_f1": 0.3333333432674408,
"num_tokens": 545019.0,
"repeat_count": 0.0,
- "routers_loss": 0.01841609925031662,
+ "routers_loss": 0.021463042125105858,
"skip_count": 0.0,
"step": 338,
"text_loss": 0.24486012756824493
@@ -3228,13 +3228,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.0006780000000000001,
- "loss": 0.0894,
+ "loss": 0.0889,
"macro_f1": 0.3333333432674408,
"num_tokens": 548036.0,
"repeat_count": 0.0,
- "routers_loss": 0.01612614095211029,
+ "routers_loss": 0.01857556402683258,
"skip_count": 0.0,
"step": 340,
"text_loss": 0.28140124678611755
@@ -3247,13 +3247,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0006820000000000001,
- "loss": 0.0611,
+ "loss": 0.0617,
"macro_f1": 0.3006536364555359,
"num_tokens": 551419.0,
"repeat_count": 2.0,
- "routers_loss": 0.26202192902565,
+ "routers_loss": 0.27090007066726685,
"skip_count": 3.0,
"step": 342,
"text_loss": 0.20690307021141052
@@ -3266,13 +3266,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.3046875,
"learning_rate": 0.0006860000000000001,
- "loss": 0.1013,
+ "loss": 0.1047,
"macro_f1": 0.32098764181137085,
"num_tokens": 554037.0,
"repeat_count": 0.0,
- "routers_loss": 0.09235779196023941,
+ "routers_loss": 0.09231195598840714,
"skip_count": 2.0,
"step": 344,
"text_loss": 0.4479128420352936
@@ -3285,13 +3285,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.255859375,
"learning_rate": 0.00069,
- "loss": 0.0856,
+ "loss": 0.0883,
"macro_f1": 0.3333333432674408,
"num_tokens": 556672.0,
"repeat_count": 0.0,
- "routers_loss": 0.010735333897173405,
+ "routers_loss": 0.00935924518853426,
"skip_count": 0.0,
"step": 346,
"text_loss": 0.6377320289611816
@@ -3304,13 +3304,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.000694,
- "loss": 0.0778,
+ "loss": 0.0781,
"macro_f1": 0.32098764181137085,
"num_tokens": 559756.0,
"repeat_count": 0.0,
- "routers_loss": 0.14742356538772583,
+ "routers_loss": 0.17641772329807281,
"skip_count": 2.0,
"step": 348,
"text_loss": 0.6097636222839355
@@ -3323,13 +3323,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.30859375,
+ "grad_norm": 0.30078125,
"learning_rate": 0.0006979999999999999,
- "loss": 0.0614,
+ "loss": 0.0616,
"macro_f1": 0.5492662787437439,
"num_tokens": 563415.0,
"repeat_count": 0.0,
- "routers_loss": 0.06606879830360413,
+ "routers_loss": 0.06240406632423401,
"skip_count": 2.0,
"step": 350,
"text_loss": 0.5291631817817688
@@ -3342,13 +3342,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.322265625,
+ "grad_norm": 0.296875,
"learning_rate": 0.0007019999999999999,
- "loss": 0.1033,
+ "loss": 0.1026,
"macro_f1": 0.3333333432674408,
"num_tokens": 566357.0,
"repeat_count": 0.0,
- "routers_loss": 0.012873432599008083,
+ "routers_loss": 0.012269247323274612,
"skip_count": 0.0,
"step": 352,
"text_loss": 0.5170195698738098
@@ -3361,13 +3361,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0007059999999999999,
- "loss": 0.0819,
+ "loss": 0.0815,
"macro_f1": 0.32098764181137085,
"num_tokens": 569449.0,
"repeat_count": 0.0,
- "routers_loss": 0.07853665202856064,
+ "routers_loss": 0.07515309751033783,
"skip_count": 2.0,
"step": 354,
"text_loss": 0.34507250785827637
@@ -3380,13 +3380,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.251953125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.00071,
- "loss": 0.0804,
+ "loss": 0.0791,
"macro_f1": 0.3144654333591461,
"num_tokens": 572761.0,
"repeat_count": 1.0,
- "routers_loss": 0.2216549813747406,
+ "routers_loss": 0.20768006145954132,
"skip_count": 2.0,
"step": 356,
"text_loss": 0.3158532381057739
@@ -3399,13 +3399,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.185546875,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.000714,
- "loss": 0.0675,
+ "loss": 0.0682,
"macro_f1": 0.3333333432674408,
"num_tokens": 575909.0,
"repeat_count": 0.0,
- "routers_loss": 0.02423691377043724,
+ "routers_loss": 0.025329967960715294,
"skip_count": 0.0,
"step": 358,
"text_loss": 0.21455390751361847
@@ -3413,18 +3413,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 1.6903434106251836,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.21484375,
"learning_rate": 0.000718,
- "loss": 0.0781,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0775,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 579186.0,
"repeat_count": 1.0,
- "routers_loss": 0.07496294379234314,
+ "routers_loss": 0.07676175981760025,
"skip_count": 0.0,
"step": 360,
"text_loss": 0.61895352602005
@@ -3437,13 +3437,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2138671875,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000722,
- "loss": 0.0778,
+ "loss": 0.0781,
"macro_f1": 0.32098767161369324,
"num_tokens": 582437.0,
"repeat_count": 0.0,
- "routers_loss": 0.08181872963905334,
+ "routers_loss": 0.08070661872625351,
"skip_count": 1.0,
"step": 362,
"text_loss": 0.20557661354541779
@@ -3456,13 +3456,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2216796875,
"learning_rate": 0.000726,
- "loss": 0.1112,
+ "loss": 0.11,
"macro_f1": 0.3333333432674408,
"num_tokens": 586096.0,
"repeat_count": 0.0,
- "routers_loss": 0.016959719359874725,
+ "routers_loss": 0.015891313552856445,
"skip_count": 0.0,
"step": 364,
"text_loss": 0.597991943359375
@@ -3475,13 +3475,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.15625,
"learning_rate": 0.00073,
- "loss": 0.0577,
+ "loss": 0.0573,
"macro_f1": 0.3076923191547394,
"num_tokens": 589520.0,
"repeat_count": 1.0,
- "routers_loss": 0.13295969367027283,
+ "routers_loss": 0.12844261527061462,
"skip_count": 3.0,
"step": 366,
"text_loss": 0.2944789230823517
@@ -3494,13 +3494,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1455078125,
+ "grad_norm": 0.150390625,
"learning_rate": 0.000734,
- "loss": 0.0986,
+ "loss": 0.1005,
"macro_f1": 0.3333333432674408,
"num_tokens": 592691.0,
"repeat_count": 0.0,
- "routers_loss": 0.02476893551647663,
+ "routers_loss": 0.02382199838757515,
"skip_count": 0.0,
"step": 368,
"text_loss": 0.23989969491958618
@@ -3513,13 +3513,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1796875,
"learning_rate": 0.000738,
- "loss": 0.0682,
+ "loss": 0.0661,
"macro_f1": 0.3333333432674408,
"num_tokens": 596004.0,
"repeat_count": 0.0,
- "routers_loss": 0.019863395020365715,
+ "routers_loss": 0.018812084570527077,
"skip_count": 0.0,
"step": 370,
"text_loss": 0.22111408412456512
@@ -3532,13 +3532,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.000742,
- "loss": 0.0663,
+ "loss": 0.0666,
"macro_f1": 0.3272727429866791,
"num_tokens": 599087.0,
"repeat_count": 0.0,
- "routers_loss": 0.07230417430400848,
+ "routers_loss": 0.08290331065654755,
"skip_count": 1.0,
"step": 372,
"text_loss": 0.2567356526851654
@@ -3551,13 +3551,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.2412109375,
"learning_rate": 0.000746,
- "loss": 0.0986,
+ "loss": 0.0941,
"macro_f1": 0.32098764181137085,
"num_tokens": 602330.0,
"repeat_count": 1.0,
- "routers_loss": 0.11727793514728546,
+ "routers_loss": 0.11482042074203491,
"skip_count": 1.0,
"step": 374,
"text_loss": 0.7217292785644531
@@ -3570,13 +3570,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.224609375,
+ "grad_norm": 0.2265625,
"learning_rate": 0.00075,
- "loss": 0.0724,
+ "loss": 0.0728,
"macro_f1": 0.3272727429866791,
"num_tokens": 605503.0,
"repeat_count": 1.0,
- "routers_loss": 0.13495951890945435,
+ "routers_loss": 0.11849870532751083,
"skip_count": 0.0,
"step": 376,
"text_loss": 0.5122153759002686
@@ -3589,13 +3589,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.23046875,
+ "grad_norm": 0.2333984375,
"learning_rate": 0.000754,
- "loss": 0.0823,
+ "loss": 0.0835,
"macro_f1": 0.32098767161369324,
"num_tokens": 608505.0,
"repeat_count": 0.0,
- "routers_loss": 0.07612533867359161,
+ "routers_loss": 0.07090992480516434,
"skip_count": 1.0,
"step": 378,
"text_loss": 0.2204965502023697
@@ -3608,13 +3608,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.1826171875,
"learning_rate": 0.000758,
- "loss": 0.0803,
+ "loss": 0.0794,
"macro_f1": 0.3272727429866791,
"num_tokens": 611193.0,
"repeat_count": 0.0,
- "routers_loss": 0.0484120175242424,
+ "routers_loss": 0.03812089189887047,
"skip_count": 1.0,
"step": 380,
"text_loss": 0.44909021258354187
@@ -3627,13 +3627,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.000762,
- "loss": 0.0866,
+ "loss": 0.0882,
"macro_f1": 0.3272727429866791,
"num_tokens": 614231.0,
"repeat_count": 1.0,
- "routers_loss": 0.10939671844244003,
+ "routers_loss": 0.10270529240369797,
"skip_count": 0.0,
"step": 382,
"text_loss": 0.13624964654445648
@@ -3646,13 +3646,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.326171875,
+ "grad_norm": 0.330078125,
"learning_rate": 0.0007660000000000001,
- "loss": 0.1083,
+ "loss": 0.1107,
"macro_f1": 0.32098764181137085,
"num_tokens": 617090.0,
"repeat_count": 1.0,
- "routers_loss": 0.11382336914539337,
+ "routers_loss": 0.11624004691839218,
"skip_count": 1.0,
"step": 384,
"text_loss": 0.7314052581787109
@@ -3667,11 +3667,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1396484375,
"learning_rate": 0.0007700000000000001,
- "loss": 0.0616,
+ "loss": 0.0628,
"macro_f1": 0.32098764181137085,
"num_tokens": 620596.0,
"repeat_count": 0.0,
- "routers_loss": 0.07494530081748962,
+ "routers_loss": 0.07114322483539581,
"skip_count": 2.0,
"step": 386,
"text_loss": 0.503322958946228
@@ -3684,13 +3684,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.298828125,
+ "grad_norm": 0.306640625,
"learning_rate": 0.0007740000000000001,
- "loss": 0.0816,
+ "loss": 0.0829,
"macro_f1": 0.32098764181137085,
"num_tokens": 624108.0,
"repeat_count": 0.0,
- "routers_loss": 0.05718417093157768,
+ "routers_loss": 0.06061873584985733,
"skip_count": 2.0,
"step": 388,
"text_loss": 0.11481904983520508
@@ -3703,13 +3703,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.2099609375,
"learning_rate": 0.000778,
- "loss": 0.0783,
+ "loss": 0.0791,
"macro_f1": 0.3006536364555359,
"num_tokens": 626895.0,
"repeat_count": 1.0,
- "routers_loss": 0.2848989963531494,
+ "routers_loss": 0.2921771705150604,
"skip_count": 4.0,
"step": 390,
"text_loss": 0.3069624602794647
@@ -3722,13 +3722,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.30078125,
+ "grad_norm": 0.30859375,
"learning_rate": 0.000782,
- "loss": 0.0608,
+ "loss": 0.0605,
"macro_f1": 0.3076923191547394,
"num_tokens": 630204.0,
"repeat_count": 0.0,
- "routers_loss": 0.2050076276063919,
+ "routers_loss": 0.202707901597023,
"skip_count": 4.0,
"step": 392,
"text_loss": 0.6022785305976868
@@ -3741,13 +3741,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.29296875,
"learning_rate": 0.000786,
- "loss": 0.0863,
+ "loss": 0.0877,
"macro_f1": 0.3333333432674408,
"num_tokens": 634373.0,
"repeat_count": 0.0,
- "routers_loss": 0.020946886390447617,
+ "routers_loss": 0.0221510399132967,
"skip_count": 0.0,
"step": 394,
"text_loss": 0.26787394285202026
@@ -3760,13 +3760,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.376953125,
+ "grad_norm": 0.37890625,
"learning_rate": 0.00079,
- "loss": 0.0798,
+ "loss": 0.0805,
"macro_f1": 0.32098764181137085,
"num_tokens": 637442.0,
"repeat_count": 2.0,
- "routers_loss": 0.1270289123058319,
+ "routers_loss": 0.12636390328407288,
"skip_count": 0.0,
"step": 396,
"text_loss": 0.2799781560897827
@@ -3779,13 +3779,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.2080078125,
"learning_rate": 0.0007940000000000001,
- "loss": 0.0701,
+ "loss": 0.0724,
"macro_f1": 0.32098764181137085,
"num_tokens": 641231.0,
"repeat_count": 0.0,
- "routers_loss": 0.08012636005878448,
+ "routers_loss": 0.07933453470468521,
"skip_count": 2.0,
"step": 398,
"text_loss": 0.2507784366607666
@@ -3798,13 +3798,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.0007980000000000001,
- "loss": 0.0901,
+ "loss": 0.0909,
"macro_f1": 0.3272727429866791,
"num_tokens": 644560.0,
"repeat_count": 1.0,
- "routers_loss": 0.09315784275531769,
+ "routers_loss": 0.10324911028146744,
"skip_count": 0.0,
"step": 400,
"text_loss": 0.7756280303001404
@@ -3817,13 +3817,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0008020000000000001,
- "loss": 0.078,
+ "loss": 0.0783,
"macro_f1": 0.3144654333591461,
"num_tokens": 647393.0,
"repeat_count": 1.0,
- "routers_loss": 0.18492189049720764,
+ "routers_loss": 0.18546262383460999,
"skip_count": 2.0,
"step": 402,
"text_loss": 0.5013328194618225
@@ -3836,13 +3836,13 @@
"f1_execute": 0.8571428656578064,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0008060000000000001,
- "loss": 0.0801,
+ "loss": 0.0787,
"macro_f1": 0.2857142984867096,
"num_tokens": 650355.0,
"repeat_count": 3.0,
- "routers_loss": 0.32641324400901794,
+ "routers_loss": 0.3280293643474579,
"skip_count": 4.0,
"step": 404,
"text_loss": 0.2842077314853668
@@ -3855,13 +3855,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2080078125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.0008100000000000001,
- "loss": 0.0905,
+ "loss": 0.0901,
"macro_f1": 0.3333333432674408,
"num_tokens": 654280.0,
"repeat_count": 0.0,
- "routers_loss": 0.02722037397325039,
+ "routers_loss": 0.02623247355222702,
"skip_count": 0.0,
"step": 406,
"text_loss": 0.46742817759513855
@@ -3874,13 +3874,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.216796875,
"learning_rate": 0.0008139999999999999,
- "loss": 0.0958,
+ "loss": 0.0945,
"macro_f1": 0.3333333432674408,
"num_tokens": 657568.0,
"repeat_count": 0.0,
- "routers_loss": 0.010129833593964577,
+ "routers_loss": 0.009744114242494106,
"skip_count": 0.0,
"step": 408,
"text_loss": 0.7168047428131104
@@ -3893,13 +3893,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2373046875,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.0008179999999999999,
- "loss": 0.1084,
+ "loss": 0.1065,
"macro_f1": 0.32098764181137085,
"num_tokens": 660593.0,
"repeat_count": 0.0,
- "routers_loss": 0.07298308610916138,
+ "routers_loss": 0.07591600716114044,
"skip_count": 2.0,
"step": 410,
"text_loss": 0.449823260307312
@@ -3912,13 +3912,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15625,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0008219999999999999,
- "loss": 0.0802,
+ "loss": 0.0795,
"macro_f1": 0.3333333432674408,
"num_tokens": 663916.0,
"repeat_count": 0.0,
- "routers_loss": 0.024257874116301537,
+ "routers_loss": 0.02076602540910244,
"skip_count": 0.0,
"step": 412,
"text_loss": 0.4764713943004608
@@ -3931,13 +3931,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.000826,
- "loss": 0.0842,
+ "loss": 0.0836,
"macro_f1": 0.3272727429866791,
"num_tokens": 667502.0,
"repeat_count": 0.0,
- "routers_loss": 0.048864223062992096,
+ "routers_loss": 0.049170155078172684,
"skip_count": 1.0,
"step": 414,
"text_loss": 0.30333325266838074
@@ -3950,13 +3950,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1513671875,
"learning_rate": 0.00083,
- "loss": 0.1026,
+ "loss": 0.1021,
"macro_f1": 0.3272727429866791,
"num_tokens": 670510.0,
"repeat_count": 1.0,
- "routers_loss": 0.1592330038547516,
+ "routers_loss": 0.15554003417491913,
"skip_count": 0.0,
"step": 416,
"text_loss": 0.3691870868206024
@@ -3969,13 +3969,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000834,
- "loss": 0.0963,
+ "loss": 0.1013,
"macro_f1": 0.3333333432674408,
"num_tokens": 674761.0,
"repeat_count": 0.0,
- "routers_loss": 0.02291976846754551,
+ "routers_loss": 0.024516675621271133,
"skip_count": 0.0,
"step": 418,
"text_loss": 0.32850381731987
@@ -3988,13 +3988,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.000838,
- "loss": 0.0634,
+ "loss": 0.0649,
"macro_f1": 0.3333333432674408,
"num_tokens": 678055.0,
"repeat_count": 0.0,
- "routers_loss": 0.010272650048136711,
+ "routers_loss": 0.011026890948414803,
"skip_count": 0.0,
"step": 420,
"text_loss": 0.6637290716171265
@@ -4007,13 +4007,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000842,
- "loss": 0.0786,
+ "loss": 0.0771,
"macro_f1": 0.3272727429866791,
"num_tokens": 680979.0,
"repeat_count": 0.0,
- "routers_loss": 0.0692613497376442,
+ "routers_loss": 0.07451887428760529,
"skip_count": 1.0,
"step": 422,
"text_loss": 0.27131685614585876
@@ -4026,13 +4026,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12890625,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.000846,
- "loss": 0.0706,
+ "loss": 0.0714,
"macro_f1": 0.32098764181137085,
"num_tokens": 684144.0,
"repeat_count": 1.0,
- "routers_loss": 0.12713804841041565,
+ "routers_loss": 0.11341800540685654,
"skip_count": 1.0,
"step": 424,
"text_loss": 0.652126669883728
@@ -4045,13 +4045,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.00085,
- "loss": 0.0758,
+ "loss": 0.0754,
"macro_f1": 0.3272727429866791,
"num_tokens": 687004.0,
"repeat_count": 1.0,
- "routers_loss": 0.08670130372047424,
+ "routers_loss": 0.08985847979784012,
"skip_count": 0.0,
"step": 426,
"text_loss": 0.2589428424835205
@@ -4064,13 +4064,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.23828125,
"learning_rate": 0.000854,
- "loss": 0.0857,
+ "loss": 0.0866,
"macro_f1": 0.3333333432674408,
"num_tokens": 689702.0,
"repeat_count": 0.0,
- "routers_loss": 0.01053862925618887,
+ "routers_loss": 0.011355436407029629,
"skip_count": 0.0,
"step": 428,
"text_loss": 0.8909716010093689
@@ -4083,13 +4083,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.000858,
- "loss": 0.0615,
+ "loss": 0.0623,
"macro_f1": 0.3333333432674408,
"num_tokens": 692698.0,
"repeat_count": 0.0,
- "routers_loss": 0.012946994043886662,
+ "routers_loss": 0.013788948766887188,
"skip_count": 0.0,
"step": 430,
"text_loss": 0.19141142070293427
@@ -4102,13 +4102,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.000862,
- "loss": 0.0498,
+ "loss": 0.0499,
"macro_f1": 0.32098764181137085,
"num_tokens": 696007.0,
"repeat_count": 0.0,
- "routers_loss": 0.08222822099924088,
+ "routers_loss": 0.07998392730951309,
"skip_count": 2.0,
"step": 432,
"text_loss": 0.1611809879541397
@@ -4121,13 +4121,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.173828125,
"learning_rate": 0.000866,
- "loss": 0.0532,
+ "loss": 0.0541,
"macro_f1": 0.32098764181137085,
"num_tokens": 700271.0,
"repeat_count": 0.0,
- "routers_loss": 0.07086442410945892,
+ "routers_loss": 0.06988382339477539,
"skip_count": 2.0,
"step": 434,
"text_loss": 0.37254223227500916
@@ -4140,13 +4140,13 @@
"f1_execute": 0.8333333730697632,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.00087,
- "loss": 0.0825,
+ "loss": 0.0834,
"macro_f1": 0.2777777910232544,
"num_tokens": 703519.0,
"repeat_count": 3.0,
- "routers_loss": 0.29007306694984436,
+ "routers_loss": 0.28240787982940674,
"skip_count": 5.0,
"step": 436,
"text_loss": 0.29636648297309875
@@ -4159,13 +4159,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.396484375,
+ "grad_norm": 0.423828125,
"learning_rate": 0.000874,
- "loss": 0.0658,
+ "loss": 0.0657,
"macro_f1": 0.3333333432674408,
"num_tokens": 706826.0,
"repeat_count": 0.0,
- "routers_loss": 0.014652491547167301,
+ "routers_loss": 0.013924967497587204,
"skip_count": 0.0,
"step": 438,
"text_loss": 0.20867908000946045
@@ -4178,13 +4178,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.000878,
- "loss": 0.0685,
+ "loss": 0.0657,
"macro_f1": 0.3333333432674408,
"num_tokens": 710530.0,
"repeat_count": 0.0,
- "routers_loss": 0.013720969669520855,
+ "routers_loss": 0.01170142088085413,
"skip_count": 0.0,
"step": 440,
"text_loss": 0.7273373007774353
@@ -4197,13 +4197,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.173828125,
+ "grad_norm": 0.171875,
"learning_rate": 0.000882,
- "loss": 0.0771,
+ "loss": 0.076,
"macro_f1": 0.3333333432674408,
"num_tokens": 713503.0,
"repeat_count": 0.0,
- "routers_loss": 0.011687638238072395,
+ "routers_loss": 0.011930872686207294,
"skip_count": 0.0,
"step": 442,
"text_loss": 0.39314430952072144
@@ -4216,13 +4216,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0008860000000000001,
- "loss": 0.0604,
+ "loss": 0.0592,
"macro_f1": 0.3333333432674408,
"num_tokens": 716582.0,
"repeat_count": 0.0,
- "routers_loss": 0.007869532331824303,
+ "routers_loss": 0.008630385622382164,
"skip_count": 0.0,
"step": 444,
"text_loss": 0.5925271511077881
@@ -4230,18 +4230,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.0939242735544465,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.23046875,
"learning_rate": 0.0008900000000000001,
- "loss": 0.0797,
- "macro_f1": 0.3076923191547394,
+ "loss": 0.0811,
+ "macro_f1": 0.3006536066532135,
"num_tokens": 719941.0,
"repeat_count": 3.0,
- "routers_loss": 0.3034668564796448,
+ "routers_loss": 0.3015584945678711,
"skip_count": 1.0,
"step": 446,
"text_loss": 0.5059905052185059
@@ -4254,13 +4254,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2314453125,
+ "grad_norm": 0.203125,
"learning_rate": 0.000894,
- "loss": 0.0823,
+ "loss": 0.0822,
"macro_f1": 0.31446540355682373,
"num_tokens": 723113.0,
"repeat_count": 1.0,
- "routers_loss": 0.11066079139709473,
+ "routers_loss": 0.10897493362426758,
"skip_count": 1.0,
"step": 448,
"text_loss": 0.19616436958312988
@@ -4273,13 +4273,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.33984375,
"learning_rate": 0.000898,
- "loss": 0.0773,
+ "loss": 0.0782,
"macro_f1": 0.32098764181137085,
"num_tokens": 726193.0,
"repeat_count": 0.0,
- "routers_loss": 0.0755370482802391,
+ "routers_loss": 0.07236456125974655,
"skip_count": 2.0,
"step": 450,
"text_loss": 0.1773054152727127
@@ -4292,13 +4292,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.28125,
+ "grad_norm": 0.3203125,
"learning_rate": 0.000902,
- "loss": 0.0596,
+ "loss": 0.058,
"macro_f1": 0.3272727429866791,
"num_tokens": 729275.0,
"repeat_count": 1.0,
- "routers_loss": 0.08470689505338669,
+ "routers_loss": 0.08184371143579483,
"skip_count": 0.0,
"step": 452,
"text_loss": 0.4927310049533844
@@ -4311,13 +4311,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19921875,
+ "grad_norm": 0.1953125,
"learning_rate": 0.000906,
- "loss": 0.0608,
+ "loss": 0.0607,
"macro_f1": 0.3333333432674408,
"num_tokens": 731948.0,
"repeat_count": 0.0,
- "routers_loss": 0.0130238626152277,
+ "routers_loss": 0.014033539220690727,
"skip_count": 0.0,
"step": 454,
"text_loss": 0.4745742678642273
@@ -4330,13 +4330,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.00091,
- "loss": 0.0652,
+ "loss": 0.0651,
"macro_f1": 0.3333333432674408,
"num_tokens": 735351.0,
"repeat_count": 0.0,
- "routers_loss": 0.007108641788363457,
+ "routers_loss": 0.0071774693205952644,
"skip_count": 0.0,
"step": 456,
"text_loss": 0.18523462116718292
@@ -4351,11 +4351,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.400390625,
"learning_rate": 0.0009140000000000001,
- "loss": 0.0746,
+ "loss": 0.0738,
"macro_f1": 0.5492662787437439,
"num_tokens": 738587.0,
"repeat_count": 0.0,
- "routers_loss": 0.06834109872579575,
+ "routers_loss": 0.07781517505645752,
"skip_count": 2.0,
"step": 458,
"text_loss": 0.3459635376930237
@@ -4368,13 +4368,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.28125,
"learning_rate": 0.0009180000000000001,
- "loss": 0.0733,
+ "loss": 0.0723,
"macro_f1": 0.3076923191547394,
"num_tokens": 741779.0,
"repeat_count": 0.0,
- "routers_loss": 0.10230778902769089,
+ "routers_loss": 0.09529037028551102,
"skip_count": 2.0,
"step": 460,
"text_loss": 0.20197433233261108
@@ -4387,13 +4387,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.1865234375,
"learning_rate": 0.0009220000000000001,
- "loss": 0.0528,
+ "loss": 0.0519,
"macro_f1": 0.3333333432674408,
"num_tokens": 745355.0,
"repeat_count": 0.0,
- "routers_loss": 0.009987542405724525,
+ "routers_loss": 0.009765669703483582,
"skip_count": 0.0,
"step": 462,
"text_loss": 0.7031404376029968
@@ -4406,13 +4406,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.125,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009260000000000001,
- "loss": 0.0536,
+ "loss": 0.0527,
"macro_f1": 0.3272727429866791,
"num_tokens": 748628.0,
"repeat_count": 0.0,
- "routers_loss": 0.03448869287967682,
+ "routers_loss": 0.03344850242137909,
"skip_count": 1.0,
"step": 464,
"text_loss": 0.21274663507938385
@@ -4425,13 +4425,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.173828125,
"learning_rate": 0.00093,
- "loss": 0.053,
+ "loss": 0.0534,
"macro_f1": 0.3076923191547394,
"num_tokens": 751472.0,
"repeat_count": 2.0,
- "routers_loss": 0.13631699979305267,
+ "routers_loss": 0.1354292333126068,
"skip_count": 2.0,
"step": 466,
"text_loss": 0.5350717306137085
@@ -4444,13 +4444,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.142578125,
"learning_rate": 0.000934,
- "loss": 0.06,
+ "loss": 0.0598,
"macro_f1": 0.3272727429866791,
"num_tokens": 754479.0,
"repeat_count": 0.0,
- "routers_loss": 0.053951870650053024,
+ "routers_loss": 0.056420840322971344,
"skip_count": 1.0,
"step": 468,
"text_loss": 0.28153330087661743
@@ -4463,13 +4463,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.228515625,
+ "grad_norm": 0.234375,
"learning_rate": 0.0009379999999999999,
- "loss": 0.059,
+ "loss": 0.0597,
"macro_f1": 0.31446540355682373,
"num_tokens": 757872.0,
"repeat_count": 1.0,
- "routers_loss": 0.14479905366897583,
+ "routers_loss": 0.1622387170791626,
"skip_count": 1.0,
"step": 470,
"text_loss": 0.22956843674182892
@@ -4482,13 +4482,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.44140625,
+ "grad_norm": 0.5,
"learning_rate": 0.000942,
- "loss": 0.0913,
+ "loss": 0.0953,
"macro_f1": 0.32098764181137085,
"num_tokens": 760468.0,
"repeat_count": 0.0,
- "routers_loss": 0.056221429258584976,
+ "routers_loss": 0.05146972835063934,
"skip_count": 2.0,
"step": 472,
"text_loss": 0.4513966739177704
@@ -4501,13 +4501,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1904296875,
+ "grad_norm": 0.212890625,
"learning_rate": 0.000946,
- "loss": 0.0591,
+ "loss": 0.0592,
"macro_f1": 0.3272727429866791,
"num_tokens": 763519.0,
"repeat_count": 1.0,
- "routers_loss": 0.09729792177677155,
+ "routers_loss": 0.09022669494152069,
"skip_count": 0.0,
"step": 474,
"text_loss": 0.25758957862854004
@@ -4520,13 +4520,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12158203125,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.00095,
- "loss": 0.0496,
+ "loss": 0.0498,
"macro_f1": 0.3272727429866791,
"num_tokens": 767391.0,
"repeat_count": 0.0,
- "routers_loss": 0.029447713866829872,
+ "routers_loss": 0.03044828027486801,
"skip_count": 1.0,
"step": 476,
"text_loss": 0.21366681158542633
@@ -4539,13 +4539,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.271484375,
+ "grad_norm": 0.291015625,
"learning_rate": 0.000954,
- "loss": 0.0801,
+ "loss": 0.0802,
"macro_f1": 0.3272727429866791,
"num_tokens": 770338.0,
"repeat_count": 0.0,
- "routers_loss": 0.09337342530488968,
+ "routers_loss": 0.10397060960531235,
"skip_count": 1.0,
"step": 478,
"text_loss": 1.0396177768707275
@@ -4560,11 +4560,11 @@
"f1_skip": 0.0,
"grad_norm": 0.267578125,
"learning_rate": 0.000958,
- "loss": 0.1102,
+ "loss": 0.1099,
"macro_f1": 0.285714328289032,
"num_tokens": 773699.0,
"repeat_count": 2.0,
- "routers_loss": 0.23193210363388062,
+ "routers_loss": 0.22604143619537354,
"skip_count": 4.0,
"step": 480,
"text_loss": 0.2570283114910126
@@ -4572,18 +4572,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.2629879659524508,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.146484375,
"learning_rate": 0.000962,
- "loss": 0.0669,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0667,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 777473.0,
"repeat_count": 0.0,
- "routers_loss": 0.046257760375738144,
+ "routers_loss": 0.048258859664201736,
"skip_count": 1.0,
"step": 482,
"text_loss": 0.2540103495121002
@@ -4596,13 +4596,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1708984375,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000966,
- "loss": 0.0552,
+ "loss": 0.0592,
"macro_f1": 0.3333333432674408,
"num_tokens": 780833.0,
"repeat_count": 0.0,
- "routers_loss": 0.01683143898844719,
+ "routers_loss": 0.023018671199679375,
"skip_count": 0.0,
"step": 484,
"text_loss": 0.38524550199508667
@@ -4615,13 +4615,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.326171875,
+ "grad_norm": 0.314453125,
"learning_rate": 0.0009699999999999999,
- "loss": 0.071,
+ "loss": 0.0709,
"macro_f1": 0.3272727429866791,
"num_tokens": 783656.0,
"repeat_count": 0.0,
- "routers_loss": 0.04129387438297272,
+ "routers_loss": 0.044845327734947205,
"skip_count": 1.0,
"step": 486,
"text_loss": 0.5859048366546631
@@ -4634,13 +4634,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000974,
- "loss": 0.0605,
+ "loss": 0.0615,
"macro_f1": 0.3333333432674408,
"num_tokens": 787173.0,
"repeat_count": 0.0,
- "routers_loss": 0.01262948103249073,
+ "routers_loss": 0.010898692533373833,
"skip_count": 0.0,
"step": 488,
"text_loss": 0.3456067442893982
@@ -4653,13 +4653,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2578125,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000978,
- "loss": 0.081,
+ "loss": 0.0796,
"macro_f1": 0.32098764181137085,
"num_tokens": 790395.0,
"repeat_count": 0.0,
- "routers_loss": 0.07404553890228271,
+ "routers_loss": 0.06497956812381744,
"skip_count": 2.0,
"step": 490,
"text_loss": 0.3751123249530792
@@ -4672,13 +4672,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.000982,
- "loss": 0.0751,
+ "loss": 0.0772,
"macro_f1": 0.3272727429866791,
"num_tokens": 793137.0,
"repeat_count": 0.0,
- "routers_loss": 0.06795930862426758,
+ "routers_loss": 0.07763728499412537,
"skip_count": 1.0,
"step": 492,
"text_loss": 0.43296709656715393
@@ -4691,13 +4691,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1416015625,
"learning_rate": 0.0009860000000000001,
- "loss": 0.0804,
+ "loss": 0.0819,
"macro_f1": 0.3333333432674408,
"num_tokens": 796497.0,
"repeat_count": 0.0,
- "routers_loss": 0.02233024686574936,
+ "routers_loss": 0.02127906307578087,
"skip_count": 0.0,
"step": 494,
"text_loss": 0.4841311275959015
@@ -4710,13 +4710,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.2138671875,
"learning_rate": 0.00099,
- "loss": 0.0731,
+ "loss": 0.073,
"macro_f1": 0.3272727429866791,
"num_tokens": 799361.0,
"repeat_count": 1.0,
- "routers_loss": 0.07979031652212143,
+ "routers_loss": 0.09518691152334213,
"skip_count": 0.0,
"step": 496,
"text_loss": 0.5094487071037292
@@ -4729,13 +4729,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.000994,
- "loss": 0.0795,
+ "loss": 0.0789,
"macro_f1": 0.5492662787437439,
"num_tokens": 802629.0,
"repeat_count": 0.0,
- "routers_loss": 0.045646365731954575,
+ "routers_loss": 0.0563947930932045,
"skip_count": 2.0,
"step": 498,
"text_loss": 0.42783617973327637
@@ -4748,13 +4748,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.1865234375,
"learning_rate": 0.000998,
"loss": 0.0476,
"macro_f1": 0.3272727429866791,
"num_tokens": 805881.0,
"repeat_count": 1.0,
- "routers_loss": 0.09717849642038345,
+ "routers_loss": 0.10570426285266876,
"skip_count": 0.0,
"step": 500,
"text_loss": 0.28395503759384155
@@ -4767,13 +4767,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.30078125,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009999999760498814,
- "loss": 0.0894,
+ "loss": 0.0849,
"macro_f1": 0.5492662787437439,
"num_tokens": 809283.0,
"repeat_count": 0.0,
- "routers_loss": 0.03948225453495979,
+ "routers_loss": 0.031202208250761032,
"skip_count": 2.0,
"step": 502,
"text_loss": 0.32970911264419556
@@ -4786,13 +4786,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.15625,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009999997844489475,
- "loss": 0.0557,
+ "loss": 0.0574,
"macro_f1": 0.3272727429866791,
"num_tokens": 812440.0,
"repeat_count": 0.0,
- "routers_loss": 0.0742638111114502,
+ "routers_loss": 0.07647835463285446,
"skip_count": 1.0,
"step": 504,
"text_loss": 0.4901447296142578
@@ -4805,13 +4805,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.25,
"learning_rate": 0.000999999401247153,
- "loss": 0.0682,
+ "loss": 0.0668,
"macro_f1": 0.32098764181137085,
"num_tokens": 815716.0,
"repeat_count": 0.0,
- "routers_loss": 0.08293049037456512,
+ "routers_loss": 0.08515176922082901,
"skip_count": 2.0,
"step": 506,
"text_loss": 0.6157599687576294
@@ -4824,13 +4824,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.26171875,
+ "grad_norm": 0.25390625,
"learning_rate": 0.0009999988264446445,
- "loss": 0.0697,
+ "loss": 0.0686,
"macro_f1": 0.3333333432674408,
"num_tokens": 819086.0,
"repeat_count": 0.0,
- "routers_loss": 0.010080376639962196,
+ "routers_loss": 0.00946938619017601,
"skip_count": 0.0,
"step": 508,
"text_loss": 0.5053519010543823
@@ -4843,13 +4843,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009999980600416424,
- "loss": 0.0611,
+ "loss": 0.0574,
"macro_f1": 0.3333333432674408,
"num_tokens": 822268.0,
"repeat_count": 0.0,
- "routers_loss": 0.009179878048598766,
+ "routers_loss": 0.01058756373822689,
"skip_count": 0.0,
"step": 510,
"text_loss": 0.5570021867752075
@@ -4862,13 +4862,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11083984375,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.000999997102038441,
- "loss": 0.0689,
+ "loss": 0.0678,
"macro_f1": 0.3333333432674408,
"num_tokens": 825728.0,
"repeat_count": 0.0,
- "routers_loss": 0.006718529388308525,
+ "routers_loss": 0.008705209009349346,
"skip_count": 0.0,
"step": 512,
"text_loss": 0.6519040465354919
@@ -4881,13 +4881,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.220703125,
"learning_rate": 0.0009999959524354064,
- "loss": 0.0826,
+ "loss": 0.083,
"macro_f1": 0.3272727429866791,
"num_tokens": 829459.0,
"repeat_count": 0.0,
- "routers_loss": 0.049344487488269806,
+ "routers_loss": 0.04024193435907364,
"skip_count": 1.0,
"step": 514,
"text_loss": 0.5290043950080872
@@ -4900,13 +4900,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.259765625,
+ "grad_norm": 0.25390625,
"learning_rate": 0.00099999461123298,
- "loss": 0.0739,
+ "loss": 0.0727,
"macro_f1": 0.3333333432674408,
"num_tokens": 832291.0,
"repeat_count": 0.0,
- "routers_loss": 0.013402626849710941,
+ "routers_loss": 0.015742862597107887,
"skip_count": 0.0,
"step": 516,
"text_loss": 0.7910057902336121
@@ -4919,13 +4919,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.232421875,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.000999993078431675,
- "loss": 0.0761,
+ "loss": 0.0759,
"macro_f1": 0.3076923191547394,
"num_tokens": 835399.0,
"repeat_count": 1.0,
- "routers_loss": 0.16964484751224518,
+ "routers_loss": 0.16753782331943512,
"skip_count": 3.0,
"step": 518,
"text_loss": 0.45196083188056946
@@ -4938,13 +4938,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.236328125,
"learning_rate": 0.0009999913540320792,
- "loss": 0.095,
+ "loss": 0.0968,
"macro_f1": 0.31446540355682373,
"num_tokens": 838993.0,
"repeat_count": 0.0,
- "routers_loss": 0.08609295636415482,
+ "routers_loss": 0.09357143193483353,
"skip_count": 2.0,
"step": 520,
"text_loss": 0.5499435663223267
@@ -4957,13 +4957,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.2392578125,
+ "grad_norm": 0.2451171875,
"learning_rate": 0.0009999894380348536,
- "loss": 0.0816,
+ "loss": 0.0821,
"macro_f1": 0.5492662787437439,
"num_tokens": 842652.0,
"repeat_count": 0.0,
- "routers_loss": 0.05354784056544304,
+ "routers_loss": 0.056803856045007706,
"skip_count": 2.0,
"step": 522,
"text_loss": 0.197520449757576
@@ -4976,13 +4976,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.2333984375,
"learning_rate": 0.000999987330440732,
- "loss": 0.0715,
+ "loss": 0.0725,
"macro_f1": 0.4871794879436493,
"num_tokens": 847061.0,
"repeat_count": 0.0,
- "routers_loss": 0.09146631509065628,
+ "routers_loss": 0.08962195366621017,
"skip_count": 3.0,
"step": 524,
"text_loss": 0.27509039640426636
@@ -4995,13 +4995,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1875,
+ "grad_norm": 0.189453125,
"learning_rate": 0.000999985031250522,
- "loss": 0.0574,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 850780.0,
"repeat_count": 0.0,
- "routers_loss": 0.02344255894422531,
+ "routers_loss": 0.022930558770895004,
"skip_count": 0.0,
"step": 526,
"text_loss": 0.13291706144809723
@@ -5014,13 +5014,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.197265625,
"learning_rate": 0.0009999825404651053,
- "loss": 0.0621,
+ "loss": 0.0614,
"macro_f1": 0.3333333432674408,
"num_tokens": 853886.0,
"repeat_count": 0.0,
- "routers_loss": 0.018271517008543015,
+ "routers_loss": 0.017097990959882736,
"skip_count": 0.0,
"step": 528,
"text_loss": 0.21706295013427734
@@ -5033,13 +5033,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2060546875,
+ "grad_norm": 0.212890625,
"learning_rate": 0.0009999798580854356,
- "loss": 0.0717,
+ "loss": 0.0724,
"macro_f1": 0.3333333432674408,
"num_tokens": 857364.0,
"repeat_count": 0.0,
- "routers_loss": 0.026990914717316628,
+ "routers_loss": 0.02831801027059555,
"skip_count": 0.0,
"step": 530,
"text_loss": 0.9035662412643433
@@ -5052,13 +5052,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16015625,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.000999976984112541,
- "loss": 0.0681,
+ "loss": 0.0674,
"macro_f1": 0.3333333432674408,
"num_tokens": 860661.0,
"repeat_count": 0.0,
- "routers_loss": 0.019737249240279198,
+ "routers_loss": 0.019671892747282982,
"skip_count": 0.0,
"step": 532,
"text_loss": 0.8354863524436951
@@ -5071,13 +5071,13 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.2890625,
"learning_rate": 0.0009999739185475231,
- "loss": 0.0978,
+ "loss": 0.0963,
"macro_f1": 0.47333335876464844,
"num_tokens": 864124.0,
"repeat_count": 2.0,
- "routers_loss": 0.212640181183815,
+ "routers_loss": 0.21383361518383026,
"skip_count": 3.0,
"step": 534,
"text_loss": 0.23422949016094208
@@ -5090,13 +5090,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0009999706613915565,
- "loss": 0.0602,
+ "loss": 0.0598,
"macro_f1": 0.32098767161369324,
"num_tokens": 866976.0,
"repeat_count": 0.0,
- "routers_loss": 0.07302755117416382,
+ "routers_loss": 0.07158871740102768,
"skip_count": 1.0,
"step": 536,
"text_loss": 0.11800774186849594
@@ -5109,13 +5109,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.296875,
+ "grad_norm": 0.26953125,
"learning_rate": 0.0009999672126458894,
- "loss": 0.0825,
+ "loss": 0.0822,
"macro_f1": 0.3272727429866791,
"num_tokens": 870549.0,
"repeat_count": 0.0,
- "routers_loss": 0.08667246252298355,
+ "routers_loss": 0.08185924589633942,
"skip_count": 1.0,
"step": 538,
"text_loss": 0.19232480227947235
@@ -5128,13 +5128,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1318359375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.000999963572311843,
- "loss": 0.0597,
+ "loss": 0.0604,
"macro_f1": 0.3333333432674408,
"num_tokens": 873733.0,
"repeat_count": 0.0,
- "routers_loss": 0.015047167427837849,
+ "routers_loss": 0.01633382774889469,
"skip_count": 0.0,
"step": 540,
"text_loss": 0.3725031912326813
@@ -5147,13 +5147,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009999597403908128,
- "loss": 0.076,
+ "loss": 0.0761,
"macro_f1": 0.3272727429866791,
"num_tokens": 877099.0,
"repeat_count": 0.0,
- "routers_loss": 0.07481446117162704,
+ "routers_loss": 0.0782657191157341,
"skip_count": 1.0,
"step": 542,
"text_loss": 0.17589199542999268
@@ -5166,13 +5166,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.0009999557168842669,
- "loss": 0.0724,
+ "loss": 0.0716,
"macro_f1": 0.5492662787437439,
"num_tokens": 879883.0,
"repeat_count": 0.0,
- "routers_loss": 0.049495212733745575,
+ "routers_loss": 0.05275818333029747,
"skip_count": 2.0,
"step": 544,
"text_loss": 0.26448264718055725
@@ -5185,13 +5185,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.2490234375,
"learning_rate": 0.0009999515017937468,
- "loss": 0.0718,
+ "loss": 0.071,
"macro_f1": 0.32098764181137085,
"num_tokens": 882223.0,
"repeat_count": 0.0,
- "routers_loss": 0.08043002337217331,
+ "routers_loss": 0.09335892647504807,
"skip_count": 2.0,
"step": 546,
"text_loss": 0.208544060587883
@@ -5204,13 +5204,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.34765625,
+ "grad_norm": 0.376953125,
"learning_rate": 0.0009999470951208684,
- "loss": 0.086,
+ "loss": 0.0855,
"macro_f1": 0.32098764181137085,
"num_tokens": 885241.0,
"repeat_count": 2.0,
- "routers_loss": 0.22461950778961182,
+ "routers_loss": 0.22983254492282867,
"skip_count": 0.0,
"step": 548,
"text_loss": 0.6612338423728943
@@ -5223,13 +5223,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.216796875,
"learning_rate": 0.00099994249686732,
- "loss": 0.0798,
+ "loss": 0.0786,
"macro_f1": 0.3272727429866791,
"num_tokens": 887897.0,
"repeat_count": 1.0,
- "routers_loss": 0.11754962801933289,
+ "routers_loss": 0.12858282029628754,
"skip_count": 0.0,
"step": 550,
"text_loss": 0.4673548936843872
@@ -5242,13 +5242,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1611328125,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0009999377070348638,
- "loss": 0.0978,
+ "loss": 0.0944,
"macro_f1": 0.3333333432674408,
"num_tokens": 891224.0,
"repeat_count": 0.0,
- "routers_loss": 0.017412789165973663,
+ "routers_loss": 0.017421770840883255,
"skip_count": 0.0,
"step": 552,
"text_loss": 0.6419258117675781
@@ -5261,13 +5261,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.15625,
"learning_rate": 0.000999932725625335,
- "loss": 0.0792,
+ "loss": 0.0791,
"macro_f1": 0.32098764181137085,
"num_tokens": 894578.0,
"repeat_count": 0.0,
- "routers_loss": 0.08969525247812271,
+ "routers_loss": 0.07890026271343231,
"skip_count": 2.0,
"step": 554,
"text_loss": 0.5970752239227295
@@ -5280,13 +5280,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "grad_norm": 0.216796875,
"learning_rate": 0.0009999275526406427,
- "loss": 0.0803,
+ "loss": 0.0796,
"macro_f1": 0.31446540355682373,
"num_tokens": 897145.0,
"repeat_count": 1.0,
- "routers_loss": 0.09876437485218048,
+ "routers_loss": 0.09836960583925247,
"skip_count": 1.0,
"step": 556,
"text_loss": 0.752425491809845
@@ -5299,13 +5299,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.1875,
"learning_rate": 0.0009999221880827693,
- "loss": 0.0887,
+ "loss": 0.0882,
"macro_f1": 0.3333333432674408,
"num_tokens": 900565.0,
"repeat_count": 0.0,
- "routers_loss": 0.019108204171061516,
+ "routers_loss": 0.017694659531116486,
"skip_count": 0.0,
"step": 558,
"text_loss": 0.195619136095047
@@ -5318,32 +5318,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.2021484375,
"learning_rate": 0.0009999166319537703,
- "loss": 0.0573,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 903506.0,
"repeat_count": 0.0,
- "routers_loss": 0.019048813730478287,
+ "routers_loss": 0.019375264644622803,
"skip_count": 0.0,
"step": 560,
"text_loss": 0.4603337347507477
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 2.638685060170238,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1435546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.146484375,
"learning_rate": 0.0009999108842557748,
- "loss": 0.0947,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0953,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 906380.0,
"repeat_count": 0.0,
- "routers_loss": 0.11889495700597763,
+ "routers_loss": 0.12013207376003265,
"skip_count": 3.0,
"step": 562,
"text_loss": 0.6279402375221252
@@ -5356,13 +5356,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.228515625,
+ "grad_norm": 0.255859375,
"learning_rate": 0.0009999049449909854,
- "loss": 0.0771,
+ "loss": 0.0799,
"macro_f1": 0.3272727429866791,
"num_tokens": 909116.0,
"repeat_count": 0.0,
- "routers_loss": 0.06202332302927971,
+ "routers_loss": 0.06441342830657959,
"skip_count": 1.0,
"step": 564,
"text_loss": 0.23741699755191803
@@ -5375,13 +5375,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009998988141616781,
- "loss": 0.0623,
+ "loss": 0.064,
"macro_f1": 0.32098767161369324,
"num_tokens": 912189.0,
"repeat_count": 0.0,
- "routers_loss": 0.08294244855642319,
+ "routers_loss": 0.08309414982795715,
"skip_count": 1.0,
"step": 566,
"text_loss": 0.27780941128730774
@@ -5394,13 +5394,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009998924917702023,
- "loss": 0.0885,
+ "loss": 0.0876,
"macro_f1": 0.3272727429866791,
"num_tokens": 916279.0,
"repeat_count": 1.0,
- "routers_loss": 0.07545182853937149,
+ "routers_loss": 0.07197169959545135,
"skip_count": 0.0,
"step": 568,
"text_loss": 0.6371755599975586
@@ -5413,13 +5413,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2255859375,
"learning_rate": 0.0009998859778189806,
- "loss": 0.0712,
+ "loss": 0.0706,
"macro_f1": 0.3333333432674408,
"num_tokens": 919490.0,
"repeat_count": 0.0,
- "routers_loss": 0.008711219765245914,
+ "routers_loss": 0.008022273890674114,
"skip_count": 0.0,
"step": 570,
"text_loss": 0.6028938889503479
@@ -5432,13 +5432,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.000999879272310509,
- "loss": 0.0837,
+ "loss": 0.084,
"macro_f1": 0.3333333432674408,
"num_tokens": 923694.0,
"repeat_count": 0.0,
- "routers_loss": 0.01639273390173912,
+ "routers_loss": 0.01634674146771431,
"skip_count": 0.0,
"step": 572,
"text_loss": 0.7177054286003113
@@ -5451,13 +5451,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.17578125,
"learning_rate": 0.0009998723752473574,
- "loss": 0.0707,
+ "loss": 0.0716,
"macro_f1": 0.3272727429866791,
"num_tokens": 926933.0,
"repeat_count": 0.0,
- "routers_loss": 0.04997137933969498,
+ "routers_loss": 0.060559045523405075,
"skip_count": 1.0,
"step": 574,
"text_loss": 0.5203254818916321
@@ -5470,13 +5470,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1845703125,
+ "grad_norm": 0.185546875,
"learning_rate": 0.0009998652866321687,
- "loss": 0.0799,
+ "loss": 0.0801,
"macro_f1": 0.3333333432674408,
"num_tokens": 929832.0,
"repeat_count": 0.0,
- "routers_loss": 0.011360209435224533,
+ "routers_loss": 0.011485611088573933,
"skip_count": 0.0,
"step": 576,
"text_loss": 0.6147452592849731
@@ -5489,13 +5489,13 @@
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.000999858006467659,
- "loss": 0.0658,
+ "loss": 0.0649,
"macro_f1": 0.29333335161209106,
"num_tokens": 933266.0,
"repeat_count": 2.0,
- "routers_loss": 0.31349560618400574,
+ "routers_loss": 0.2929030954837799,
"skip_count": 4.0,
"step": 578,
"text_loss": 0.1720666140317917
@@ -5508,13 +5508,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.24609375,
"learning_rate": 0.0009998505347566186,
- "loss": 0.0801,
+ "loss": 0.0782,
"macro_f1": 0.32098764181137085,
"num_tokens": 937545.0,
"repeat_count": 0.0,
- "routers_loss": 0.058660347014665604,
+ "routers_loss": 0.053780000656843185,
"skip_count": 2.0,
"step": 580,
"text_loss": 0.3258405327796936
@@ -5527,13 +5527,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1416015625,
"learning_rate": 0.00099984287150191,
- "loss": 0.0578,
+ "loss": 0.0582,
"macro_f1": 0.3333333432674408,
"num_tokens": 941001.0,
"repeat_count": 0.0,
- "routers_loss": 0.025836754590272903,
+ "routers_loss": 0.02637636847794056,
"skip_count": 0.0,
"step": 582,
"text_loss": 0.23762771487236023
@@ -5546,13 +5546,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009998350167064705,
- "loss": 0.0683,
+ "loss": 0.0672,
"macro_f1": 0.3333333432674408,
"num_tokens": 943989.0,
"repeat_count": 0.0,
- "routers_loss": 0.016504868865013123,
+ "routers_loss": 0.01637580618262291,
"skip_count": 0.0,
"step": 584,
"text_loss": 0.7460582852363586
@@ -5565,13 +5565,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1787109375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009998269703733096,
- "loss": 0.0685,
+ "loss": 0.0686,
"macro_f1": 0.3272727429866791,
"num_tokens": 947245.0,
"repeat_count": 1.0,
- "routers_loss": 0.1379794180393219,
+ "routers_loss": 0.13934117555618286,
"skip_count": 0.0,
"step": 586,
"text_loss": 0.5284690260887146
@@ -5584,13 +5584,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.13671875,
"learning_rate": 0.0009998187325055106,
- "loss": 0.0657,
+ "loss": 0.0667,
"macro_f1": 0.3333333432674408,
"num_tokens": 950116.0,
"repeat_count": 0.0,
- "routers_loss": 0.01802757754921913,
+ "routers_loss": 0.02138397842645645,
"skip_count": 0.0,
"step": 588,
"text_loss": 0.3920256197452545
@@ -5603,13 +5603,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009998103031062305,
- "loss": 0.0762,
+ "loss": 0.0778,
"macro_f1": 0.3333333432674408,
"num_tokens": 953277.0,
"repeat_count": 0.0,
- "routers_loss": 0.006902900990098715,
+ "routers_loss": 0.007098200265318155,
"skip_count": 0.0,
"step": 590,
"text_loss": 0.7472905516624451
@@ -5622,13 +5622,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3046875,
+ "grad_norm": 0.318359375,
"learning_rate": 0.0009998016821786994,
- "loss": 0.0912,
+ "loss": 0.0872,
"macro_f1": 0.32098764181137085,
"num_tokens": 958229.0,
"repeat_count": 1.0,
- "routers_loss": 0.08348741382360458,
+ "routers_loss": 0.07946522533893585,
"skip_count": 1.0,
"step": 592,
"text_loss": 0.5506448745727539
@@ -5641,13 +5641,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.000999792869726221,
- "loss": 0.0527,
+ "loss": 0.0523,
"macro_f1": 0.3272727429866791,
"num_tokens": 961016.0,
"repeat_count": 0.0,
- "routers_loss": 0.08290062099695206,
+ "routers_loss": 0.0850791186094284,
"skip_count": 1.0,
"step": 594,
"text_loss": 0.3824431002140045
@@ -5660,13 +5660,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009997838657521717,
- "loss": 0.0643,
+ "loss": 0.0632,
"macro_f1": 0.3333333432674408,
"num_tokens": 963847.0,
"repeat_count": 0.0,
- "routers_loss": 0.018620988354086876,
+ "routers_loss": 0.016370445489883423,
"skip_count": 0.0,
"step": 596,
"text_loss": 0.2139475792646408
@@ -5679,13 +5679,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009997746702600026,
- "loss": 0.073,
+ "loss": 0.0702,
"macro_f1": 0.307692289352417,
"num_tokens": 966619.0,
"repeat_count": 0.0,
- "routers_loss": 0.1211671382188797,
+ "routers_loss": 0.1310746818780899,
"skip_count": 3.0,
"step": 598,
"text_loss": 0.3651018440723419
@@ -5698,13 +5698,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2353515625,
+ "grad_norm": 0.23828125,
"learning_rate": 0.0009997652832532372,
- "loss": 0.079,
+ "loss": 0.0792,
"macro_f1": 0.3272727429866791,
"num_tokens": 970418.0,
"repeat_count": 1.0,
- "routers_loss": 0.15485027432441711,
+ "routers_loss": 0.14303378760814667,
"skip_count": 0.0,
"step": 600,
"text_loss": 0.7094736099243164
@@ -5717,13 +5717,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009997557047354722,
- "loss": 0.0562,
+ "loss": 0.0531,
"macro_f1": 0.3272727429866791,
"num_tokens": 973491.0,
"repeat_count": 0.0,
- "routers_loss": 0.036684274673461914,
+ "routers_loss": 0.03334212675690651,
"skip_count": 1.0,
"step": 602,
"text_loss": 0.4812237024307251
@@ -5731,18 +5731,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 2.835926034634576,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.302734375,
+ "grad_norm": 0.2890625,
"learning_rate": 0.0009997459347103783,
- "loss": 0.0985,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0956,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 976672.0,
"repeat_count": 0.0,
- "routers_loss": 0.026901578530669212,
+ "routers_loss": 0.02831871062517166,
"skip_count": 0.0,
"step": 604,
"text_loss": 0.21737146377563477
@@ -5755,13 +5755,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12158203125,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009997359731816998,
- "loss": 0.0632,
+ "loss": 0.0646,
"macro_f1": 0.3333333432674408,
"num_tokens": 979898.0,
"repeat_count": 0.0,
- "routers_loss": 0.01700405217707157,
+ "routers_loss": 0.017968013882637024,
"skip_count": 0.0,
"step": 606,
"text_loss": 0.5458008050918579
@@ -5774,13 +5774,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2099609375,
+ "grad_norm": 0.224609375,
"learning_rate": 0.0009997258201532536,
- "loss": 0.0758,
+ "loss": 0.0751,
"macro_f1": 0.3333333432674408,
"num_tokens": 982811.0,
"repeat_count": 0.0,
- "routers_loss": 0.015013590455055237,
+ "routers_loss": 0.016256732866168022,
"skip_count": 0.0,
"step": 608,
"text_loss": 0.8643257021903992
@@ -5793,13 +5793,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009997154756289303,
- "loss": 0.0576,
+ "loss": 0.0561,
"macro_f1": 0.3333333432674408,
"num_tokens": 985245.0,
"repeat_count": 0.0,
- "routers_loss": 0.02037946693599224,
+ "routers_loss": 0.021214161068201065,
"skip_count": 0.0,
"step": 610,
"text_loss": 0.2204967886209488
@@ -5812,13 +5812,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.150390625,
"learning_rate": 0.000999704939612694,
- "loss": 0.0648,
+ "loss": 0.0636,
"macro_f1": 0.3006536364555359,
"num_tokens": 988539.0,
"repeat_count": 3.0,
- "routers_loss": 0.22834022343158722,
+ "routers_loss": 0.23249399662017822,
"skip_count": 2.0,
"step": 612,
"text_loss": 0.32489025592803955
@@ -5831,13 +5831,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009996942121085824,
- "loss": 0.0449,
+ "loss": 0.0445,
"macro_f1": 0.3333333432674408,
"num_tokens": 991660.0,
"repeat_count": 0.0,
- "routers_loss": 0.009838113561272621,
+ "routers_loss": 0.010706410743296146,
"skip_count": 0.0,
"step": 614,
"text_loss": 0.4551754891872406
@@ -5850,13 +5850,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.353515625,
+ "grad_norm": 0.3671875,
"learning_rate": 0.000999683293120706,
- "loss": 0.1009,
+ "loss": 0.1016,
"macro_f1": 0.3333333432674408,
"num_tokens": 994828.0,
"repeat_count": 0.0,
- "routers_loss": 0.005943270865827799,
+ "routers_loss": 0.006676184479147196,
"skip_count": 0.0,
"step": 616,
"text_loss": 0.6212068200111389
@@ -5869,13 +5869,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.38671875,
+ "grad_norm": 0.408203125,
"learning_rate": 0.0009996721826532491,
- "loss": 0.0941,
+ "loss": 0.0976,
"macro_f1": 0.3076923191547394,
"num_tokens": 997951.0,
"repeat_count": 2.0,
- "routers_loss": 0.21597740054130554,
+ "routers_loss": 0.2148125320672989,
"skip_count": 2.0,
"step": 618,
"text_loss": 0.26514527201652527
@@ -5888,13 +5888,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.1904296875,
"learning_rate": 0.000999660880710469,
- "loss": 0.0896,
+ "loss": 0.0909,
"macro_f1": 0.3333333432674408,
"num_tokens": 1001139.0,
"repeat_count": 0.0,
- "routers_loss": 0.023726588115096092,
+ "routers_loss": 0.022332455962896347,
"skip_count": 0.0,
"step": 620,
"text_loss": 0.26131340861320496
@@ -5907,13 +5907,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009996493872966971,
"loss": 0.0732,
"macro_f1": 0.3272727429866791,
"num_tokens": 1003678.0,
"repeat_count": 1.0,
- "routers_loss": 0.08467255532741547,
+ "routers_loss": 0.08348730951547623,
"skip_count": 0.0,
"step": 622,
"text_loss": 0.19151706993579865
@@ -5926,13 +5926,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009996377024163374,
- "loss": 0.0816,
+ "loss": 0.0822,
"macro_f1": 0.3333333432674408,
"num_tokens": 1007082.0,
"repeat_count": 0.0,
- "routers_loss": 0.029468854889273643,
+ "routers_loss": 0.028577150776982307,
"skip_count": 0.0,
"step": 624,
"text_loss": 0.305387407541275
@@ -5945,13 +5945,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.11279296875,
"learning_rate": 0.0009996258260738676,
- "loss": 0.0891,
+ "loss": 0.0892,
"macro_f1": 0.3272727429866791,
"num_tokens": 1010064.0,
"repeat_count": 1.0,
- "routers_loss": 0.09438466280698776,
+ "routers_loss": 0.08312026411294937,
"skip_count": 0.0,
"step": 626,
"text_loss": 0.49436143040657043
@@ -5964,13 +5964,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009996137582738388,
- "loss": 0.0581,
+ "loss": 0.0591,
"macro_f1": 0.3333333432674408,
"num_tokens": 1013462.0,
"repeat_count": 0.0,
- "routers_loss": 0.013679586350917816,
+ "routers_loss": 0.013337327167391777,
"skip_count": 0.0,
"step": 628,
"text_loss": 0.6515294313430786
@@ -5983,13 +5983,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.140625,
"learning_rate": 0.000999601499020875,
- "loss": 0.0528,
+ "loss": 0.0537,
"macro_f1": 0.3333333432674408,
"num_tokens": 1016246.0,
"repeat_count": 0.0,
- "routers_loss": 0.029532987624406815,
+ "routers_loss": 0.029126765206456184,
"skip_count": 0.0,
"step": 630,
"text_loss": 0.18834827840328217
@@ -6002,13 +6002,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009995890483196746,
- "loss": 0.0601,
+ "loss": 0.0602,
"macro_f1": 0.3272727429866791,
"num_tokens": 1019286.0,
"repeat_count": 0.0,
- "routers_loss": 0.05516733601689339,
+ "routers_loss": 0.054844800382852554,
"skip_count": 1.0,
"step": 632,
"text_loss": 0.6988179087638855
@@ -6021,13 +6021,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.357421875,
+ "grad_norm": 0.322265625,
"learning_rate": 0.0009995764061750086,
- "loss": 0.0785,
+ "loss": 0.0767,
"macro_f1": 0.3333333432674408,
"num_tokens": 1022207.0,
"repeat_count": 0.0,
- "routers_loss": 0.010254866443574429,
+ "routers_loss": 0.010095693171024323,
"skip_count": 0.0,
"step": 634,
"text_loss": 0.558451771736145
@@ -6040,13 +6040,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.2890625,
"learning_rate": 0.000999563572591721,
- "loss": 0.0518,
+ "loss": 0.0521,
"macro_f1": 0.32098764181137085,
"num_tokens": 1025319.0,
"repeat_count": 1.0,
- "routers_loss": 0.07528360933065414,
+ "routers_loss": 0.0698433518409729,
"skip_count": 1.0,
"step": 636,
"text_loss": 0.5961872935295105
@@ -6059,13 +6059,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1064453125,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009995505475747302,
- "loss": 0.0844,
+ "loss": 0.0849,
"macro_f1": 0.3272727429866791,
"num_tokens": 1028362.0,
"repeat_count": 0.0,
- "routers_loss": 0.04301584139466286,
+ "routers_loss": 0.040211405605077744,
"skip_count": 1.0,
"step": 638,
"text_loss": 0.546863317489624
@@ -6078,13 +6078,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11572265625,
+ "grad_norm": 0.119140625,
"learning_rate": 0.0009995373311290272,
- "loss": 0.0699,
+ "loss": 0.0709,
"macro_f1": 0.3144654333591461,
"num_tokens": 1032199.0,
"repeat_count": 2.0,
- "routers_loss": 0.14521080255508423,
+ "routers_loss": 0.1457643061876297,
"skip_count": 1.0,
"step": 640,
"text_loss": 0.2137298285961151
@@ -6097,13 +6097,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1328125,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.0009995239232596764,
- "loss": 0.0543,
+ "loss": 0.0545,
"macro_f1": 0.3333333432674408,
"num_tokens": 1035801.0,
"repeat_count": 0.0,
- "routers_loss": 0.01074797473847866,
+ "routers_loss": 0.011394930072128773,
"skip_count": 0.0,
"step": 642,
"text_loss": 0.43054503202438354
@@ -6116,13 +6116,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009995103239718163,
- "loss": 0.0659,
+ "loss": 0.0665,
"macro_f1": 0.3333333432674408,
"num_tokens": 1039223.0,
"repeat_count": 0.0,
- "routers_loss": 0.009271817281842232,
+ "routers_loss": 0.00997432041913271,
"skip_count": 0.0,
"step": 644,
"text_loss": 0.7749615907669067
@@ -6135,13 +6135,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.2275390625,
"learning_rate": 0.0009994965332706573,
- "loss": 0.0737,
+ "loss": 0.0755,
"macro_f1": 0.3144654333591461,
"num_tokens": 1042154.0,
"repeat_count": 3.0,
- "routers_loss": 0.10257050395011902,
+ "routers_loss": 0.10589150339365005,
"skip_count": 0.0,
"step": 646,
"text_loss": 0.7812211513519287
@@ -6154,13 +6154,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.0009994825511614846,
- "loss": 0.0363,
+ "loss": 0.0383,
"macro_f1": 0.3272727429866791,
"num_tokens": 1045250.0,
"repeat_count": 0.0,
- "routers_loss": 0.07091924548149109,
+ "routers_loss": 0.0748734176158905,
"skip_count": 1.0,
"step": 648,
"text_loss": 0.844803512096405
@@ -6173,13 +6173,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11572265625,
+ "grad_norm": 0.1220703125,
"learning_rate": 0.0009994683776496562,
- "loss": 0.0421,
+ "loss": 0.0433,
"macro_f1": 0.3272727429866791,
"num_tokens": 1048446.0,
"repeat_count": 0.0,
- "routers_loss": 0.034446243196725845,
+ "routers_loss": 0.03742415830492973,
"skip_count": 1.0,
"step": 650,
"text_loss": 0.2098839282989502
@@ -6192,13 +6192,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009994540127406034,
- "loss": 0.0593,
+ "loss": 0.0591,
"macro_f1": 0.32098764181137085,
"num_tokens": 1051840.0,
"repeat_count": 0.0,
- "routers_loss": 0.06077485531568527,
+ "routers_loss": 0.06025516986846924,
"skip_count": 2.0,
"step": 652,
"text_loss": 0.27727583050727844
@@ -6211,13 +6211,13 @@
"f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.181640625,
"learning_rate": 0.0009994394564398306,
- "loss": 0.0537,
+ "loss": 0.0519,
"macro_f1": 0.521541953086853,
"num_tokens": 1055142.0,
"repeat_count": 4.0,
- "routers_loss": 0.2382282167673111,
+ "routers_loss": 0.22807340323925018,
"skip_count": 2.0,
"step": 654,
"text_loss": 0.9672397971153259
@@ -6230,13 +6230,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.142578125,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009994247087529158,
- "loss": 0.0613,
+ "loss": 0.0618,
"macro_f1": 0.3333333432674408,
"num_tokens": 1057698.0,
"repeat_count": 0.0,
- "routers_loss": 0.011971636675298214,
+ "routers_loss": 0.01348950993269682,
"skip_count": 0.0,
"step": 656,
"text_loss": 0.6375506520271301
@@ -6249,13 +6249,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.212890625,
+ "grad_norm": 0.1953125,
"learning_rate": 0.0009994097696855106,
- "loss": 0.0414,
+ "loss": 0.0412,
"macro_f1": 0.3333333432674408,
"num_tokens": 1060624.0,
"repeat_count": 0.0,
- "routers_loss": 0.010221127420663834,
+ "routers_loss": 0.009649243205785751,
"skip_count": 0.0,
"step": 658,
"text_loss": 0.5315385460853577
@@ -6268,13 +6268,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2265625,
+ "grad_norm": 0.2041015625,
"learning_rate": 0.0009993946392433395,
- "loss": 0.061,
+ "loss": 0.0609,
"macro_f1": 0.307692289352417,
"num_tokens": 1065076.0,
"repeat_count": 0.0,
- "routers_loss": 0.11860335618257523,
+ "routers_loss": 0.1250980943441391,
"skip_count": 3.0,
"step": 660,
"text_loss": 0.25780341029167175
@@ -6287,13 +6287,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009993793174322006,
- "loss": 0.0485,
+ "loss": 0.0471,
"macro_f1": 0.3333333432674408,
"num_tokens": 1068365.0,
"repeat_count": 0.0,
- "routers_loss": 0.011139829643070698,
+ "routers_loss": 0.011544390581548214,
"skip_count": 0.0,
"step": 662,
"text_loss": 0.34876301884651184
@@ -6306,13 +6306,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.166015625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009993638042579654,
- "loss": 0.0478,
+ "loss": 0.0473,
"macro_f1": 0.3272727429866791,
"num_tokens": 1071693.0,
"repeat_count": 0.0,
- "routers_loss": 0.03978770971298218,
+ "routers_loss": 0.03777370601892471,
"skip_count": 1.0,
"step": 664,
"text_loss": 0.21811571717262268
@@ -6327,11 +6327,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.203125,
"learning_rate": 0.0009993480997265783,
- "loss": 0.0481,
+ "loss": 0.0475,
"macro_f1": 0.5492662787437439,
"num_tokens": 1074733.0,
"repeat_count": 0.0,
- "routers_loss": 0.051231011748313904,
+ "routers_loss": 0.049949806183576584,
"skip_count": 2.0,
"step": 666,
"text_loss": 0.38410288095474243
@@ -6344,13 +6344,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.10302734375,
"learning_rate": 0.0009993322038440572,
- "loss": 0.0615,
+ "loss": 0.0605,
"macro_f1": 0.3333333432674408,
"num_tokens": 1077993.0,
"repeat_count": 0.0,
- "routers_loss": 0.024917088449001312,
+ "routers_loss": 0.0247171800583601,
"skip_count": 0.0,
"step": 668,
"text_loss": 0.25576895475387573
@@ -6363,13 +6363,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1982421875,
+ "grad_norm": 0.216796875,
"learning_rate": 0.000999316116616494,
- "loss": 0.0627,
+ "loss": 0.0619,
"macro_f1": 0.3333333432674408,
"num_tokens": 1080491.0,
"repeat_count": 0.0,
- "routers_loss": 0.008834881708025932,
+ "routers_loss": 0.008118715137243271,
"skip_count": 0.0,
"step": 670,
"text_loss": 0.6269792914390564
@@ -6382,13 +6382,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.21875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009992998380500527,
"loss": 0.0462,
"macro_f1": 0.3272727429866791,
"num_tokens": 1083817.0,
"repeat_count": 0.0,
- "routers_loss": 0.033405229449272156,
+ "routers_loss": 0.03366057574748993,
"skip_count": 1.0,
"step": 672,
"text_loss": 0.26891493797302246
@@ -6401,13 +6401,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.13671875,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009992833681509716,
- "loss": 0.0523,
+ "loss": 0.0529,
"macro_f1": 0.3333333432674408,
"num_tokens": 1087368.0,
"repeat_count": 0.0,
- "routers_loss": 0.020753704011440277,
+ "routers_loss": 0.020552074536681175,
"skip_count": 0.0,
"step": 674,
"text_loss": 0.14421936869621277
@@ -6420,13 +6420,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.18359375,
"learning_rate": 0.0009992667069255619,
- "loss": 0.0698,
+ "loss": 0.0696,
"macro_f1": 0.31446540355682373,
"num_tokens": 1090452.0,
"repeat_count": 0.0,
- "routers_loss": 0.06932353973388672,
+ "routers_loss": 0.06937336176633835,
"skip_count": 2.0,
"step": 676,
"text_loss": 0.24999259412288666
@@ -6439,13 +6439,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.0009992498543802085,
- "loss": 0.059,
+ "loss": 0.0588,
"macro_f1": 0.3272727429866791,
"num_tokens": 1093996.0,
"repeat_count": 1.0,
- "routers_loss": 0.032903749495744705,
+ "routers_loss": 0.0380021296441555,
"skip_count": 0.0,
"step": 678,
"text_loss": 0.42473849654197693
@@ -6458,32 +6458,32 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.2099609375,
+ "grad_norm": 0.2119140625,
"learning_rate": 0.0009992328105213688,
- "loss": 0.0417,
+ "loss": 0.0411,
"macro_f1": 0.4400000274181366,
"num_tokens": 1096837.0,
"repeat_count": 1.0,
- "routers_loss": 0.19733747839927673,
+ "routers_loss": 0.20885063707828522,
"skip_count": 4.0,
"step": 680,
"text_loss": 0.3829527199268341
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.2019371881420606,
- "f1_execute": 1.0,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.154296875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009992155753555747,
- "loss": 0.0729,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0722,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1100320.0,
"repeat_count": 0.0,
- "routers_loss": 0.013452666811645031,
+ "routers_loss": 0.018230699002742767,
"skip_count": 2.0,
"step": 682,
"text_loss": 0.6190969944000244
@@ -6496,13 +6496,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.30859375,
"learning_rate": 0.0009991981488894303,
"loss": 0.0681,
"macro_f1": 0.32098767161369324,
"num_tokens": 1103682.0,
"repeat_count": 0.0,
- "routers_loss": 0.05302857980132103,
+ "routers_loss": 0.05550144240260124,
"skip_count": 1.0,
"step": 684,
"text_loss": 0.44418027997016907
@@ -6515,13 +6515,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.2158203125,
"learning_rate": 0.0009991805311296133,
- "loss": 0.0527,
+ "loss": 0.0507,
"macro_f1": 0.32098764181137085,
"num_tokens": 1106427.0,
"repeat_count": 0.0,
- "routers_loss": 0.08124994486570358,
+ "routers_loss": 0.07990608364343643,
"skip_count": 2.0,
"step": 686,
"text_loss": 0.5577231645584106
@@ -6534,13 +6534,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.22265625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009991627220828753,
- "loss": 0.0579,
+ "loss": 0.0568,
"macro_f1": 0.32098764181137085,
"num_tokens": 1109314.0,
"repeat_count": 0.0,
- "routers_loss": 0.058633625507354736,
+ "routers_loss": 0.05167485028505325,
"skip_count": 2.0,
"step": 688,
"text_loss": 0.27325430512428284
@@ -6553,13 +6553,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009991447217560408,
- "loss": 0.0533,
+ "loss": 0.0521,
"macro_f1": 0.5492662787437439,
"num_tokens": 1112748.0,
"repeat_count": 0.0,
- "routers_loss": 0.04703643172979355,
+ "routers_loss": 0.04621964320540428,
"skip_count": 2.0,
"step": 690,
"text_loss": 0.5288321375846863
@@ -6572,13 +6572,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.000999126530156007,
- "loss": 0.0485,
+ "loss": 0.0499,
"macro_f1": 0.307692289352417,
"num_tokens": 1116965.0,
"repeat_count": 1.0,
- "routers_loss": 0.11615128815174103,
+ "routers_loss": 0.11950276792049408,
"skip_count": 2.0,
"step": 692,
"text_loss": 0.14215624332427979
@@ -6591,13 +6591,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2314453125,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.0009991081472897454,
- "loss": 0.0718,
+ "loss": 0.0722,
"macro_f1": 0.3333333432674408,
"num_tokens": 1120570.0,
"repeat_count": 0.0,
- "routers_loss": 0.017403846606612206,
+ "routers_loss": 0.01905500330030918,
"skip_count": 0.0,
"step": 694,
"text_loss": 0.41862696409225464
@@ -6610,13 +6610,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.0009990895731643002,
- "loss": 0.0444,
+ "loss": 0.0464,
"macro_f1": 0.3272727429866791,
"num_tokens": 1124009.0,
"repeat_count": 1.0,
- "routers_loss": 0.07067303359508514,
+ "routers_loss": 0.06974572688341141,
"skip_count": 0.0,
"step": 696,
"text_loss": 0.41160130500793457
@@ -6629,13 +6629,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.000999070807786789,
- "loss": 0.0527,
+ "loss": 0.0531,
"macro_f1": 0.3272727429866791,
"num_tokens": 1127370.0,
"repeat_count": 1.0,
- "routers_loss": 0.07131028175354004,
+ "routers_loss": 0.07055293023586273,
"skip_count": 0.0,
"step": 698,
"text_loss": 0.48068273067474365
@@ -6648,13 +6648,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.18359375,
+ "grad_norm": 0.197265625,
"learning_rate": 0.000999051851164403,
- "loss": 0.0629,
+ "loss": 0.0619,
"macro_f1": 0.32098764181137085,
"num_tokens": 1130234.0,
"repeat_count": 1.0,
- "routers_loss": 0.1152748316526413,
+ "routers_loss": 0.12506946921348572,
"skip_count": 1.0,
"step": 700,
"text_loss": 0.47925490140914917
@@ -6667,13 +6667,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.216796875,
+ "grad_norm": 0.1943359375,
"learning_rate": 0.000999032703304406,
- "loss": 0.0663,
+ "loss": 0.0674,
"macro_f1": 0.3333333432674408,
"num_tokens": 1132874.0,
"repeat_count": 0.0,
- "routers_loss": 0.0077212234027683735,
+ "routers_loss": 0.00809287466108799,
"skip_count": 0.0,
"step": 702,
"text_loss": 0.47433632612228394
@@ -6686,13 +6686,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.1064453125,
"learning_rate": 0.0009990133642141358,
- "loss": 0.0494,
+ "loss": 0.0497,
"macro_f1": 0.5492662787437439,
"num_tokens": 1136011.0,
"repeat_count": 0.0,
- "routers_loss": 0.02726336568593979,
+ "routers_loss": 0.0319170281291008,
"skip_count": 2.0,
"step": 704,
"text_loss": 0.6574832201004028
@@ -6705,13 +6705,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.32421875,
+ "grad_norm": 0.33984375,
"learning_rate": 0.000998993833901003,
- "loss": 0.0615,
+ "loss": 0.0619,
"macro_f1": 0.32098764181137085,
"num_tokens": 1139674.0,
"repeat_count": 0.0,
- "routers_loss": 0.0958542674779892,
+ "routers_loss": 0.09850362688302994,
"skip_count": 2.0,
"step": 706,
"text_loss": 0.7660127282142639
@@ -6724,13 +6724,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009989741123724919,
- "loss": 0.0583,
+ "loss": 0.0574,
"macro_f1": 0.3333333432674408,
"num_tokens": 1143558.0,
"repeat_count": 0.0,
- "routers_loss": 0.007100600749254227,
+ "routers_loss": 0.006673311349004507,
"skip_count": 0.0,
"step": 708,
"text_loss": 0.5976111888885498
@@ -6743,13 +6743,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009989541996361594,
- "loss": 0.0445,
+ "loss": 0.045,
"macro_f1": 0.3333333432674408,
"num_tokens": 1146122.0,
"repeat_count": 0.0,
- "routers_loss": 0.0047812811098992825,
+ "routers_loss": 0.004988791421055794,
"skip_count": 0.0,
"step": 710,
"text_loss": 0.5256119966506958
@@ -6762,13 +6762,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009989340956996367,
- "loss": 0.052,
+ "loss": 0.0528,
"macro_f1": 0.3333333432674408,
"num_tokens": 1149546.0,
"repeat_count": 0.0,
- "routers_loss": 0.006643407512456179,
+ "routers_loss": 0.0067769973538815975,
"skip_count": 0.0,
"step": 712,
"text_loss": 0.5040497779846191
@@ -6781,13 +6781,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2890625,
+ "grad_norm": 0.26953125,
"learning_rate": 0.0009989138005706273,
- "loss": 0.0719,
+ "loss": 0.0735,
"macro_f1": 0.32098764181137085,
"num_tokens": 1153195.0,
"repeat_count": 0.0,
- "routers_loss": 0.0910436138510704,
+ "routers_loss": 0.09899546951055527,
"skip_count": 2.0,
"step": 714,
"text_loss": 0.20803412795066833
@@ -6800,13 +6800,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1484375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.000998893314256908,
- "loss": 0.0649,
+ "loss": 0.064,
"macro_f1": 0.3333333432674408,
"num_tokens": 1157081.0,
"repeat_count": 0.0,
- "routers_loss": 0.010978946462273598,
+ "routers_loss": 0.010492355562746525,
"skip_count": 0.0,
"step": 716,
"text_loss": 0.23077639937400818
@@ -6819,13 +6819,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009988726367663298,
- "loss": 0.0543,
+ "loss": 0.0539,
"macro_f1": 0.3333333432674408,
"num_tokens": 1160079.0,
"repeat_count": 0.0,
- "routers_loss": 0.009956461377441883,
+ "routers_loss": 0.01063773687928915,
"skip_count": 0.0,
"step": 718,
"text_loss": 0.6085864901542664
@@ -6838,13 +6838,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009988517681068163,
- "loss": 0.0412,
+ "loss": 0.0421,
"macro_f1": 0.3272727429866791,
"num_tokens": 1163249.0,
"repeat_count": 1.0,
- "routers_loss": 0.057210199534893036,
+ "routers_loss": 0.05981874838471413,
"skip_count": 0.0,
"step": 720,
"text_loss": 0.4047050476074219
@@ -6857,32 +6857,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009988307082863638,
- "loss": 0.0364,
+ "loss": 0.0361,
"macro_f1": 0.3333333432674408,
"num_tokens": 1166259.0,
"repeat_count": 0.0,
- "routers_loss": 0.01035996899008751,
+ "routers_loss": 0.009750043973326683,
"skip_count": 0.0,
"step": 722,
"text_loss": 0.5306474566459656
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.3991781626063986,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.2412109375,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.240234375,
"learning_rate": 0.0009988094573130434,
- "loss": 0.0661,
- "macro_f1": 0.3076923191547394,
+ "loss": 0.063,
+ "macro_f1": 0.5359477400779724,
"num_tokens": 1168887.0,
"repeat_count": 2.0,
- "routers_loss": 0.18087820708751678,
+ "routers_loss": 0.18601104617118835,
"skip_count": 2.0,
"step": 724,
"text_loss": 0.53528892993927
@@ -6895,32 +6895,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009987880151949974,
- "loss": 0.0505,
+ "loss": 0.0496,
"macro_f1": 0.3272727429866791,
"num_tokens": 1172625.0,
"repeat_count": 0.0,
- "routers_loss": 0.04720238968729973,
+ "routers_loss": 0.02845010720193386,
"skip_count": 1.0,
"step": 726,
"text_loss": 0.4760453701019287
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 3.417963017317288,
- "f1_execute": 1.0,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.2216796875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.2177734375,
"learning_rate": 0.0009987663819404434,
- "loss": 0.0603,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.06,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1176580.0,
"repeat_count": 0.0,
- "routers_loss": 0.015407778322696686,
+ "routers_loss": 0.017596980556845665,
"skip_count": 2.0,
"step": 728,
"text_loss": 0.5146099328994751
@@ -6933,13 +6933,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.000998744557557671,
- "loss": 0.0489,
+ "loss": 0.0484,
"macro_f1": 0.3272727429866791,
"num_tokens": 1179804.0,
"repeat_count": 0.0,
- "routers_loss": 0.060891781002283096,
+ "routers_loss": 0.0625474750995636,
"skip_count": 1.0,
"step": 730,
"text_loss": 0.27738022804260254
@@ -6947,18 +6947,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.436747872028177,
- "f1_execute": 0.943396270275116,
+ "f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.203125,
"learning_rate": 0.0009987225420550433,
- "loss": 0.0825,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0796,
+ "macro_f1": 0.307692289352417,
"num_tokens": 1182658.0,
"repeat_count": 1.0,
- "routers_loss": 0.1661442220211029,
+ "routers_loss": 0.16188351809978485,
"skip_count": 2.0,
"step": 732,
"text_loss": 0.23231445252895355
@@ -6966,18 +6966,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.446140299383622,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.2001953125,
"learning_rate": 0.0009987003354409965,
- "loss": 0.0634,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0626,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1185451.0,
"repeat_count": 0.0,
- "routers_loss": 0.02108248695731163,
+ "routers_loss": 0.02391529455780983,
"skip_count": 0.0,
"step": 734,
"text_loss": 0.4496627151966095
@@ -6990,13 +6990,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.248046875,
+ "grad_norm": 0.234375,
"learning_rate": 0.0009986779377240405,
- "loss": 0.0534,
+ "loss": 0.0513,
"macro_f1": 0.32098767161369324,
"num_tokens": 1188666.0,
"repeat_count": 0.0,
- "routers_loss": 0.08318125456571579,
+ "routers_loss": 0.08435963839292526,
"skip_count": 1.0,
"step": 736,
"text_loss": 0.4950787127017975
@@ -7009,13 +7009,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11962890625,
+ "grad_norm": 0.1220703125,
"learning_rate": 0.000998655348912758,
- "loss": 0.0514,
+ "loss": 0.0515,
"macro_f1": 0.3333333432674408,
"num_tokens": 1193035.0,
"repeat_count": 0.0,
- "routers_loss": 0.015889234840869904,
+ "routers_loss": 0.01648722216486931,
"skip_count": 0.0,
"step": 738,
"text_loss": 0.24761848151683807
@@ -7028,13 +7028,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1513671875,
"learning_rate": 0.0009986325690158051,
"loss": 0.0435,
"macro_f1": 0.3333333432674408,
"num_tokens": 1196840.0,
"repeat_count": 0.0,
- "routers_loss": 0.01378484908491373,
+ "routers_loss": 0.013143910095095634,
"skip_count": 0.0,
"step": 740,
"text_loss": 0.15662719309329987
@@ -7047,13 +7047,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1787109375,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009986095980419113,
- "loss": 0.076,
+ "loss": 0.0757,
"macro_f1": 0.3333333432674408,
"num_tokens": 1200573.0,
"repeat_count": 0.0,
- "routers_loss": 0.02673683874309063,
+ "routers_loss": 0.026706280186772346,
"skip_count": 0.0,
"step": 742,
"text_loss": 0.16725164651870728
@@ -7066,13 +7066,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.185546875,
+ "grad_norm": 0.1982421875,
"learning_rate": 0.0009985864359998787,
- "loss": 0.0778,
+ "loss": 0.0795,
"macro_f1": 0.3006536364555359,
"num_tokens": 1203589.0,
"repeat_count": 2.0,
- "routers_loss": 0.27776041626930237,
+ "routers_loss": 0.28607678413391113,
"skip_count": 3.0,
"step": 744,
"text_loss": 0.6350882053375244
@@ -7085,13 +7085,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009985630828985835,
- "loss": 0.0575,
+ "loss": 0.0572,
"macro_f1": 0.3272727429866791,
"num_tokens": 1206422.0,
"repeat_count": 0.0,
- "routers_loss": 0.0575483962893486,
+ "routers_loss": 0.05685260891914368,
"skip_count": 1.0,
"step": 746,
"text_loss": 0.33779552578926086
@@ -7104,13 +7104,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009985395387469742,
- "loss": 0.0478,
+ "loss": 0.0458,
"macro_f1": 0.5492662787437439,
"num_tokens": 1211588.0,
"repeat_count": 0.0,
- "routers_loss": 0.0458797849714756,
+ "routers_loss": 0.0437830351293087,
"skip_count": 2.0,
"step": 748,
"text_loss": 0.28664472699165344
@@ -7123,13 +7123,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.15625,
"learning_rate": 0.0009985158035540735,
- "loss": 0.0701,
+ "loss": 0.0714,
"macro_f1": 0.32098764181137085,
"num_tokens": 1214580.0,
"repeat_count": 2.0,
- "routers_loss": 0.07850238680839539,
+ "routers_loss": 0.07074898481369019,
"skip_count": 0.0,
"step": 750,
"text_loss": 0.3939313292503357
@@ -7142,13 +7142,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.21484375,
"learning_rate": 0.0009984918773289762,
- "loss": 0.0702,
+ "loss": 0.0699,
"macro_f1": 0.3333333432674408,
"num_tokens": 1217388.0,
"repeat_count": 0.0,
- "routers_loss": 0.009507967159152031,
+ "routers_loss": 0.009757856838405132,
"skip_count": 0.0,
"step": 752,
"text_loss": 0.37641215324401855
@@ -7161,13 +7161,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1484375,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009984677600808512,
- "loss": 0.0543,
+ "loss": 0.054,
"macro_f1": 0.3333333432674408,
"num_tokens": 1219960.0,
"repeat_count": 0.0,
- "routers_loss": 0.02620997279882431,
+ "routers_loss": 0.02515069581568241,
"skip_count": 0.0,
"step": 754,
"text_loss": 0.155938982963562
@@ -7180,13 +7180,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3359375,
+ "grad_norm": 0.30078125,
"learning_rate": 0.0009984434518189405,
- "loss": 0.0791,
+ "loss": 0.0764,
"macro_f1": 0.3333333432674408,
"num_tokens": 1223234.0,
"repeat_count": 0.0,
- "routers_loss": 0.02798631228506565,
+ "routers_loss": 0.025766927748918533,
"skip_count": 0.0,
"step": 756,
"text_loss": 0.691118061542511
@@ -7201,11 +7201,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1416015625,
"learning_rate": 0.0009984189525525584,
- "loss": 0.046,
+ "loss": 0.0451,
"macro_f1": 0.5359477400779724,
"num_tokens": 1225764.0,
"repeat_count": 2.0,
- "routers_loss": 0.16614431142807007,
+ "routers_loss": 0.1782722771167755,
"skip_count": 2.0,
"step": 758,
"text_loss": 0.3592209219932556
@@ -7218,13 +7218,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.193359375,
+ "grad_norm": 0.189453125,
"learning_rate": 0.0009983942622910935,
- "loss": 0.0669,
+ "loss": 0.0659,
"macro_f1": 0.3333333432674408,
"num_tokens": 1230097.0,
"repeat_count": 0.0,
- "routers_loss": 0.008541896007955074,
+ "routers_loss": 0.00825568474829197,
"skip_count": 0.0,
"step": 760,
"text_loss": 0.4646475315093994
@@ -7237,13 +7237,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009983693810440074,
- "loss": 0.0478,
+ "loss": 0.0477,
"macro_f1": 0.32098764181137085,
"num_tokens": 1233140.0,
"repeat_count": 0.0,
- "routers_loss": 0.045411624014377594,
+ "routers_loss": 0.04156976938247681,
"skip_count": 2.0,
"step": 762,
"text_loss": 0.298682302236557
@@ -7256,13 +7256,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.380859375,
+ "grad_norm": 0.3515625,
"learning_rate": 0.000998344308820834,
- "loss": 0.0689,
+ "loss": 0.0666,
"macro_f1": 0.3272727429866791,
"num_tokens": 1236305.0,
"repeat_count": 0.0,
- "routers_loss": 0.052299100905656815,
+ "routers_loss": 0.05697929114103317,
"skip_count": 1.0,
"step": 764,
"text_loss": 0.5249121189117432
@@ -7275,13 +7275,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.18359375,
"learning_rate": 0.0009983190456311817,
- "loss": 0.0602,
+ "loss": 0.0592,
"macro_f1": 0.3144654333591461,
"num_tokens": 1239673.0,
"repeat_count": 0.0,
- "routers_loss": 0.09140212833881378,
+ "routers_loss": 0.09547408670186996,
"skip_count": 3.0,
"step": 766,
"text_loss": 0.41277334094047546
@@ -7294,13 +7294,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.201171875,
+ "grad_norm": 0.185546875,
"learning_rate": 0.000998293591484731,
- "loss": 0.0475,
+ "loss": 0.0484,
"macro_f1": 0.5492662787437439,
"num_tokens": 1242292.0,
"repeat_count": 0.0,
- "routers_loss": 0.030750583857297897,
+ "routers_loss": 0.030693158507347107,
"skip_count": 2.0,
"step": 768,
"text_loss": 0.1583656519651413
@@ -7313,13 +7313,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000998267946391236,
- "loss": 0.052,
+ "loss": 0.051,
"macro_f1": 0.3333333432674408,
"num_tokens": 1244661.0,
"repeat_count": 0.0,
- "routers_loss": 0.010202950797975063,
+ "routers_loss": 0.01211300864815712,
"skip_count": 0.0,
"step": 770,
"text_loss": 0.4629349112510681
@@ -7332,13 +7332,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0009982421103605238,
- "loss": 0.0434,
+ "loss": 0.0441,
"macro_f1": 0.32098764181137085,
"num_tokens": 1248688.0,
"repeat_count": 0.0,
- "routers_loss": 0.07364192605018616,
+ "routers_loss": 0.0665968507528305,
"skip_count": 2.0,
"step": 772,
"text_loss": 0.4019293785095215
@@ -7353,11 +7353,11 @@
"f1_skip": 0.0,
"grad_norm": 0.2890625,
"learning_rate": 0.000998216083402495,
- "loss": 0.0606,
+ "loss": 0.0613,
"macro_f1": 0.32098764181137085,
"num_tokens": 1251395.0,
"repeat_count": 0.0,
- "routers_loss": 0.06553081423044205,
+ "routers_loss": 0.07186859846115112,
"skip_count": 2.0,
"step": 774,
"text_loss": 0.4659276604652405
@@ -7370,13 +7370,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.302734375,
"learning_rate": 0.0009981898655271235,
- "loss": 0.0475,
+ "loss": 0.0488,
"macro_f1": 0.3333333432674408,
"num_tokens": 1254888.0,
"repeat_count": 0.0,
- "routers_loss": 0.008751659654080868,
+ "routers_loss": 0.007823926396667957,
"skip_count": 0.0,
"step": 776,
"text_loss": 0.5160359740257263
@@ -7389,13 +7389,13 @@
"f1_execute": 0.9130434989929199,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.11962890625,
"learning_rate": 0.0009981634567444557,
- "loss": 0.0777,
+ "loss": 0.0775,
"macro_f1": 0.590062141418457,
"num_tokens": 1258250.0,
"repeat_count": 3.0,
- "routers_loss": 0.24522721767425537,
+ "routers_loss": 0.24624499678611755,
"skip_count": 4.0,
"step": 778,
"text_loss": 0.29319918155670166
@@ -7408,13 +7408,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.0009981368570646115,
"loss": 0.0885,
"macro_f1": 0.3272727429866791,
"num_tokens": 1260916.0,
"repeat_count": 0.0,
- "routers_loss": 0.03767623379826546,
+ "routers_loss": 0.030730176717042923,
"skip_count": 1.0,
"step": 780,
"text_loss": 0.624981164932251
@@ -7427,13 +7427,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009981100664977838,
- "loss": 0.0708,
+ "loss": 0.0699,
"macro_f1": 0.3333333432674408,
"num_tokens": 1264004.0,
"repeat_count": 0.0,
- "routers_loss": 0.006098059006035328,
+ "routers_loss": 0.006829176563769579,
"skip_count": 0.0,
"step": 782,
"text_loss": 0.6137266159057617
@@ -7446,13 +7446,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009980830850542391,
- "loss": 0.0589,
+ "loss": 0.058,
"macro_f1": 0.3333333432674408,
"num_tokens": 1267130.0,
"repeat_count": 0.0,
- "routers_loss": 0.01731623336672783,
+ "routers_loss": 0.018471000716090202,
"skip_count": 0.0,
"step": 784,
"text_loss": 0.15213175117969513
@@ -7465,13 +7465,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2294921875,
+ "grad_norm": 0.2353515625,
"learning_rate": 0.0009980559127443166,
- "loss": 0.0526,
+ "loss": 0.052,
"macro_f1": 0.3333333432674408,
"num_tokens": 1271129.0,
"repeat_count": 0.0,
- "routers_loss": 0.0076471962966024876,
+ "routers_loss": 0.007903140969574451,
"skip_count": 0.0,
"step": 786,
"text_loss": 0.5768613219261169
@@ -7484,13 +7484,13 @@
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.130859375,
"learning_rate": 0.000998028549578429,
- "loss": 0.0745,
+ "loss": 0.0719,
"macro_f1": 0.307692289352417,
"num_tokens": 1274232.0,
"repeat_count": 0.0,
- "routers_loss": 0.0637628585100174,
+ "routers_loss": 0.06737866252660751,
"skip_count": 3.0,
"step": 788,
"text_loss": 0.2877073585987091
@@ -7503,13 +7503,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009980009955670615,
- "loss": 0.0699,
+ "loss": 0.0698,
"macro_f1": 0.3144654333591461,
"num_tokens": 1277193.0,
"repeat_count": 0.0,
- "routers_loss": 0.10882514715194702,
+ "routers_loss": 0.10194934904575348,
"skip_count": 3.0,
"step": 790,
"text_loss": 0.11860492825508118
@@ -7522,13 +7522,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.126953125,
"learning_rate": 0.000997973250720773,
- "loss": 0.056,
+ "loss": 0.0552,
"macro_f1": 0.32098764181137085,
"num_tokens": 1280960.0,
"repeat_count": 0.0,
- "routers_loss": 0.10924118757247925,
+ "routers_loss": 0.10297708213329315,
"skip_count": 2.0,
"step": 792,
"text_loss": 0.13477706909179688
@@ -7541,13 +7541,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009979453150501954,
- "loss": 0.0664,
+ "loss": 0.0663,
"macro_f1": 0.32098764181137085,
"num_tokens": 1284611.0,
"repeat_count": 1.0,
- "routers_loss": 0.06571807712316513,
+ "routers_loss": 0.06122037023305893,
"skip_count": 1.0,
"step": 794,
"text_loss": 0.40569379925727844
@@ -7560,13 +7560,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1181640625,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.000997917188566034,
- "loss": 0.0616,
+ "loss": 0.062,
"macro_f1": 0.32098764181137085,
"num_tokens": 1287834.0,
"repeat_count": 0.0,
- "routers_loss": 0.058966971933841705,
+ "routers_loss": 0.061135001480579376,
"skip_count": 2.0,
"step": 796,
"text_loss": 0.2829287648200989
@@ -7579,32 +7579,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.109375,
"learning_rate": 0.0009978888712790664,
- "loss": 0.067,
+ "loss": 0.0654,
"macro_f1": 0.3272727429866791,
"num_tokens": 1291666.0,
"repeat_count": 0.0,
- "routers_loss": 0.04844636470079422,
+ "routers_loss": 0.04841872677206993,
"skip_count": 1.0,
"step": 798,
"text_loss": 1.011757254600525
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.4000000059604645,
- "avg_layers": 26.0,
+ "acc_skip": 0.20000000298023224,
+ "avg_layers": 27.0,
"epoch": 3.756090402113296,
- "f1_execute": 0.9166666865348816,
+ "f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
- "f1_skip": 0.5714285969734192,
- "grad_norm": 0.1416015625,
+ "f1_skip": 0.3333333134651184,
+ "grad_norm": 0.14453125,
"learning_rate": 0.0009978603632001444,
- "loss": 0.0634,
- "macro_f1": 0.4960317611694336,
+ "loss": 0.0636,
+ "macro_f1": 0.4104308485984802,
"num_tokens": 1294627.0,
"repeat_count": 1.0,
- "routers_loss": 0.1591777801513672,
+ "routers_loss": 0.15698759257793427,
"skip_count": 5.0,
"step": 800,
"text_loss": 0.4457623362541199
@@ -7617,13 +7617,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2734375,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0009978316643401916,
- "loss": 0.0694,
+ "loss": 0.0688,
"macro_f1": 0.3333333432674408,
"num_tokens": 1297711.0,
"repeat_count": 0.0,
- "routers_loss": 0.017735568806529045,
+ "routers_loss": 0.018952010199427605,
"skip_count": 0.0,
"step": 802,
"text_loss": 0.2069481462240219
@@ -7636,13 +7636,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.14453125,
"learning_rate": 0.0009978027747102062,
- "loss": 0.0477,
+ "loss": 0.0479,
"macro_f1": 0.3333333432674408,
"num_tokens": 1300569.0,
"repeat_count": 0.0,
- "routers_loss": 0.012401525862514973,
+ "routers_loss": 0.014538386836647987,
"skip_count": 0.0,
"step": 804,
"text_loss": 0.4983852505683899
@@ -7655,13 +7655,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2080078125,
+ "grad_norm": 0.2109375,
"learning_rate": 0.0009977736943212584,
- "loss": 0.0735,
+ "loss": 0.0721,
"macro_f1": 0.32098764181137085,
"num_tokens": 1303969.0,
"repeat_count": 0.0,
- "routers_loss": 0.10736164450645447,
+ "routers_loss": 0.11164087057113647,
"skip_count": 2.0,
"step": 806,
"text_loss": 0.2910642921924591
@@ -7674,13 +7674,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2001953125,
+ "grad_norm": 0.1826171875,
"learning_rate": 0.000997744423184492,
- "loss": 0.0428,
+ "loss": 0.0424,
"macro_f1": 0.3272727429866791,
"num_tokens": 1307263.0,
"repeat_count": 0.0,
- "routers_loss": 0.0595436617732048,
+ "routers_loss": 0.06073406711220741,
"skip_count": 1.0,
"step": 808,
"text_loss": 0.18831779062747955
@@ -7693,13 +7693,13 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.26171875,
"learning_rate": 0.0009977149613111236,
- "loss": 0.0494,
+ "loss": 0.0486,
"macro_f1": 0.4400000274181366,
"num_tokens": 1309953.0,
"repeat_count": 1.0,
- "routers_loss": 0.12617000937461853,
+ "routers_loss": 0.11035524308681488,
"skip_count": 4.0,
"step": 810,
"text_loss": 0.7872759699821472
@@ -7712,13 +7712,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1669921875,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009976853087124433,
- "loss": 0.0537,
+ "loss": 0.0536,
"macro_f1": 0.3333333432674408,
"num_tokens": 1313243.0,
"repeat_count": 0.0,
- "routers_loss": 0.021242506802082062,
+ "routers_loss": 0.021804286167025566,
"skip_count": 0.0,
"step": 812,
"text_loss": 0.22349292039871216
@@ -7731,13 +7731,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.318359375,
+ "grad_norm": 0.28125,
"learning_rate": 0.0009976554653998138,
- "loss": 0.0617,
+ "loss": 0.0612,
"macro_f1": 0.31446540355682373,
"num_tokens": 1316165.0,
"repeat_count": 0.0,
- "routers_loss": 0.10387415438890457,
+ "routers_loss": 0.10715524107217789,
"skip_count": 2.0,
"step": 814,
"text_loss": 0.18035532534122467
@@ -7750,13 +7750,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.000997625431384671,
- "loss": 0.0565,
+ "loss": 0.0564,
"macro_f1": 0.3333333432674408,
"num_tokens": 1319206.0,
"repeat_count": 0.0,
- "routers_loss": 0.007816939614713192,
+ "routers_loss": 0.007173649035394192,
"skip_count": 0.0,
"step": 816,
"text_loss": 0.48928648233413696
@@ -7769,13 +7769,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.0009975952066785243,
- "loss": 0.0654,
+ "loss": 0.0655,
"macro_f1": 0.3006536364555359,
"num_tokens": 1322549.0,
"repeat_count": 1.0,
- "routers_loss": 0.22526368498802185,
+ "routers_loss": 0.22308112680912018,
"skip_count": 4.0,
"step": 818,
"text_loss": 0.5211259722709656
@@ -7788,13 +7788,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.0009975647912929557,
- "loss": 0.056,
+ "loss": 0.0564,
"macro_f1": 0.3333333432674408,
"num_tokens": 1325213.0,
"repeat_count": 0.0,
- "routers_loss": 0.010998851619660854,
+ "routers_loss": 0.00998698640614748,
"skip_count": 0.0,
"step": 820,
"text_loss": 0.7117052674293518
@@ -7807,13 +7807,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.15234375,
"learning_rate": 0.0009975341852396205,
- "loss": 0.0712,
+ "loss": 0.0723,
"macro_f1": 0.32098764181137085,
"num_tokens": 1328383.0,
"repeat_count": 0.0,
- "routers_loss": 0.07115054875612259,
+ "routers_loss": 0.07454588264226913,
"skip_count": 2.0,
"step": 822,
"text_loss": 0.34539610147476196
@@ -7826,13 +7826,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1552734375,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009975033885302469,
- "loss": 0.0611,
+ "loss": 0.0604,
"macro_f1": 0.3333333432674408,
"num_tokens": 1331406.0,
"repeat_count": 0.0,
- "routers_loss": 0.008062695153057575,
+ "routers_loss": 0.009157589636743069,
"skip_count": 0.0,
"step": 824,
"text_loss": 0.7484824657440186
@@ -7845,13 +7845,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.0009974724011766363,
- "loss": 0.0496,
+ "loss": 0.0474,
"macro_f1": 0.3272727429866791,
"num_tokens": 1334410.0,
"repeat_count": 1.0,
- "routers_loss": 0.16666285693645477,
+ "routers_loss": 0.17149391770362854,
"skip_count": 0.0,
"step": 826,
"text_loss": 0.5913820266723633
@@ -7864,13 +7864,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1708984375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009974412231906632,
- "loss": 0.0567,
+ "loss": 0.058,
"macro_f1": 0.32098764181137085,
"num_tokens": 1337653.0,
"repeat_count": 1.0,
- "routers_loss": 0.0908689796924591,
+ "routers_loss": 0.09743282198905945,
"skip_count": 1.0,
"step": 828,
"text_loss": 0.2505693733692169
@@ -7883,13 +7883,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16015625,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009974098545842748,
- "loss": 0.0648,
+ "loss": 0.0638,
"macro_f1": 0.3272727429866791,
"num_tokens": 1340860.0,
"repeat_count": 0.0,
- "routers_loss": 0.04364728182554245,
+ "routers_loss": 0.041490405797958374,
"skip_count": 1.0,
"step": 830,
"text_loss": 0.5585370063781738
@@ -7897,18 +7897,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 3.906369239800411,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2060546875,
+ "grad_norm": 0.193359375,
"learning_rate": 0.0009973782953694918,
- "loss": 0.0772,
- "macro_f1": 0.3076923191547394,
+ "loss": 0.0746,
+ "macro_f1": 0.3006536066532135,
"num_tokens": 1344232.0,
"repeat_count": 1.0,
- "routers_loss": 0.15315109491348267,
+ "routers_loss": 0.16080693900585175,
"skip_count": 3.0,
"step": 832,
"text_loss": 0.4782734513282776
@@ -7921,13 +7921,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.000997346545558408,
- "loss": 0.0527,
+ "loss": 0.0522,
"macro_f1": 0.3333333432674408,
"num_tokens": 1347667.0,
"repeat_count": 0.0,
- "routers_loss": 0.01342768594622612,
+ "routers_loss": 0.01173500344157219,
"skip_count": 0.0,
"step": 834,
"text_loss": 0.25036177039146423
@@ -7940,13 +7940,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1748046875,
+ "grad_norm": 0.173828125,
"learning_rate": 0.0009973146051631895,
- "loss": 0.0513,
+ "loss": 0.0522,
"macro_f1": 0.3333333432674408,
"num_tokens": 1350707.0,
"repeat_count": 0.0,
- "routers_loss": 0.01158806961029768,
+ "routers_loss": 0.011477196589112282,
"skip_count": 0.0,
"step": 836,
"text_loss": 0.5482863187789917
@@ -7959,13 +7959,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1650390625,
"learning_rate": 0.0009972824741960764,
- "loss": 0.0549,
+ "loss": 0.0536,
"macro_f1": 0.3333333432674408,
"num_tokens": 1353704.0,
"repeat_count": 0.0,
- "routers_loss": 0.01255605649203062,
+ "routers_loss": 0.010528896935284138,
"skip_count": 0.0,
"step": 838,
"text_loss": 0.6732596158981323
@@ -7978,13 +7978,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12255859375,
+ "grad_norm": 0.1181640625,
"learning_rate": 0.000997250152669381,
- "loss": 0.0578,
+ "loss": 0.0573,
"macro_f1": 0.3333333432674408,
"num_tokens": 1356608.0,
"repeat_count": 0.0,
- "routers_loss": 0.010225459933280945,
+ "routers_loss": 0.010678744874894619,
"skip_count": 0.0,
"step": 840,
"text_loss": 0.5479338765144348
@@ -7997,13 +7997,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.181640625,
"learning_rate": 0.000997217640595489,
- "loss": 0.0633,
+ "loss": 0.0631,
"macro_f1": 0.3333333432674408,
"num_tokens": 1359809.0,
"repeat_count": 0.0,
- "routers_loss": 0.007837744429707527,
+ "routers_loss": 0.00835978239774704,
"skip_count": 0.0,
"step": 842,
"text_loss": 0.42543259263038635
@@ -8016,13 +8016,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.0009971849379868593,
- "loss": 0.0674,
+ "loss": 0.0653,
"macro_f1": 0.3333333432674408,
"num_tokens": 1362201.0,
"repeat_count": 0.0,
- "routers_loss": 0.008631376549601555,
+ "routers_loss": 0.009930923581123352,
"skip_count": 0.0,
"step": 844,
"text_loss": 0.720462441444397
@@ -8035,13 +8035,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009971520448560235,
- "loss": 0.0612,
+ "loss": 0.0615,
"macro_f1": 0.3272727429866791,
"num_tokens": 1365790.0,
"repeat_count": 0.0,
- "routers_loss": 0.06206027418375015,
+ "routers_loss": 0.06344373524188995,
"skip_count": 1.0,
"step": 846,
"text_loss": 0.8423607349395752
@@ -8049,18 +8049,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 3.9815086586439685,
- "f1_execute": 0.9411765336990356,
+ "f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
- "f1_skip": 0.5,
- "grad_norm": 0.16015625,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.16796875,
"learning_rate": 0.000997118961215586,
- "loss": 0.0678,
- "macro_f1": 0.480392187833786,
+ "loss": 0.0674,
+ "macro_f1": 0.4533333480358124,
"num_tokens": 1368387.0,
"repeat_count": 1.0,
- "routers_loss": 0.1463794708251953,
+ "routers_loss": 0.14688406884670258,
"skip_count": 3.0,
"step": 848,
"text_loss": 0.3933577537536621
@@ -8073,13 +8073,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2451171875,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000997085687078225,
- "loss": 0.052,
+ "loss": 0.0518,
"macro_f1": 0.3333333432674408,
"num_tokens": 1371189.0,
"repeat_count": 0.0,
- "routers_loss": 0.01140492781996727,
+ "routers_loss": 0.009953443892300129,
"skip_count": 0.0,
"step": 850,
"text_loss": 0.41469162702560425
@@ -8092,13 +8092,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.15625,
"learning_rate": 0.0009970522224566909,
- "loss": 0.0563,
+ "loss": 0.0555,
"macro_f1": 0.32098767161369324,
"num_tokens": 1374008.0,
"repeat_count": 0.0,
- "routers_loss": 0.05136030167341232,
+ "routers_loss": 0.048870690166950226,
"skip_count": 1.0,
"step": 852,
"text_loss": 0.613615870475769
@@ -8111,32 +8111,32 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.283203125,
"learning_rate": 0.0009970185673638075,
- "loss": 0.0627,
+ "loss": 0.0629,
"macro_f1": 0.32098764181137085,
"num_tokens": 1376662.0,
"repeat_count": 1.0,
- "routers_loss": 0.07274381071329117,
+ "routers_loss": 0.06865929812192917,
"skip_count": 1.0,
"step": 854,
"text_loss": 0.4392736256122589
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 4.01878485471089,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.162109375,
"learning_rate": 0.0009969847218124716,
- "loss": 0.0503,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0506,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1380049.0,
"repeat_count": 0.0,
- "routers_loss": 0.024335317313671112,
+ "routers_loss": 0.02382219396531582,
"skip_count": 1.0,
"step": 856,
"text_loss": 0.19115346670150757
@@ -8149,13 +8149,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.240234375,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009969506858156527,
- "loss": 0.0359,
+ "loss": 0.0344,
"macro_f1": 0.3272727429866791,
"num_tokens": 1383008.0,
"repeat_count": 0.0,
- "routers_loss": 0.046614740043878555,
+ "routers_loss": 0.03907281160354614,
"skip_count": 1.0,
"step": 858,
"text_loss": 0.34842637181282043
@@ -8168,13 +8168,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.12060546875,
"learning_rate": 0.0009969164593863935,
- "loss": 0.0372,
+ "loss": 0.0365,
"macro_f1": 0.3333333432674408,
"num_tokens": 1387051.0,
"repeat_count": 0.0,
- "routers_loss": 0.006380240898579359,
+ "routers_loss": 0.007645803038030863,
"skip_count": 0.0,
"step": 860,
"text_loss": 0.3810436725616455
@@ -8187,13 +8187,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009968820425378098,
- "loss": 0.0473,
+ "loss": 0.0463,
"macro_f1": 0.3272727429866791,
"num_tokens": 1390244.0,
"repeat_count": 1.0,
- "routers_loss": 0.04770716652274132,
+ "routers_loss": 0.04435238987207413,
"skip_count": 0.0,
"step": 862,
"text_loss": 0.34853485226631165
@@ -8206,32 +8206,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.3359375,
+ "grad_norm": 0.28515625,
"learning_rate": 0.00099684743528309,
- "loss": 0.0434,
+ "loss": 0.0424,
"macro_f1": 0.3333333432674408,
"num_tokens": 1392976.0,
"repeat_count": 0.0,
- "routers_loss": 0.006983708590269089,
+ "routers_loss": 0.006071661598980427,
"skip_count": 0.0,
"step": 864,
"text_loss": 0.6395178437232971
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 4.065746991488113,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.080078125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.0009968126376354958,
- "loss": 0.0476,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0477,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1396061.0,
"repeat_count": 0.0,
- "routers_loss": 0.046313900500535965,
+ "routers_loss": 0.05011235550045967,
"skip_count": 2.0,
"step": 866,
"text_loss": 0.09103966504335403
@@ -8244,32 +8244,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009967776496083616,
"loss": 0.0509,
"macro_f1": 0.3272727429866791,
"num_tokens": 1398993.0,
"repeat_count": 1.0,
- "routers_loss": 0.0401870422065258,
+ "routers_loss": 0.03979124873876572,
"skip_count": 0.0,
"step": 868,
"text_loss": 0.27257058024406433
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 4.084531846199002,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.14453125,
"learning_rate": 0.000996742471215095,
- "loss": 0.0505,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0516,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1402080.0,
"repeat_count": 0.0,
- "routers_loss": 0.03313451260328293,
+ "routers_loss": 0.030823837965726852,
"skip_count": 2.0,
"step": 870,
"text_loss": 0.7047103047370911
@@ -8282,13 +8282,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009967071024691763,
- "loss": 0.0468,
+ "loss": 0.0461,
"macro_f1": 0.3333333432674408,
"num_tokens": 1404890.0,
"repeat_count": 0.0,
- "routers_loss": 0.010118982754647732,
+ "routers_loss": 0.009721715934574604,
"skip_count": 0.0,
"step": 872,
"text_loss": 0.959106981754303
@@ -8301,13 +8301,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.000996671543384159,
- "loss": 0.0498,
+ "loss": 0.05,
"macro_f1": 0.3333333432674408,
"num_tokens": 1407853.0,
"repeat_count": 0.0,
- "routers_loss": 0.005856200121343136,
+ "routers_loss": 0.006025883834809065,
"skip_count": 0.0,
"step": 874,
"text_loss": 0.47571972012519836
@@ -8320,13 +8320,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.09765625,
"learning_rate": 0.0009966357939736692,
- "loss": 0.0417,
+ "loss": 0.0416,
"macro_f1": 0.3272727429866791,
"num_tokens": 1410723.0,
"repeat_count": 0.0,
- "routers_loss": 0.02768322452902794,
+ "routers_loss": 0.025964925065636635,
"skip_count": 0.0,
"step": 876,
"text_loss": 0.4964611530303955
@@ -8339,13 +8339,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1025390625,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009965998542514065,
- "loss": 0.0419,
+ "loss": 0.0415,
"macro_f1": 0.32098764181137085,
"num_tokens": 1414008.0,
"repeat_count": 0.0,
- "routers_loss": 0.09382032603025436,
+ "routers_loss": 0.09509637206792831,
"skip_count": 2.0,
"step": 878,
"text_loss": 0.621494710445404
@@ -8358,32 +8358,32 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009965637242311427,
- "loss": 0.0466,
+ "loss": 0.0472,
"macro_f1": 0.542222261428833,
"num_tokens": 1417447.0,
"repeat_count": 0.0,
- "routers_loss": 0.026867631822824478,
+ "routers_loss": 0.02520318515598774,
"skip_count": 4.0,
"step": 880,
"text_loss": 0.40209758281707764
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6666666865348816,
- "avg_layers": 24.0,
+ "acc_skip": 0.5,
+ "avg_layers": 25.0,
"epoch": 4.14088641033167,
- "f1_execute": 0.95652174949646,
+ "f1_execute": 0.936170220375061,
"f1_repeat": 0.0,
- "f1_skip": 0.800000011920929,
- "grad_norm": 0.26171875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.263671875,
"learning_rate": 0.000996527403926723,
- "loss": 0.0496,
- "macro_f1": 0.5855072736740112,
+ "loss": 0.0495,
+ "macro_f1": 0.5342789888381958,
"num_tokens": 1419905.0,
"repeat_count": 0.0,
- "routers_loss": 0.12731307744979858,
+ "routers_loss": 0.13183781504631042,
"skip_count": 6.0,
"step": 882,
"text_loss": 0.642185389995575
@@ -8396,13 +8396,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009964908933520655,
- "loss": 0.039,
+ "loss": 0.0375,
"macro_f1": 0.3333333432674408,
"num_tokens": 1423436.0,
"repeat_count": 0.0,
- "routers_loss": 0.008483970537781715,
+ "routers_loss": 0.009429510682821274,
"skip_count": 0.0,
"step": 884,
"text_loss": 0.48232755064964294
@@ -8415,13 +8415,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.18359375,
+ "grad_norm": 0.1669921875,
"learning_rate": 0.0009964541925211613,
- "loss": 0.0348,
+ "loss": 0.0349,
"macro_f1": 0.32098764181137085,
"num_tokens": 1426842.0,
"repeat_count": 0.0,
- "routers_loss": 0.07847871631383896,
+ "routers_loss": 0.07629609107971191,
"skip_count": 2.0,
"step": 886,
"text_loss": 0.16620934009552002
@@ -8434,13 +8434,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0009964173014480738,
- "loss": 0.036,
+ "loss": 0.0348,
"macro_f1": 0.5492662787437439,
"num_tokens": 1430430.0,
"repeat_count": 0.0,
- "routers_loss": 0.04574459046125412,
+ "routers_loss": 0.036814019083976746,
"skip_count": 2.0,
"step": 888,
"text_loss": 0.4866008758544922
@@ -8453,13 +8453,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10595703125,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009963802201469398,
- "loss": 0.0485,
+ "loss": 0.0476,
"macro_f1": 0.3333333432674408,
"num_tokens": 1433821.0,
"repeat_count": 0.0,
- "routers_loss": 0.004683624487370253,
+ "routers_loss": 0.0041250260546803474,
"skip_count": 0.0,
"step": 890,
"text_loss": 0.578216552734375
@@ -8472,13 +8472,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2158203125,
+ "grad_norm": 0.2373046875,
"learning_rate": 0.0009963429486319693,
- "loss": 0.0476,
+ "loss": 0.0463,
"macro_f1": 0.32098764181137085,
"num_tokens": 1436976.0,
"repeat_count": 0.0,
- "routers_loss": 0.06499828398227692,
+ "routers_loss": 0.06213559955358505,
"skip_count": 2.0,
"step": 892,
"text_loss": 0.221701517701149
@@ -8486,18 +8486,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
- "avg_layers": 25.0,
+ "avg_layers": 26.0,
"epoch": 4.197240974464338,
- "f1_execute": 0.9411764740943909,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.4000000059604645,
- "grad_norm": 0.310546875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.361328125,
"learning_rate": 0.0009963054869174446,
- "loss": 0.0326,
- "macro_f1": 0.44705885648727417,
+ "loss": 0.0313,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 1440397.0,
"repeat_count": 0.0,
- "routers_loss": 0.08285653591156006,
+ "routers_loss": 0.07532428950071335,
"skip_count": 2.0,
"step": 894,
"text_loss": 0.6922838091850281
@@ -8510,13 +8510,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.0009962678350177209,
- "loss": 0.0497,
+ "loss": 0.0472,
"macro_f1": 0.3272727429866791,
"num_tokens": 1443604.0,
"repeat_count": 0.0,
- "routers_loss": 0.04252336546778679,
+ "routers_loss": 0.0419243648648262,
"skip_count": 1.0,
"step": 896,
"text_loss": 0.22092342376708984
@@ -8524,18 +8524,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 4.216025829175227,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009962299929472268,
- "loss": 0.0349,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.034,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 1446257.0,
"repeat_count": 2.0,
- "routers_loss": 0.126711905002594,
+ "routers_loss": 0.10849297791719437,
"skip_count": 0.0,
"step": 898,
"text_loss": 0.26394811272621155
@@ -8548,13 +8548,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.000996191960720463,
- "loss": 0.0392,
+ "loss": 0.0394,
"macro_f1": 0.3333333432674408,
"num_tokens": 1449669.0,
"repeat_count": 0.0,
- "routers_loss": 0.00955706462264061,
+ "routers_loss": 0.0092767970636487,
"skip_count": 0.0,
"step": 900,
"text_loss": 0.5338577628135681
@@ -8567,13 +8567,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009961537383520042,
- "loss": 0.0377,
+ "loss": 0.0354,
"macro_f1": 0.3272727429866791,
"num_tokens": 1452450.0,
"repeat_count": 1.0,
- "routers_loss": 0.03127318620681763,
+ "routers_loss": 0.02985367365181446,
"skip_count": 0.0,
"step": 902,
"text_loss": 0.5875228047370911
@@ -8586,13 +8586,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.0009961153258564966,
- "loss": 0.0389,
+ "loss": 0.0378,
"macro_f1": 0.3144654333591461,
"num_tokens": 1456909.0,
"repeat_count": 0.0,
- "routers_loss": 0.06743519753217697,
+ "routers_loss": 0.06794842332601547,
"skip_count": 3.0,
"step": 904,
"text_loss": 0.40959444642066956
@@ -8605,13 +8605,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009960767232486604,
- "loss": 0.0477,
+ "loss": 0.0476,
"macro_f1": 0.3333333432674408,
"num_tokens": 1461712.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025313226506114006,
+ "routers_loss": 0.0023562447167932987,
"skip_count": 0.0,
"step": 906,
"text_loss": 0.3932875096797943
@@ -8624,13 +8624,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.08203125,
"learning_rate": 0.000996037930543288,
- "loss": 0.052,
+ "loss": 0.0505,
"macro_f1": 0.3272727429866791,
"num_tokens": 1464817.0,
"repeat_count": 0.0,
- "routers_loss": 0.037147488445043564,
+ "routers_loss": 0.03880339860916138,
"skip_count": 1.0,
"step": 908,
"text_loss": 0.17482402920722961
@@ -8643,13 +8643,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.234375,
+ "grad_norm": 0.2119140625,
"learning_rate": 0.000995998947755245,
- "loss": 0.0501,
+ "loss": 0.0479,
"macro_f1": 0.3272727429866791,
"num_tokens": 1467810.0,
"repeat_count": 0.0,
- "routers_loss": 0.021232586354017258,
+ "routers_loss": 0.01736828312277794,
"skip_count": 1.0,
"step": 910,
"text_loss": 0.4140470325946808
@@ -8662,13 +8662,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009959597748994695,
- "loss": 0.0759,
+ "loss": 0.0752,
"macro_f1": 0.3333333432674408,
"num_tokens": 1470802.0,
"repeat_count": 0.0,
- "routers_loss": 0.010563847608864307,
+ "routers_loss": 0.011824851855635643,
"skip_count": 0.0,
"step": 912,
"text_loss": 0.7153383493423462
@@ -8681,13 +8681,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009959204119909726,
- "loss": 0.0425,
+ "loss": 0.0421,
"macro_f1": 0.3272727429866791,
"num_tokens": 1474539.0,
"repeat_count": 0.0,
- "routers_loss": 0.0267612524330616,
+ "routers_loss": 0.025456594303250313,
"skip_count": 0.0,
"step": 914,
"text_loss": 0.42812058329582214
@@ -8700,13 +8700,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009958808590448385,
- "loss": 0.0501,
+ "loss": 0.0489,
"macro_f1": 0.3333333432674408,
"num_tokens": 1477552.0,
"repeat_count": 0.0,
- "routers_loss": 0.005838244222104549,
+ "routers_loss": 0.006795851048082113,
"skip_count": 0.0,
"step": 916,
"text_loss": 0.5402814149856567
@@ -8719,13 +8719,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009958411160762234,
- "loss": 0.0383,
+ "loss": 0.039,
"macro_f1": 0.3333333432674408,
"num_tokens": 1482547.0,
"repeat_count": 0.0,
- "routers_loss": 0.014642171561717987,
+ "routers_loss": 0.015615932643413544,
"skip_count": 0.0,
"step": 918,
"text_loss": 0.3836168050765991
@@ -8738,32 +8738,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009958011831003577,
- "loss": 0.0457,
+ "loss": 0.0448,
"macro_f1": 0.3272727429866791,
"num_tokens": 1485807.0,
"repeat_count": 0.0,
- "routers_loss": 0.04119620472192764,
+ "routers_loss": 0.043541423976421356,
"skip_count": 1.0,
"step": 920,
"text_loss": 0.4333936274051666
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 4.328734957440563,
- "f1_execute": 0.943396270275116,
- "f1_repeat": 0.0,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.000995761060132543,
- "loss": 0.0433,
- "macro_f1": 0.3144654333591461,
+ "loss": 0.0418,
+ "macro_f1": 0.6538461446762085,
"num_tokens": 1488941.0,
"repeat_count": 1.0,
- "routers_loss": 0.06713195145130157,
+ "routers_loss": 0.05866432189941406,
"skip_count": 2.0,
"step": 922,
"text_loss": 0.4106994867324829
@@ -8776,13 +8776,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009957207471881552,
- "loss": 0.0533,
+ "loss": 0.0531,
"macro_f1": 0.5492662787437439,
"num_tokens": 1492026.0,
"repeat_count": 0.0,
- "routers_loss": 0.024023180827498436,
+ "routers_loss": 0.02714901603758335,
"skip_count": 2.0,
"step": 924,
"text_loss": 0.542091429233551
@@ -8795,13 +8795,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.17578125,
+ "grad_norm": 0.1796875,
"learning_rate": 0.0009956802442826415,
- "loss": 0.0373,
+ "loss": 0.0386,
"macro_f1": 0.3272727429866791,
"num_tokens": 1494543.0,
"repeat_count": 1.0,
- "routers_loss": 0.05399841442704201,
+ "routers_loss": 0.0563737191259861,
"skip_count": 0.0,
"step": 926,
"text_loss": 0.47209203243255615
@@ -8814,13 +8814,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009956395514315235,
- "loss": 0.0488,
+ "loss": 0.0496,
"macro_f1": 0.3272727429866791,
"num_tokens": 1497831.0,
"repeat_count": 1.0,
- "routers_loss": 0.0299264844506979,
+ "routers_loss": 0.03285066783428192,
"skip_count": 0.0,
"step": 928,
"text_loss": 0.6628931164741516
@@ -8833,13 +8833,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009955986686503943,
- "loss": 0.0467,
+ "loss": 0.0466,
"macro_f1": 0.3272727429866791,
"num_tokens": 1501375.0,
"repeat_count": 0.0,
- "routers_loss": 0.023478010669350624,
+ "routers_loss": 0.024297121912240982,
"skip_count": 1.0,
"step": 930,
"text_loss": 0.495676189661026
@@ -8852,13 +8852,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 1.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009955575959549202,
- "loss": 0.0447,
+ "loss": 0.0424,
"macro_f1": 0.7795917987823486,
"num_tokens": 1504363.0,
"repeat_count": 1.0,
- "routers_loss": 0.12116194516420364,
+ "routers_loss": 0.12196464836597443,
"skip_count": 4.0,
"step": 932,
"text_loss": 0.26123273372650146
@@ -8871,13 +8871,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.0009955163333608408,
- "loss": 0.053,
+ "loss": 0.0538,
"macro_f1": 0.3333333432674408,
"num_tokens": 1507178.0,
"repeat_count": 0.0,
- "routers_loss": 0.011879723519086838,
+ "routers_loss": 0.012947078794240952,
"skip_count": 0.0,
"step": 934,
"text_loss": 0.32552677392959595
@@ -8890,13 +8890,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009954748808839674,
- "loss": 0.0373,
+ "loss": 0.0379,
"macro_f1": 0.3333333432674408,
"num_tokens": 1509910.0,
"repeat_count": 0.0,
- "routers_loss": 0.009245929308235645,
+ "routers_loss": 0.008946365676820278,
"skip_count": 0.0,
"step": 936,
"text_loss": 0.533141016960144
@@ -8909,13 +8909,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.140625,
"learning_rate": 0.000995433238540185,
- "loss": 0.0461,
+ "loss": 0.0466,
"macro_f1": 0.6538461446762085,
"num_tokens": 1512826.0,
"repeat_count": 1.0,
- "routers_loss": 0.032464127987623215,
+ "routers_loss": 0.029975678771734238,
"skip_count": 1.0,
"step": 938,
"text_loss": 0.2953577935695648
@@ -8928,13 +8928,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009953914063454512,
- "loss": 0.0515,
+ "loss": 0.0497,
"macro_f1": 0.3144654333591461,
"num_tokens": 1517230.0,
"repeat_count": 1.0,
- "routers_loss": 0.08835392445325851,
+ "routers_loss": 0.0889134630560875,
"skip_count": 2.0,
"step": 940,
"text_loss": 0.5368834733963013
@@ -8947,13 +8947,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.193359375,
"learning_rate": 0.000995349384315796,
- "loss": 0.0405,
+ "loss": 0.0413,
"macro_f1": 0.3333333432674408,
"num_tokens": 1519876.0,
"repeat_count": 0.0,
- "routers_loss": 0.014307246543467045,
+ "routers_loss": 0.013458753935992718,
"skip_count": 0.0,
"step": 942,
"text_loss": 0.2005518227815628
@@ -8966,13 +8966,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.000995307172467322,
- "loss": 0.0449,
+ "loss": 0.0444,
"macro_f1": 0.31446540355682373,
"num_tokens": 1522998.0,
"repeat_count": 1.0,
- "routers_loss": 0.10261563211679459,
+ "routers_loss": 0.08850377053022385,
"skip_count": 1.0,
"step": 944,
"text_loss": 0.227926567196846
@@ -8985,13 +8985,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009952647708162054,
- "loss": 0.0507,
+ "loss": 0.0503,
"macro_f1": 0.3272727429866791,
"num_tokens": 1527100.0,
"repeat_count": 0.0,
- "routers_loss": 0.03316422924399376,
+ "routers_loss": 0.03199794515967369,
"skip_count": 1.0,
"step": 946,
"text_loss": 0.4859686493873596
@@ -9004,13 +9004,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009952221793786942,
- "loss": 0.0352,
+ "loss": 0.0354,
"macro_f1": 0.3333333432674408,
"num_tokens": 1530028.0,
"repeat_count": 0.0,
- "routers_loss": 0.00902469176799059,
+ "routers_loss": 0.006507779937237501,
"skip_count": 0.0,
"step": 948,
"text_loss": 0.6855354905128479
@@ -9023,13 +9023,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.0009951793981711097,
- "loss": 0.0581,
+ "loss": 0.0584,
"macro_f1": 0.6538461446762085,
"num_tokens": 1533254.0,
"repeat_count": 1.0,
- "routers_loss": 0.06710167229175568,
+ "routers_loss": 0.06175103038549423,
"skip_count": 1.0,
"step": 950,
"text_loss": 0.7590400576591492
@@ -9042,13 +9042,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009951364272098458,
- "loss": 0.0294,
+ "loss": 0.0295,
"macro_f1": 0.5492662787437439,
"num_tokens": 1536239.0,
"repeat_count": 0.0,
- "routers_loss": 0.04208769276738167,
+ "routers_loss": 0.03773383051156998,
"skip_count": 2.0,
"step": 952,
"text_loss": 0.669784665107727
@@ -9061,13 +9061,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009950932665113688,
- "loss": 0.0505,
+ "loss": 0.0507,
"macro_f1": 0.32098764181137085,
"num_tokens": 1539682.0,
"repeat_count": 0.0,
- "routers_loss": 0.06530380249023438,
+ "routers_loss": 0.07280613481998444,
"skip_count": 2.0,
"step": 954,
"text_loss": 0.3365570902824402
@@ -9080,13 +9080,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009950499160922184,
- "loss": 0.0545,
+ "loss": 0.0541,
"macro_f1": 0.3333333432674408,
"num_tokens": 1542875.0,
"repeat_count": 0.0,
- "routers_loss": 0.01803453080356121,
+ "routers_loss": 0.01770266517996788,
"skip_count": 0.0,
"step": 956,
"text_loss": 0.0921545997262001
@@ -9099,13 +9099,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.09375,
"learning_rate": 0.000995006375969006,
- "loss": 0.0481,
+ "loss": 0.0473,
"macro_f1": 0.3272727429866791,
"num_tokens": 1547135.0,
"repeat_count": 1.0,
- "routers_loss": 0.08461762219667435,
+ "routers_loss": 0.07672002166509628,
"skip_count": 0.0,
"step": 958,
"text_loss": 0.5887606739997864
@@ -9120,11 +9120,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1376953125,
"learning_rate": 0.0009949626461584165,
- "loss": 0.0441,
+ "loss": 0.043,
"macro_f1": 0.3333333432674408,
"num_tokens": 1550100.0,
"repeat_count": 0.0,
- "routers_loss": 0.007111486047506332,
+ "routers_loss": 0.006247182376682758,
"skip_count": 0.0,
"step": 960,
"text_loss": 0.5777931213378906
@@ -9137,13 +9137,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.119140625,
"learning_rate": 0.0009949187266772076,
- "loss": 0.0361,
+ "loss": 0.0366,
"macro_f1": 0.5492662787437439,
"num_tokens": 1553192.0,
"repeat_count": 0.0,
- "routers_loss": 0.029776185750961304,
+ "routers_loss": 0.030319908633828163,
"skip_count": 2.0,
"step": 962,
"text_loss": 0.2370252162218094
@@ -9156,13 +9156,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009948746175422088,
- "loss": 0.0506,
+ "loss": 0.0511,
"macro_f1": 0.3333333432674408,
"num_tokens": 1556318.0,
"repeat_count": 0.0,
- "routers_loss": 0.007108999416232109,
+ "routers_loss": 0.006004320923238993,
"skip_count": 0.0,
"step": 964,
"text_loss": 0.6271032094955444
@@ -9175,13 +9175,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000994830318770323,
- "loss": 0.0498,
+ "loss": 0.0514,
"macro_f1": 0.3333333432674408,
"num_tokens": 1559195.0,
"repeat_count": 0.0,
- "routers_loss": 0.01126947533339262,
+ "routers_loss": 0.011544366367161274,
"skip_count": 0.0,
"step": 966,
"text_loss": 0.47256720066070557
@@ -9194,13 +9194,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009947858303785255,
- "loss": 0.0366,
+ "loss": 0.0374,
"macro_f1": 0.6603773832321167,
"num_tokens": 1561813.0,
"repeat_count": 1.0,
- "routers_loss": 0.05142999067902565,
+ "routers_loss": 0.05258861929178238,
"skip_count": 1.0,
"step": 968,
"text_loss": 0.7703132629394531
@@ -9213,13 +9213,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.0009947411523838648,
- "loss": 0.0461,
+ "loss": 0.0453,
"macro_f1": 0.3333333432674408,
"num_tokens": 1564634.0,
"repeat_count": 0.0,
- "routers_loss": 0.010770819149911404,
+ "routers_loss": 0.011216280050575733,
"skip_count": 0.0,
"step": 970,
"text_loss": 0.4666804075241089
@@ -9232,13 +9232,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0009946962848034608,
- "loss": 0.0692,
+ "loss": 0.0696,
"macro_f1": 0.3333333432674408,
"num_tokens": 1567959.0,
"repeat_count": 0.0,
- "routers_loss": 0.008775795809924603,
+ "routers_loss": 0.009387624450027943,
"skip_count": 0.0,
"step": 972,
"text_loss": 0.4067264199256897
@@ -9251,13 +9251,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.203125,
"learning_rate": 0.0009946512276545075,
- "loss": 0.0403,
+ "loss": 0.0397,
"macro_f1": 0.3272727429866791,
"num_tokens": 1571221.0,
"repeat_count": 1.0,
- "routers_loss": 0.05100395902991295,
+ "routers_loss": 0.041713520884513855,
"skip_count": 0.0,
"step": 974,
"text_loss": 0.5242366194725037
@@ -9270,13 +9270,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.25390625,
+ "grad_norm": 0.228515625,
"learning_rate": 0.0009946059809542705,
- "loss": 0.0503,
+ "loss": 0.0487,
"macro_f1": 0.7644445300102234,
"num_tokens": 1575033.0,
"repeat_count": 2.0,
- "routers_loss": 0.06653711199760437,
+ "routers_loss": 0.05748331546783447,
"skip_count": 2.0,
"step": 976,
"text_loss": 0.5704690217971802
@@ -9284,18 +9284,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 4.591722923393014,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0009945605447200887,
- "loss": 0.0435,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0445,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1579050.0,
"repeat_count": 0.0,
- "routers_loss": 0.009865665808320045,
+ "routers_loss": 0.016765203326940536,
"skip_count": 0.0,
"step": 978,
"text_loss": 0.4804173707962036
@@ -9308,13 +9308,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.0009945149189693732,
- "loss": 0.0399,
+ "loss": 0.0406,
"macro_f1": 0.5492662787437439,
"num_tokens": 1582967.0,
"repeat_count": 0.0,
- "routers_loss": 0.021175632253289223,
+ "routers_loss": 0.021518222987651825,
"skip_count": 2.0,
"step": 980,
"text_loss": 0.4138598144054413
@@ -9327,32 +9327,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11181640625,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0009944691037196078,
- "loss": 0.0472,
+ "loss": 0.0456,
"macro_f1": 0.3333333432674408,
"num_tokens": 1586282.0,
"repeat_count": 0.0,
- "routers_loss": 0.011803832836449146,
+ "routers_loss": 0.012246460653841496,
"skip_count": 0.0,
"step": 982,
"text_loss": 0.22561736404895782
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 0.5,
"acc_skip": 0.800000011920929,
- "avg_layers": 23.0,
+ "avg_layers": 24.0,
"epoch": 4.6199002054593485,
- "f1_execute": 0.9090908765792847,
- "f1_repeat": 0.0,
+ "f1_execute": 0.930232584476471,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 0.8000000715255737,
- "grad_norm": 0.142578125,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009944230989883491,
- "loss": 0.0467,
- "macro_f1": 0.5696970224380493,
+ "loss": 0.0456,
+ "macro_f1": 0.7989664077758789,
"num_tokens": 1589279.0,
"repeat_count": 2.0,
- "routers_loss": 0.08856551349163055,
+ "routers_loss": 0.09344895929098129,
"skip_count": 5.0,
"step": 984,
"text_loss": 0.4416656494140625
@@ -9365,13 +9365,13 @@
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.111328125,
"learning_rate": 0.0009943769047932264,
- "loss": 0.0413,
+ "loss": 0.0404,
"macro_f1": 0.5359477400779724,
"num_tokens": 1592398.0,
"repeat_count": 2.0,
- "routers_loss": 0.08593414723873138,
+ "routers_loss": 0.08916857838630676,
"skip_count": 2.0,
"step": 986,
"text_loss": 0.5536438822746277
@@ -9384,13 +9384,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.154296875,
+ "grad_norm": 0.15234375,
"learning_rate": 0.000994330521151941,
- "loss": 0.0399,
+ "loss": 0.039,
"macro_f1": 0.32098764181137085,
"num_tokens": 1596213.0,
"repeat_count": 1.0,
- "routers_loss": 0.07049509882926941,
+ "routers_loss": 0.06114347651600838,
"skip_count": 1.0,
"step": 988,
"text_loss": 0.5835405588150024
@@ -9403,13 +9403,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.205078125,
+ "grad_norm": 0.1953125,
"learning_rate": 0.000994283948082267,
- "loss": 0.0595,
+ "loss": 0.0573,
"macro_f1": 0.3333333432674408,
"num_tokens": 1598827.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019258069805800915,
+ "routers_loss": 0.0017335431184619665,
"skip_count": 0.0,
"step": 990,
"text_loss": 0.5857380032539368
@@ -9422,13 +9422,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009942371856020522,
- "loss": 0.0335,
+ "loss": 0.0341,
"macro_f1": 0.3333333432674408,
"num_tokens": 1602915.0,
"repeat_count": 0.0,
- "routers_loss": 0.014094089157879353,
+ "routers_loss": 0.014606470242142677,
"skip_count": 0.0,
"step": 992,
"text_loss": 0.6939892768859863
@@ -9436,18 +9436,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 31.0,
"epoch": 4.666862342236572,
- "f1_execute": 0.9583333134651184,
+ "f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009941902337292155,
- "loss": 0.0603,
- "macro_f1": 0.6527777910232544,
+ "loss": 0.06,
+ "macro_f1": 0.6598639488220215,
"num_tokens": 1605776.0,
"repeat_count": 3.0,
- "routers_loss": 0.06360147893428802,
+ "routers_loss": 0.06297315657138824,
"skip_count": 1.0,
"step": 994,
"text_loss": 0.37616831064224243
@@ -9460,13 +9460,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009941430924817487,
- "loss": 0.0573,
+ "loss": 0.0572,
"macro_f1": 0.5492662787437439,
"num_tokens": 1609856.0,
"repeat_count": 0.0,
- "routers_loss": 0.0326208658516407,
+ "routers_loss": 0.03297794610261917,
"skip_count": 2.0,
"step": 996,
"text_loss": 0.2098303586244583
@@ -9479,13 +9479,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09912109375,
+ "grad_norm": 0.10107421875,
"learning_rate": 0.000994095761877717,
- "loss": 0.0502,
+ "loss": 0.0499,
"macro_f1": 0.3333333432674408,
"num_tokens": 1612904.0,
"repeat_count": 0.0,
- "routers_loss": 0.012660752050578594,
+ "routers_loss": 0.012901155278086662,
"skip_count": 0.0,
"step": 998,
"text_loss": 0.20103533565998077
@@ -9498,13 +9498,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.265625,
+ "grad_norm": 0.259765625,
"learning_rate": 0.000994048241935257,
- "loss": 0.0537,
+ "loss": 0.0535,
"macro_f1": 0.3272727429866791,
"num_tokens": 1615540.0,
"repeat_count": 0.0,
- "routers_loss": 0.021756287664175034,
+ "routers_loss": 0.020434845238924026,
"skip_count": 0.0,
"step": 1000,
"text_loss": 0.32709044218063354
@@ -9512,37 +9512,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 4.70443205165835,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1669921875,
"learning_rate": 0.0009940005326725789,
- "loss": 0.0447,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.0453,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 1618786.0,
"repeat_count": 0.0,
- "routers_loss": 0.07292548567056656,
+ "routers_loss": 0.07831378281116486,
"skip_count": 2.0,
"step": 1002,
"text_loss": 0.5789632797241211
},
{
- "acc_repeat": 0.5,
+ "acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 4.713824479013795,
- "f1_execute": 0.9811320900917053,
- "f1_repeat": 0.6666666865348816,
+ "f1_execute": 0.9629629254341125,
+ "f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1787109375,
+ "grad_norm": 0.21875,
"learning_rate": 0.0009939526341079647,
- "loss": 0.0505,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0511,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 1621736.0,
"repeat_count": 2.0,
- "routers_loss": 0.03397528454661369,
+ "routers_loss": 0.04863874986767769,
"skip_count": 0.0,
"step": 1004,
"text_loss": 0.6128849387168884
@@ -9555,13 +9555,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009939045462597693,
- "loss": 0.0544,
+ "loss": 0.0538,
"macro_f1": 0.3333333432674408,
"num_tokens": 1624649.0,
"repeat_count": 0.0,
- "routers_loss": 0.005987613927572966,
+ "routers_loss": 0.00677989237010479,
"skip_count": 0.0,
"step": 1006,
"text_loss": 0.6168264150619507
@@ -9574,13 +9574,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009938562691464202,
- "loss": 0.0522,
+ "loss": 0.0524,
"macro_f1": 0.3333333432674408,
"num_tokens": 1627700.0,
"repeat_count": 0.0,
- "routers_loss": 0.021656684577465057,
+ "routers_loss": 0.019490402191877365,
"skip_count": 0.0,
"step": 1008,
"text_loss": 0.17463822662830353
@@ -9593,32 +9593,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.000993807802786417,
- "loss": 0.0487,
+ "loss": 0.0475,
"macro_f1": 0.3333333432674408,
"num_tokens": 1630714.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014992234064266086,
+ "routers_loss": 0.0019022391643375158,
"skip_count": 0.0,
"step": 1010,
"text_loss": 0.5675593018531799
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.5,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 4.751394188435574,
- "f1_execute": 0.9411764740943909,
- "f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.158203125,
+ "f1_execute": 0.9599999785423279,
+ "f1_repeat": 0.6666666865348816,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009937591471983322,
- "loss": 0.0491,
- "macro_f1": 0.5359477400779724,
+ "loss": 0.0501,
+ "macro_f1": 0.7644444704055786,
"num_tokens": 1633770.0,
"repeat_count": 1.0,
- "routers_loss": 0.03448791801929474,
+ "routers_loss": 0.042485643178224564,
"skip_count": 2.0,
"step": 1012,
"text_loss": 0.42387229204177856
@@ -9631,13 +9631,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0009937103024008109,
- "loss": 0.0541,
+ "loss": 0.0545,
"macro_f1": 0.3272727429866791,
"num_tokens": 1637120.0,
"repeat_count": 0.0,
- "routers_loss": 0.08285929262638092,
+ "routers_loss": 0.09427817165851593,
"skip_count": 1.0,
"step": 1014,
"text_loss": 0.49511051177978516
@@ -9650,13 +9650,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.125,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009936612684125702,
- "loss": 0.0515,
+ "loss": 0.0503,
"macro_f1": 0.3333333432674408,
"num_tokens": 1640165.0,
"repeat_count": 0.0,
- "routers_loss": 0.00486504752188921,
+ "routers_loss": 0.005106127820909023,
"skip_count": 0.0,
"step": 1016,
"text_loss": 0.5398799180984497
@@ -9669,13 +9669,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.275390625,
+ "grad_norm": 0.2734375,
"learning_rate": 0.0009936120452524004,
- "loss": 0.051,
+ "loss": 0.0506,
"macro_f1": 0.3333333432674408,
"num_tokens": 1643251.0,
"repeat_count": 0.0,
- "routers_loss": 0.017805909737944603,
+ "routers_loss": 0.016914300620555878,
"skip_count": 0.0,
"step": 1018,
"text_loss": 0.20882178843021393
@@ -9688,13 +9688,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1865234375,
+ "grad_norm": 0.1962890625,
"learning_rate": 0.0009935626329391637,
- "loss": 0.0547,
+ "loss": 0.0537,
"macro_f1": 0.32098764181137085,
"num_tokens": 1646560.0,
"repeat_count": 0.0,
- "routers_loss": 0.12958799302577972,
+ "routers_loss": 0.13481520116329193,
"skip_count": 2.0,
"step": 1020,
"text_loss": 0.5719883441925049
@@ -9707,13 +9707,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009935130314917948,
- "loss": 0.0595,
+ "loss": 0.0602,
"macro_f1": 0.5492662787437439,
"num_tokens": 1649538.0,
"repeat_count": 0.0,
- "routers_loss": 0.07447081059217453,
+ "routers_loss": 0.07700438797473907,
"skip_count": 2.0,
"step": 1022,
"text_loss": 0.1303367167711258
@@ -9726,13 +9726,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009934632409293015,
- "loss": 0.0619,
+ "loss": 0.0611,
"macro_f1": 0.32098764181137085,
"num_tokens": 1652397.0,
"repeat_count": 1.0,
- "routers_loss": 0.12529553472995758,
+ "routers_loss": 0.11416907608509064,
"skip_count": 1.0,
"step": 1024,
"text_loss": 0.24076920747756958
@@ -9745,13 +9745,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.279296875,
+ "grad_norm": 0.306640625,
"learning_rate": 0.0009934132612707631,
- "loss": 0.0491,
+ "loss": 0.0507,
"macro_f1": 0.31446540355682373,
"num_tokens": 1654938.0,
"repeat_count": 0.0,
- "routers_loss": 0.08664281666278839,
+ "routers_loss": 0.09484589844942093,
"skip_count": 2.0,
"step": 1026,
"text_loss": 0.1652517318725586
@@ -9764,13 +9764,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009933630925353324,
- "loss": 0.0394,
+ "loss": 0.0395,
"macro_f1": 0.3333333432674408,
"num_tokens": 1658536.0,
"repeat_count": 0.0,
- "routers_loss": 0.0067965323105454445,
+ "routers_loss": 0.00741987070068717,
"skip_count": 0.0,
"step": 1028,
"text_loss": 0.49296700954437256
@@ -9783,13 +9783,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1845703125,
"learning_rate": 0.0009933127347422337,
- "loss": 0.0607,
+ "loss": 0.0602,
"macro_f1": 0.32098764181137085,
"num_tokens": 1661446.0,
"repeat_count": 0.0,
- "routers_loss": 0.08319470286369324,
+ "routers_loss": 0.08399344235658646,
"skip_count": 2.0,
"step": 1030,
"text_loss": 0.22363591194152832
@@ -9802,13 +9802,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.158203125,
"learning_rate": 0.0009932621879107648,
- "loss": 0.0476,
+ "loss": 0.0475,
"macro_f1": 0.3333333432674408,
"num_tokens": 1664612.0,
"repeat_count": 0.0,
- "routers_loss": 0.002826537238433957,
+ "routers_loss": 0.0031781597062945366,
"skip_count": 0.0,
"step": 1032,
"text_loss": 0.36083245277404785
@@ -9823,11 +9823,11 @@
"f1_skip": 0.0,
"grad_norm": 0.2275390625,
"learning_rate": 0.000993211452060295,
- "loss": 0.0431,
+ "loss": 0.042,
"macro_f1": 0.3272727429866791,
"num_tokens": 1667467.0,
"repeat_count": 0.0,
- "routers_loss": 0.03491095453500748,
+ "routers_loss": 0.03595469892024994,
"skip_count": 1.0,
"step": 1034,
"text_loss": 0.16372856497764587
@@ -9840,13 +9840,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.173828125,
+ "grad_norm": 0.189453125,
"learning_rate": 0.000993160527210266,
- "loss": 0.0616,
+ "loss": 0.061,
"macro_f1": 0.3144654333591461,
"num_tokens": 1670675.0,
"repeat_count": 3.0,
- "routers_loss": 0.1828247457742691,
+ "routers_loss": 0.1597205102443695,
"skip_count": 0.0,
"step": 1036,
"text_loss": 0.6049913763999939
@@ -9859,13 +9859,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2099609375,
+ "grad_norm": 0.2197265625,
"learning_rate": 0.000993109413380193,
- "loss": 0.0563,
+ "loss": 0.0562,
"macro_f1": 0.3333333432674408,
"num_tokens": 1673477.0,
"repeat_count": 0.0,
- "routers_loss": 0.010931054130196571,
+ "routers_loss": 0.009756010957062244,
"skip_count": 0.0,
"step": 1038,
"text_loss": 0.7034620642662048
@@ -9878,13 +9878,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.158203125,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.0009930581105896624,
- "loss": 0.0569,
+ "loss": 0.0559,
"macro_f1": 0.3272727429866791,
"num_tokens": 1676809.0,
"repeat_count": 0.0,
- "routers_loss": 0.023222090676426888,
+ "routers_loss": 0.020718922838568687,
"skip_count": 0.0,
"step": 1040,
"text_loss": 0.2814720571041107
@@ -9897,13 +9897,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1962890625,
+ "grad_norm": 0.1923828125,
"learning_rate": 0.0009930066188583338,
- "loss": 0.0453,
+ "loss": 0.0445,
"macro_f1": 0.32098764181137085,
"num_tokens": 1679398.0,
"repeat_count": 1.0,
- "routers_loss": 0.07085686922073364,
+ "routers_loss": 0.04755603149533272,
"skip_count": 1.0,
"step": 1042,
"text_loss": 0.5445759296417236
@@ -9916,13 +9916,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.126953125,
"learning_rate": 0.0009929549382059388,
- "loss": 0.0515,
+ "loss": 0.0509,
"macro_f1": 0.3333333432674408,
"num_tokens": 1682269.0,
"repeat_count": 0.0,
- "routers_loss": 0.010158216580748558,
+ "routers_loss": 0.01040949858725071,
"skip_count": 0.0,
"step": 1044,
"text_loss": 0.2876914143562317
@@ -9935,13 +9935,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009929030686522816,
- "loss": 0.0372,
+ "loss": 0.0363,
"macro_f1": 0.3333333432674408,
"num_tokens": 1685428.0,
"repeat_count": 0.0,
- "routers_loss": 0.007876895368099213,
+ "routers_loss": 0.008158888667821884,
"skip_count": 0.0,
"step": 1046,
"text_loss": 0.49053525924682617
@@ -9954,13 +9954,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009928510102172386,
- "loss": 0.0501,
+ "loss": 0.0498,
"macro_f1": 0.3333333432674408,
"num_tokens": 1688252.0,
"repeat_count": 0.0,
- "routers_loss": 0.004859173204749823,
+ "routers_loss": 0.005102572031319141,
"skip_count": 0.0,
"step": 1048,
"text_loss": 0.5274341106414795
@@ -9973,13 +9973,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.17578125,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0009927987629207587,
- "loss": 0.0582,
+ "loss": 0.0564,
"macro_f1": 0.3333333432674408,
"num_tokens": 1691289.0,
"repeat_count": 0.0,
- "routers_loss": 0.01798083633184433,
+ "routers_loss": 0.016768503934144974,
"skip_count": 0.0,
"step": 1050,
"text_loss": 0.9935035109519958
@@ -9987,18 +9987,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 4.939242735544467,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1376953125,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009927463267828634,
"loss": 0.0488,
- "macro_f1": 0.3272727429866791,
+ "macro_f1": 0.3333333432674408,
"num_tokens": 1694148.0,
"repeat_count": 0.0,
- "routers_loss": 0.014295363798737526,
+ "routers_loss": 0.010905829258263111,
"skip_count": 0.0,
"step": 1052,
"text_loss": 0.20895758271217346
@@ -10011,13 +10011,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.000992693701823646,
- "loss": 0.0635,
+ "loss": 0.0624,
"macro_f1": 0.3272727429866791,
"num_tokens": 1698543.0,
"repeat_count": 1.0,
- "routers_loss": 0.1038367822766304,
+ "routers_loss": 0.10533971339464188,
"skip_count": 0.0,
"step": 1054,
"text_loss": 0.5776236653327942
@@ -10030,13 +10030,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2421875,
+ "grad_norm": 0.255859375,
"learning_rate": 0.0009926408880632726,
- "loss": 0.057,
+ "loss": 0.0556,
"macro_f1": 0.3272727429866791,
"num_tokens": 1702460.0,
"repeat_count": 0.0,
- "routers_loss": 0.029780643060803413,
+ "routers_loss": 0.026313411071896553,
"skip_count": 1.0,
"step": 1056,
"text_loss": 0.34990596771240234
@@ -10049,13 +10049,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10107421875,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0009925878855219818,
- "loss": 0.0398,
+ "loss": 0.0391,
"macro_f1": 0.3333333432674408,
"num_tokens": 1705686.0,
"repeat_count": 0.0,
- "routers_loss": 0.008537676185369492,
+ "routers_loss": 0.007763393223285675,
"skip_count": 0.0,
"step": 1058,
"text_loss": 0.4980163276195526
@@ -10068,13 +10068,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.171875,
+ "grad_norm": 0.177734375,
"learning_rate": 0.000992534694220084,
- "loss": 0.0617,
+ "loss": 0.0613,
"macro_f1": 0.3272727429866791,
"num_tokens": 1708739.0,
"repeat_count": 0.0,
- "routers_loss": 0.03966755419969559,
+ "routers_loss": 0.03998444974422455,
"skip_count": 1.0,
"step": 1060,
"text_loss": 0.29092350602149963
@@ -10087,13 +10087,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1484375,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.000992481314177962,
- "loss": 0.0311,
+ "loss": 0.0312,
"macro_f1": 0.32098764181137085,
"num_tokens": 1711903.0,
"repeat_count": 1.0,
- "routers_loss": 0.06651833653450012,
+ "routers_loss": 0.06966045498847961,
"skip_count": 1.0,
"step": 1062,
"text_loss": 0.6267179250717163
@@ -10106,13 +10106,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2431640625,
+ "grad_norm": 0.244140625,
"learning_rate": 0.0009924277454160717,
- "loss": 0.0557,
+ "loss": 0.0548,
"macro_f1": 0.3272727429866791,
"num_tokens": 1715974.0,
"repeat_count": 0.0,
- "routers_loss": 0.05130369961261749,
+ "routers_loss": 0.05536063387989998,
"skip_count": 1.0,
"step": 1064,
"text_loss": 0.5813798904418945
@@ -10125,13 +10125,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1337890625,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009923739879549402,
- "loss": 0.0435,
+ "loss": 0.0423,
"macro_f1": 0.3333333432674408,
"num_tokens": 1718828.0,
"repeat_count": 0.0,
- "routers_loss": 0.020534176379442215,
+ "routers_loss": 0.020993782207369804,
"skip_count": 0.0,
"step": 1066,
"text_loss": 0.22665327787399292
@@ -10144,13 +10144,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009923200418151677,
- "loss": 0.0305,
+ "loss": 0.0301,
"macro_f1": 0.3333333432674408,
"num_tokens": 1722419.0,
"repeat_count": 0.0,
- "routers_loss": 0.007514918688684702,
+ "routers_loss": 0.007351701147854328,
"skip_count": 0.0,
"step": 1068,
"text_loss": 0.5796169638633728
@@ -10163,13 +10163,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009922659070174264,
- "loss": 0.0461,
+ "loss": 0.0452,
"macro_f1": 0.3272727429866791,
"num_tokens": 1725663.0,
"repeat_count": 1.0,
- "routers_loss": 0.024598751217126846,
+ "routers_loss": 0.026033315807580948,
"skip_count": 0.0,
"step": 1070,
"text_loss": 0.25742828845977783
@@ -10182,32 +10182,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009922115835824612,
- "loss": 0.0408,
+ "loss": 0.041,
"macro_f1": 0.3333333432674408,
"num_tokens": 1729239.0,
"repeat_count": 0.0,
- "routers_loss": 0.011866633780300617,
+ "routers_loss": 0.0118600158020854,
"skip_count": 0.0,
"step": 1072,
"text_loss": 0.21630282700061798
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 5.042265923099501,
- "f1_execute": 0.9818181991577148,
- "f1_repeat": 0.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009921570715310884,
- "loss": 0.036,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0364,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 1732507.0,
"repeat_count": 1.0,
- "routers_loss": 0.01755746826529503,
+ "routers_loss": 0.016118815168738365,
"skip_count": 0.0,
"step": 1074,
"text_loss": 0.5639925003051758
@@ -10220,13 +10220,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009921023708841974,
- "loss": 0.0415,
+ "loss": 0.0407,
"macro_f1": 0.3333333432674408,
"num_tokens": 1736182.0,
"repeat_count": 0.0,
- "routers_loss": 0.003976983483880758,
+ "routers_loss": 0.004275390412658453,
"skip_count": 0.0,
"step": 1076,
"text_loss": 0.5758615136146545
@@ -10239,13 +10239,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009920474816627496,
- "loss": 0.0378,
+ "loss": 0.037,
"macro_f1": 0.3333333432674408,
"num_tokens": 1739559.0,
"repeat_count": 0.0,
- "routers_loss": 0.013548235408961773,
+ "routers_loss": 0.01299292128533125,
"skip_count": 0.0,
"step": 1078,
"text_loss": 0.18221625685691833
@@ -10258,13 +10258,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009919924038877788,
"loss": 0.0343,
"macro_f1": 0.32098764181137085,
"num_tokens": 1742890.0,
"repeat_count": 0.0,
- "routers_loss": 0.03923165053129196,
+ "routers_loss": 0.038295745849609375,
"skip_count": 2.0,
"step": 1080,
"text_loss": 0.17354349792003632
@@ -10277,13 +10277,13 @@
"f1_execute": 0.9583333134651184,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009919371375803905,
- "loss": 0.0464,
+ "loss": 0.0455,
"macro_f1": 0.8194444179534912,
"num_tokens": 1746433.0,
"repeat_count": 2.0,
- "routers_loss": 0.046429626643657684,
+ "routers_loss": 0.04052971675992012,
"skip_count": 3.0,
"step": 1082,
"text_loss": 0.2250112146139145
@@ -10296,13 +10296,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1025390625,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009918816827617632,
- "loss": 0.0346,
+ "loss": 0.0353,
"macro_f1": 0.3333333432674408,
"num_tokens": 1750802.0,
"repeat_count": 0.0,
- "routers_loss": 0.008998732082545757,
+ "routers_loss": 0.009114136919379234,
"skip_count": 0.0,
"step": 1084,
"text_loss": 0.2526719272136688
@@ -10315,13 +10315,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.000991826039453147,
- "loss": 0.0386,
+ "loss": 0.0392,
"macro_f1": 0.3333333432674408,
"num_tokens": 1754272.0,
"repeat_count": 0.0,
- "routers_loss": 0.005173585377633572,
+ "routers_loss": 0.004904678091406822,
"skip_count": 0.0,
"step": 1086,
"text_loss": 0.7308789491653442
@@ -10334,13 +10334,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.138671875,
"learning_rate": 0.000991770207675865,
- "loss": 0.0308,
+ "loss": 0.0327,
"macro_f1": 0.6666666865348816,
"num_tokens": 1757231.0,
"repeat_count": 0.0,
- "routers_loss": 0.024098891764879227,
+ "routers_loss": 0.02129189297556877,
"skip_count": 2.0,
"step": 1088,
"text_loss": 0.21764220297336578
@@ -10353,13 +10353,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009917141874513113,
"loss": 0.0315,
"macro_f1": 0.3333333432674408,
"num_tokens": 1760003.0,
"repeat_count": 0.0,
- "routers_loss": 0.014002764597535133,
+ "routers_loss": 0.01310618408024311,
"skip_count": 0.0,
"step": 1090,
"text_loss": 0.33892181515693665
@@ -10372,32 +10372,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009916579788009537,
- "loss": 0.0462,
+ "loss": 0.0457,
"macro_f1": 0.5492662787437439,
"num_tokens": 1763052.0,
"repeat_count": 0.0,
- "routers_loss": 0.017871137708425522,
+ "routers_loss": 0.02059309557080269,
"skip_count": 2.0,
"step": 1092,
"text_loss": 0.6551769375801086
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.136190196653947,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1044921875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10546875,
"learning_rate": 0.0009916015817463312,
"loss": 0.0385,
- "macro_f1": 0.32098764181137085,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1766655.0,
"repeat_count": 0.0,
- "routers_loss": 0.033123619854450226,
+ "routers_loss": 0.0274797435849905,
"skip_count": 2.0,
"step": 1094,
"text_loss": 0.3984372019767761
@@ -10410,13 +10410,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.000991544996309055,
- "loss": 0.0267,
+ "loss": 0.0271,
"macro_f1": 0.3333333432674408,
"num_tokens": 1769997.0,
"repeat_count": 0.0,
- "routers_loss": 0.01279227901250124,
+ "routers_loss": 0.01437368243932724,
"skip_count": 0.0,
"step": 1096,
"text_loss": 0.4203338921070099
@@ -10429,13 +10429,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.150390625,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.000991488222510809,
- "loss": 0.0295,
+ "loss": 0.0292,
"macro_f1": 0.3333333432674408,
"num_tokens": 1773130.0,
"repeat_count": 0.0,
- "routers_loss": 0.001354650012217462,
+ "routers_loss": 0.001382062560878694,
"skip_count": 0.0,
"step": 1098,
"text_loss": 0.43132516741752625
@@ -10448,13 +10448,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.123046875,
"learning_rate": 0.000991431260373349,
- "loss": 0.0326,
+ "loss": 0.0329,
"macro_f1": 0.3144654333591461,
"num_tokens": 1775682.0,
"repeat_count": 1.0,
- "routers_loss": 0.1097714751958847,
+ "routers_loss": 0.1115434318780899,
"skip_count": 2.0,
"step": 1100,
"text_loss": 0.3218227028846741
@@ -10467,13 +10467,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.111328125,
"learning_rate": 0.000991374109918503,
- "loss": 0.0187,
+ "loss": 0.0185,
"macro_f1": 0.3333333432674408,
"num_tokens": 1778407.0,
"repeat_count": 0.0,
- "routers_loss": 0.009649592451751232,
+ "routers_loss": 0.009529678151011467,
"skip_count": 0.0,
"step": 1102,
"text_loss": 0.17183731496334076
@@ -10486,13 +10486,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.11083984375,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.000991316771168171,
- "loss": 0.0447,
+ "loss": 0.044,
"macro_f1": 0.5492662787437439,
"num_tokens": 1781518.0,
"repeat_count": 0.0,
- "routers_loss": 0.020858706906437874,
+ "routers_loss": 0.018668074160814285,
"skip_count": 2.0,
"step": 1104,
"text_loss": 1.1324785947799683
@@ -10505,13 +10505,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.134765625,
+ "grad_norm": 0.125,
"learning_rate": 0.0009912592441443258,
- "loss": 0.0428,
+ "loss": 0.0411,
"macro_f1": 0.3272727429866791,
"num_tokens": 1784878.0,
"repeat_count": 0.0,
- "routers_loss": 0.048101235181093216,
+ "routers_loss": 0.04145100712776184,
"skip_count": 1.0,
"step": 1106,
"text_loss": 0.6082063317298889
@@ -10524,13 +10524,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009912015288690112,
- "loss": 0.0435,
+ "loss": 0.0421,
"macro_f1": 0.3272727429866791,
"num_tokens": 1788978.0,
"repeat_count": 0.0,
- "routers_loss": 0.02875671721994877,
+ "routers_loss": 0.021450644358992577,
"skip_count": 1.0,
"step": 1108,
"text_loss": 0.5597621202468872
@@ -10543,13 +10543,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08349609375,
+ "grad_norm": 0.083984375,
"learning_rate": 0.0009911436253643444,
- "loss": 0.0247,
+ "loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 1792321.0,
"repeat_count": 0.0,
- "routers_loss": 0.019005145877599716,
+ "routers_loss": 0.017405325546860695,
"skip_count": 0.0,
"step": 1110,
"text_loss": 0.2560598850250244
@@ -10562,13 +10562,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.255859375,
+ "grad_norm": 0.2294921875,
"learning_rate": 0.0009910855336525137,
- "loss": 0.0393,
+ "loss": 0.0383,
"macro_f1": 0.3333333432674408,
"num_tokens": 1795182.0,
"repeat_count": 0.0,
- "routers_loss": 0.007238700054585934,
+ "routers_loss": 0.007162237539887428,
"skip_count": 0.0,
"step": 1112,
"text_loss": 0.3438240587711334
@@ -10581,13 +10581,13 @@
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.125,
+ "grad_norm": 0.115234375,
"learning_rate": 0.00099102725375578,
"loss": 0.0326,
"macro_f1": 0.480392187833786,
"num_tokens": 1798987.0,
"repeat_count": 1.0,
- "routers_loss": 0.12206140905618668,
+ "routers_loss": 0.11149197816848755,
"skip_count": 3.0,
"step": 1114,
"text_loss": 0.20455503463745117
@@ -10595,18 +10595,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 5.239506897563839,
- "f1_execute": 0.8799999952316284,
+ "f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.0009909687856964767,
- "loss": 0.0366,
- "macro_f1": 0.29333335161209106,
+ "loss": 0.035,
+ "macro_f1": 0.3006536364555359,
"num_tokens": 1802064.0,
"repeat_count": 2.0,
- "routers_loss": 0.15721899271011353,
+ "routers_loss": 0.12679415941238403,
"skip_count": 3.0,
"step": 1116,
"text_loss": 0.11996729671955109
@@ -10619,32 +10619,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.125,
+ "grad_norm": 0.12451171875,
"learning_rate": 0.0009909101294970082,
- "loss": 0.0366,
+ "loss": 0.0365,
"macro_f1": 0.5492662787437439,
"num_tokens": 1805412.0,
"repeat_count": 0.0,
- "routers_loss": 0.05058665946125984,
+ "routers_loss": 0.05108053982257843,
"skip_count": 2.0,
"step": 1118,
"text_loss": 0.13224145770072937
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 5.258291752274729,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.123046875,
"learning_rate": 0.0009908512851798522,
- "loss": 0.0454,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0455,
+ "macro_f1": 0.6603773832321167,
"num_tokens": 1808196.0,
"repeat_count": 1.0,
- "routers_loss": 0.023021472617983818,
+ "routers_loss": 0.02131766639649868,
"skip_count": 1.0,
"step": 1120,
"text_loss": 0.7824069261550903
@@ -10657,13 +10657,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1435546875,
+ "grad_norm": 0.138671875,
"learning_rate": 0.0009907922527675576,
- "loss": 0.0409,
+ "loss": 0.0405,
"macro_f1": 0.3333333432674408,
"num_tokens": 1811622.0,
"repeat_count": 0.0,
- "routers_loss": 0.006660689599812031,
+ "routers_loss": 0.006226244382560253,
"skip_count": 0.0,
"step": 1122,
"text_loss": 0.5419743061065674
@@ -10676,13 +10676,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.12890625,
"learning_rate": 0.000990733032282746,
- "loss": 0.0547,
+ "loss": 0.0535,
"macro_f1": 0.5492662787437439,
"num_tokens": 1814628.0,
"repeat_count": 0.0,
- "routers_loss": 0.031727343797683716,
+ "routers_loss": 0.03088250942528248,
"skip_count": 2.0,
"step": 1124,
"text_loss": 0.37100958824157715
@@ -10695,13 +10695,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.000990673623748111,
- "loss": 0.0351,
+ "loss": 0.0348,
"macro_f1": 0.32098767161369324,
"num_tokens": 1817205.0,
"repeat_count": 0.0,
- "routers_loss": 0.06140992045402527,
+ "routers_loss": 0.05495348572731018,
"skip_count": 1.0,
"step": 1126,
"text_loss": 0.20241330564022064
@@ -10709,18 +10709,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
- "avg_layers": 25.0,
+ "avg_layers": 26.0,
"epoch": 5.295861461696507,
- "f1_execute": 0.9411764740943909,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.4000000059604645,
- "grad_norm": 0.09814453125,
+ "f1_skip": 0.5,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0009906140271864173,
- "loss": 0.0436,
- "macro_f1": 0.44705885648727417,
+ "loss": 0.0433,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 1820141.0,
"repeat_count": 0.0,
- "routers_loss": 0.03872275352478027,
+ "routers_loss": 0.037809282541275024,
"skip_count": 2.0,
"step": 1128,
"text_loss": 0.32965806126594543
@@ -10728,18 +10728,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 5.305253889051952,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09228515625,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009905542426205032,
- "loss": 0.0353,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0348,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 1824011.0,
"repeat_count": 0.0,
- "routers_loss": 0.031013142317533493,
+ "routers_loss": 0.03320181369781494,
"skip_count": 1.0,
"step": 1130,
"text_loss": 0.36329755187034607
@@ -10752,13 +10752,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009904942700732777,
- "loss": 0.0333,
+ "loss": 0.0335,
"macro_f1": 0.3333333432674408,
"num_tokens": 1826873.0,
"repeat_count": 0.0,
- "routers_loss": 0.004357635974884033,
+ "routers_loss": 0.004102326463907957,
"skip_count": 0.0,
"step": 1132,
"text_loss": 0.6692602038383484
@@ -10771,13 +10771,13 @@
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11279296875,
+ "grad_norm": 0.08544921875,
"learning_rate": 0.0009904341095677226,
"loss": 0.03,
"macro_f1": 0.29333335161209106,
"num_tokens": 1830103.0,
"repeat_count": 2.0,
- "routers_loss": 0.2376353144645691,
+ "routers_loss": 0.2376193106174469,
"skip_count": 4.0,
"step": 1134,
"text_loss": 0.19212862849235535
@@ -10790,13 +10790,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10888671875,
+ "grad_norm": 0.119140625,
"learning_rate": 0.0009903737611268919,
- "loss": 0.0446,
+ "loss": 0.0445,
"macro_f1": 0.3333333432674408,
"num_tokens": 1833201.0,
"repeat_count": 0.0,
- "routers_loss": 0.004978097043931484,
+ "routers_loss": 0.005253395065665245,
"skip_count": 0.0,
"step": 1136,
"text_loss": 0.6773360371589661
@@ -10809,13 +10809,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009903132247739107,
- "loss": 0.0309,
+ "loss": 0.0305,
"macro_f1": 0.3076923191547394,
"num_tokens": 1836045.0,
"repeat_count": 1.0,
- "routers_loss": 0.14195409417152405,
+ "routers_loss": 0.14382585883140564,
"skip_count": 3.0,
"step": 1138,
"text_loss": 0.2882297933101654
@@ -10828,13 +10828,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.15234375,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009902525005319766,
- "loss": 0.0403,
+ "loss": 0.04,
"macro_f1": 0.5427350401878357,
"num_tokens": 1839721.0,
"repeat_count": 1.0,
- "routers_loss": 0.04005253314971924,
+ "routers_loss": 0.04033960774540901,
"skip_count": 2.0,
"step": 1140,
"text_loss": 0.7172559499740601
@@ -10847,13 +10847,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.12109375,
"learning_rate": 0.0009901915884243597,
- "loss": 0.0353,
+ "loss": 0.0351,
"macro_f1": 0.6666666865348816,
"num_tokens": 1842614.0,
"repeat_count": 1.0,
- "routers_loss": 0.006839688867330551,
+ "routers_loss": 0.005162308923900127,
"skip_count": 0.0,
"step": 1142,
"text_loss": 0.42892804741859436
@@ -10866,13 +10866,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.0009901304884744014,
- "loss": 0.0396,
+ "loss": 0.0386,
"macro_f1": 0.3144654333591461,
"num_tokens": 1845444.0,
"repeat_count": 1.0,
- "routers_loss": 0.10174567997455597,
+ "routers_loss": 0.10117656737565994,
"skip_count": 2.0,
"step": 1144,
"text_loss": 0.20806430280208588
@@ -10885,13 +10885,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009900692007055152,
- "loss": 0.0365,
+ "loss": 0.0357,
"macro_f1": 0.3333333432674408,
"num_tokens": 1848558.0,
"repeat_count": 0.0,
- "routers_loss": 0.014655748382210732,
+ "routers_loss": 0.014107038266956806,
"skip_count": 0.0,
"step": 1146,
"text_loss": 0.5355974435806274
@@ -10904,13 +10904,13 @@
"f1_execute": 0.9166666865348816,
"f1_repeat": 0.4000000059604645,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.158203125,
+ "grad_norm": 0.16015625,
"learning_rate": 0.000990007725141187,
- "loss": 0.0467,
+ "loss": 0.0449,
"macro_f1": 0.6611111164093018,
"num_tokens": 1852723.0,
"repeat_count": 4.0,
- "routers_loss": 0.16960746049880981,
+ "routers_loss": 0.15537866950035095,
"skip_count": 2.0,
"step": 1148,
"text_loss": 0.6388513445854187
@@ -10923,32 +10923,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1220703125,
+ "grad_norm": 0.1181640625,
"learning_rate": 0.0009899460618049741,
- "loss": 0.0399,
+ "loss": 0.0397,
"macro_f1": 0.3333333432674408,
"num_tokens": 1856181.0,
"repeat_count": 0.0,
- "routers_loss": 0.011591178365051746,
+ "routers_loss": 0.011800912208855152,
"skip_count": 0.0,
"step": 1150,
"text_loss": 0.6113069653511047
},
{
- "acc_repeat": 0.5,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 30.0,
"epoch": 5.408570589961843,
- "f1_execute": 0.9811320900917053,
- "f1_repeat": 0.6666666865348816,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09912109375,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.000989884210720506,
- "loss": 0.0332,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0331,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 1859685.0,
"repeat_count": 2.0,
- "routers_loss": 0.04036068916320801,
+ "routers_loss": 0.022900646552443504,
"skip_count": 0.0,
"step": 1152,
"text_loss": 0.25718021392822266
@@ -10961,13 +10961,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009898221719114844,
- "loss": 0.0366,
+ "loss": 0.0354,
"macro_f1": 0.3272727429866791,
"num_tokens": 1862505.0,
"repeat_count": 0.0,
- "routers_loss": 0.030165785923600197,
+ "routers_loss": 0.026814989745616913,
"skip_count": 1.0,
"step": 1154,
"text_loss": 0.5426549911499023
@@ -10980,13 +10980,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009897599454016823,
- "loss": 0.0421,
+ "loss": 0.0401,
"macro_f1": 0.3333333432674408,
"num_tokens": 1866266.0,
"repeat_count": 0.0,
- "routers_loss": 0.003615695284679532,
+ "routers_loss": 0.0032623792067170143,
"skip_count": 0.0,
"step": 1156,
"text_loss": 0.37752896547317505
@@ -10999,13 +10999,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.07080078125,
"learning_rate": 0.0009896975312149454,
- "loss": 0.0377,
+ "loss": 0.0369,
"macro_f1": 0.3333333432674408,
"num_tokens": 1870216.0,
"repeat_count": 0.0,
- "routers_loss": 0.01679840311408043,
+ "routers_loss": 0.015617577359080315,
"skip_count": 0.0,
"step": 1158,
"text_loss": 0.18207129836082458
@@ -11018,13 +11018,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009896349293751906,
- "loss": 0.0422,
+ "loss": 0.0423,
"macro_f1": 0.3272727429866791,
"num_tokens": 1873338.0,
"repeat_count": 0.0,
- "routers_loss": 0.024936161935329437,
+ "routers_loss": 0.02250153198838234,
"skip_count": 1.0,
"step": 1160,
"text_loss": 0.548884391784668
@@ -11037,13 +11037,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1923828125,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009895721399064072,
- "loss": 0.0407,
+ "loss": 0.0388,
"macro_f1": 0.32098764181137085,
"num_tokens": 1876470.0,
"repeat_count": 1.0,
- "routers_loss": 0.06472968310117722,
+ "routers_loss": 0.055204521864652634,
"skip_count": 1.0,
"step": 1162,
"text_loss": 0.48052409291267395
@@ -11056,13 +11056,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009895091628326564,
- "loss": 0.031,
+ "loss": 0.0293,
"macro_f1": 0.3333333432674408,
"num_tokens": 1879354.0,
"repeat_count": 0.0,
- "routers_loss": 0.009633494541049004,
+ "routers_loss": 0.009093789383769035,
"skip_count": 0.0,
"step": 1164,
"text_loss": 0.3908069431781769
@@ -11075,13 +11075,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.140625,
"learning_rate": 0.000989445998178071,
"loss": 0.0323,
"macro_f1": 0.3272727429866791,
"num_tokens": 1881941.0,
"repeat_count": 0.0,
- "routers_loss": 0.01458993274718523,
+ "routers_loss": 0.015086972154676914,
"skip_count": 1.0,
"step": 1166,
"text_loss": 0.4884725511074066
@@ -11094,13 +11094,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009893826459668558,
- "loss": 0.0389,
+ "loss": 0.0386,
"macro_f1": 0.3144654333591461,
"num_tokens": 1885374.0,
"repeat_count": 0.0,
- "routers_loss": 0.06636982411146164,
+ "routers_loss": 0.06587666273117065,
"skip_count": 3.0,
"step": 1168,
"text_loss": 0.12760137021541595
@@ -11113,13 +11113,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1796875,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0009893191062232873,
- "loss": 0.0325,
+ "loss": 0.0322,
"macro_f1": 0.3333333432674408,
"num_tokens": 1888612.0,
"repeat_count": 0.0,
- "routers_loss": 0.005644182674586773,
+ "routers_loss": 0.006088624242693186,
"skip_count": 0.0,
"step": 1170,
"text_loss": 0.4821319580078125
@@ -11132,13 +11132,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.0009892553789717143,
- "loss": 0.0402,
+ "loss": 0.0389,
"macro_f1": 0.3333333432674408,
"num_tokens": 1891463.0,
"repeat_count": 0.0,
- "routers_loss": 0.010273848660290241,
+ "routers_loss": 0.010113578289747238,
"skip_count": 0.0,
"step": 1172,
"text_loss": 0.3613642454147339
@@ -11151,13 +11151,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009891914642365573,
- "loss": 0.0415,
+ "loss": 0.0404,
"macro_f1": 0.3333333432674408,
"num_tokens": 1894230.0,
"repeat_count": 0.0,
- "routers_loss": 0.004529652185738087,
+ "routers_loss": 0.004947459790855646,
"skip_count": 0.0,
"step": 1174,
"text_loss": 0.5037549138069153
@@ -11170,13 +11170,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2236328125,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.0009891273620423083,
- "loss": 0.045,
+ "loss": 0.0428,
"macro_f1": 0.3272727429866791,
"num_tokens": 1897294.0,
"repeat_count": 1.0,
- "routers_loss": 0.024671228602528572,
+ "routers_loss": 0.026075217872858047,
"skip_count": 0.0,
"step": 1176,
"text_loss": 0.32558977603912354
@@ -11189,13 +11189,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009890630724135314,
- "loss": 0.0354,
+ "loss": 0.0351,
"macro_f1": 0.3272727429866791,
"num_tokens": 1901553.0,
"repeat_count": 0.0,
- "routers_loss": 0.06466450542211533,
+ "routers_loss": 0.06650999188423157,
"skip_count": 1.0,
"step": 1178,
"text_loss": 0.23473620414733887
@@ -11208,13 +11208,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1767578125,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009889985953748625,
- "loss": 0.0278,
+ "loss": 0.0268,
"macro_f1": 0.6666666865348816,
"num_tokens": 1904556.0,
"repeat_count": 0.0,
- "routers_loss": 0.010566026903688908,
+ "routers_loss": 0.010361116379499435,
"skip_count": 1.0,
"step": 1180,
"text_loss": 0.6927042007446289
@@ -11227,13 +11227,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.103515625,
"learning_rate": 0.0009889339309510094,
- "loss": 0.037,
+ "loss": 0.0351,
"macro_f1": 0.3333333432674408,
"num_tokens": 1908053.0,
"repeat_count": 0.0,
- "routers_loss": 0.013842248357832432,
+ "routers_loss": 0.013286533765494823,
"skip_count": 0.0,
"step": 1182,
"text_loss": 0.19977325201034546
@@ -11246,13 +11246,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.5,
- "grad_norm": 0.07373046875,
+ "grad_norm": 0.058837890625,
"learning_rate": 0.0009888690791667518,
- "loss": 0.0215,
+ "loss": 0.0204,
"macro_f1": 0.7018141150474548,
"num_tokens": 1911754.0,
"repeat_count": 2.0,
- "routers_loss": 0.122759610414505,
+ "routers_loss": 0.11920545995235443,
"skip_count": 3.0,
"step": 1184,
"text_loss": 0.4072858691215515
@@ -11265,32 +11265,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009888040400469408,
- "loss": 0.0402,
+ "loss": 0.0391,
"macro_f1": 0.3272727429866791,
"num_tokens": 1914862.0,
"repeat_count": 0.0,
- "routers_loss": 0.035315629094839096,
+ "routers_loss": 0.03652849420905113,
"skip_count": 1.0,
"step": 1186,
"text_loss": 0.2654043138027191
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.577634282359847,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.0009887388136164996,
- "loss": 0.034,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0336,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1918542.0,
"repeat_count": 0.0,
- "routers_loss": 0.040048226714134216,
+ "routers_loss": 0.03991910070180893,
"skip_count": 2.0,
"step": 1188,
"text_loss": 0.21130657196044922
@@ -11298,18 +11298,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 5.587026709715292,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.09521484375,
"learning_rate": 0.000988673399900423,
- "loss": 0.044,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0429,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1921589.0,
"repeat_count": 0.0,
- "routers_loss": 0.012814820744097233,
+ "routers_loss": 0.014900135807693005,
"skip_count": 0.0,
"step": 1190,
"text_loss": 0.5519335865974426
@@ -11322,13 +11322,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2119140625,
+ "grad_norm": 0.1884765625,
"learning_rate": 0.0009886077989237777,
- "loss": 0.0407,
+ "loss": 0.0405,
"macro_f1": 0.3272727429866791,
"num_tokens": 1924320.0,
"repeat_count": 0.0,
- "routers_loss": 0.05977959558367729,
+ "routers_loss": 0.06271552294492722,
"skip_count": 1.0,
"step": 1192,
"text_loss": 0.213813915848732
@@ -11341,13 +11341,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.0,
"f1_skip": 0.888888955116272,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.1875,
"learning_rate": 0.000988542010711702,
- "loss": 0.0334,
+ "loss": 0.0342,
"macro_f1": 0.6225374937057495,
"num_tokens": 1927178.0,
"repeat_count": 0.0,
- "routers_loss": 0.031448643654584885,
+ "routers_loss": 0.03081391751766205,
"skip_count": 5.0,
"step": 1194,
"text_loss": 0.7524349093437195
@@ -11360,13 +11360,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.265625,
+ "grad_norm": 0.255859375,
"learning_rate": 0.0009884760352894064,
- "loss": 0.0523,
+ "loss": 0.0518,
"macro_f1": 0.3333333432674408,
"num_tokens": 1930216.0,
"repeat_count": 0.0,
- "routers_loss": 0.008164947852492332,
+ "routers_loss": 0.008556773886084557,
"skip_count": 0.0,
"step": 1196,
"text_loss": 0.28230375051498413
@@ -11379,32 +11379,32 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.5,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.1064453125,
"learning_rate": 0.0009884098726821726,
- "loss": 0.0478,
+ "loss": 0.0472,
"macro_f1": 0.4871794879436493,
"num_tokens": 1933312.0,
"repeat_count": 3.0,
- "routers_loss": 0.04045635461807251,
+ "routers_loss": 0.05344727262854576,
"skip_count": 0.0,
"step": 1198,
"text_loss": 0.5509607195854187
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6666666865348816,
- "avg_layers": 26.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 5.633988846492516,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
- "f1_skip": 0.800000011920929,
- "grad_norm": 0.1240234375,
+ "f1_skip": 0.5,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.000988343522915354,
- "loss": 0.0447,
- "macro_f1": 0.5866667032241821,
+ "loss": 0.0441,
+ "macro_f1": 0.480392187833786,
"num_tokens": 1936160.0,
"repeat_count": 1.0,
- "routers_loss": 0.06872973591089249,
+ "routers_loss": 0.07324771583080292,
"skip_count": 3.0,
"step": 1200,
"text_loss": 0.30565372109413147
@@ -11412,18 +11412,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
- "avg_layers": 24.0,
+ "avg_layers": 25.0,
"epoch": 5.64338127384796,
- "f1_execute": 0.8695651888847351,
+ "f1_execute": 0.8936169743537903,
"f1_repeat": 0.0,
- "f1_skip": 0.4000000059604645,
- "grad_norm": 0.25390625,
+ "f1_skip": 0.444444477558136,
+ "grad_norm": 0.2470703125,
"learning_rate": 0.0009882769860143764,
- "loss": 0.0331,
- "macro_f1": 0.4231884181499481,
+ "loss": 0.0317,
+ "macro_f1": 0.4460204839706421,
"num_tokens": 1939266.0,
"repeat_count": 0.0,
- "routers_loss": 0.20964151620864868,
+ "routers_loss": 0.18620699644088745,
"skip_count": 6.0,
"step": 1202,
"text_loss": 0.976121723651886
@@ -11442,26 +11442,26 @@
"macro_f1": 0.6666666865348816,
"num_tokens": 1942173.0,
"repeat_count": 0.0,
- "routers_loss": 0.00690250750631094,
+ "routers_loss": 0.007703613489866257,
"skip_count": 1.0,
"step": 1204,
"text_loss": 0.5647401809692383
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.66216612855885,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009881433509120036,
- "loss": 0.0372,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0376,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 1945071.0,
"repeat_count": 0.0,
- "routers_loss": 0.022315658628940582,
+ "routers_loss": 0.02162683941423893,
"skip_count": 2.0,
"step": 1206,
"text_loss": 0.24229218065738678
@@ -11474,13 +11474,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1083984375,
+ "grad_norm": 0.0966796875,
"learning_rate": 0.0009880762527618176,
- "loss": 0.0388,
+ "loss": 0.0383,
"macro_f1": 0.3333333432674408,
"num_tokens": 1949060.0,
"repeat_count": 0.0,
- "routers_loss": 0.017015069723129272,
+ "routers_loss": 0.017667081207036972,
"skip_count": 0.0,
"step": 1208,
"text_loss": 0.4035970866680145
@@ -11493,13 +11493,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.154296875,
"learning_rate": 0.0009880089675798908,
- "loss": 0.0372,
+ "loss": 0.0367,
"macro_f1": 0.3333333432674408,
"num_tokens": 1951698.0,
"repeat_count": 0.0,
- "routers_loss": 0.006532609928399324,
+ "routers_loss": 0.006405784282833338,
"skip_count": 0.0,
"step": 1210,
"text_loss": 0.5319879055023193
@@ -11512,13 +11512,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009879414953920071,
- "loss": 0.0301,
+ "loss": 0.0294,
"macro_f1": 0.3333333432674408,
"num_tokens": 1955266.0,
"repeat_count": 0.0,
- "routers_loss": 0.009720963425934315,
+ "routers_loss": 0.009859707206487656,
"skip_count": 0.0,
"step": 1212,
"text_loss": 0.6687407493591309
@@ -11531,32 +11531,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009878738362240219,
- "loss": 0.046,
+ "loss": 0.045,
"macro_f1": 0.5492662787437439,
"num_tokens": 1958538.0,
"repeat_count": 0.0,
- "routers_loss": 0.03176085278391838,
+ "routers_loss": 0.030890554189682007,
"skip_count": 2.0,
"step": 1214,
"text_loss": 0.20820017158985138
},
{
"acc_repeat": 0.5,
- "acc_skip": 0.5,
- "avg_layers": 29.0,
+ "acc_skip": 0.0,
+ "avg_layers": 30.0,
"epoch": 5.709128265336073,
- "f1_execute": 0.9387754797935486,
+ "f1_execute": 0.9200000166893005,
"f1_repeat": 0.5,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.2021484375,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1806640625,
"learning_rate": 0.000987805990101862,
- "loss": 0.0323,
- "macro_f1": 0.7018141150474548,
+ "loss": 0.0317,
+ "macro_f1": 0.47333335876464844,
"num_tokens": 1961419.0,
"repeat_count": 2.0,
- "routers_loss": 0.08626245707273483,
+ "routers_loss": 0.10383198410272598,
"skip_count": 2.0,
"step": 1216,
"text_loss": 0.8664976358413696
@@ -11569,13 +11569,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009877379570515268,
- "loss": 0.0374,
+ "loss": 0.0366,
"macro_f1": 0.3333333432674408,
"num_tokens": 1964836.0,
"repeat_count": 0.0,
- "routers_loss": 0.012099343352019787,
+ "routers_loss": 0.013376163318753242,
"skip_count": 0.0,
"step": 1218,
"text_loss": 0.4223395884037018
@@ -11588,13 +11588,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.0859375,
"learning_rate": 0.0009876697370990865,
- "loss": 0.0342,
+ "loss": 0.0343,
"macro_f1": 0.3333333432674408,
"num_tokens": 1967620.0,
"repeat_count": 0.0,
- "routers_loss": 0.007713846862316132,
+ "routers_loss": 0.008577900938689709,
"skip_count": 0.0,
"step": 1220,
"text_loss": 0.4789901375770569
@@ -11607,13 +11607,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009876013302706828,
- "loss": 0.0499,
+ "loss": 0.049,
"macro_f1": 0.3333333432674408,
"num_tokens": 1971100.0,
"repeat_count": 0.0,
- "routers_loss": 0.004629489034414291,
+ "routers_loss": 0.004730266984552145,
"skip_count": 0.0,
"step": 1222,
"text_loss": 0.6799837946891785
@@ -11626,13 +11626,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009875327365925295,
- "loss": 0.035,
+ "loss": 0.0341,
"macro_f1": 0.3333333432674408,
"num_tokens": 1974408.0,
"repeat_count": 0.0,
- "routers_loss": 0.010654795914888382,
+ "routers_loss": 0.010849526152014732,
"skip_count": 0.0,
"step": 1224,
"text_loss": 0.18967926502227783
@@ -11640,18 +11640,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 5.756090402113296,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19140625,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009874639560909118,
- "loss": 0.0516,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.0498,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 1977046.0,
"repeat_count": 0.0,
- "routers_loss": 0.05963074415922165,
+ "routers_loss": 0.04841252416372299,
"skip_count": 1.0,
"step": 1226,
"text_loss": 0.6133310198783875
@@ -11664,13 +11664,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1328125,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.0009873949887921867,
- "loss": 0.04,
+ "loss": 0.0402,
"macro_f1": 0.3272727429866791,
"num_tokens": 1980330.0,
"repeat_count": 0.0,
- "routers_loss": 0.028920643031597137,
+ "routers_loss": 0.029638588428497314,
"skip_count": 1.0,
"step": 1228,
"text_loss": 0.15649555623531342
@@ -11678,18 +11678,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 5.774875256824186,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10595703125,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009873258347227823,
- "loss": 0.0327,
- "macro_f1": 0.3333333432674408,
+ "loss": 0.0331,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 1983173.0,
"repeat_count": 0.0,
- "routers_loss": 0.006852717138826847,
+ "routers_loss": 0.009955910965800285,
"skip_count": 0.0,
"step": 1230,
"text_loss": 0.4741005599498749
@@ -11702,13 +11702,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009872564939091989,
- "loss": 0.0346,
+ "loss": 0.0342,
"macro_f1": 0.3333333432674408,
"num_tokens": 1986825.0,
"repeat_count": 0.0,
- "routers_loss": 0.010968753136694431,
+ "routers_loss": 0.010205300524830818,
"skip_count": 0.0,
"step": 1232,
"text_loss": 0.5315462350845337
@@ -11721,13 +11721,13 @@
"f1_execute": 0.9302325248718262,
"f1_repeat": 1.0,
"f1_skip": 0.7272727489471436,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.11865234375,
"learning_rate": 0.0009871869663780077,
- "loss": 0.0344,
+ "loss": 0.0336,
"macro_f1": 0.8858351111412048,
"num_tokens": 1990448.0,
"repeat_count": 1.0,
- "routers_loss": 0.0906950980424881,
+ "routers_loss": 0.09120134264230728,
"skip_count": 7.0,
"step": 1234,
"text_loss": 0.6187508702278137
@@ -11740,13 +11740,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.125,
"learning_rate": 0.0009871172521558522,
- "loss": 0.0484,
+ "loss": 0.0475,
"macro_f1": 0.6666666865348816,
"num_tokens": 1993474.0,
"repeat_count": 0.0,
- "routers_loss": 0.016306072473526,
+ "routers_loss": 0.016188839450478554,
"skip_count": 1.0,
"step": 1236,
"text_loss": 0.20783066749572754
@@ -11759,13 +11759,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.208984375,
+ "grad_norm": 0.216796875,
"learning_rate": 0.0009870473512694465,
- "loss": 0.038,
+ "loss": 0.0373,
"macro_f1": 0.5934640765190125,
"num_tokens": 1996536.0,
"repeat_count": 0.0,
- "routers_loss": 0.05804471671581268,
+ "routers_loss": 0.05046704784035683,
"skip_count": 3.0,
"step": 1238,
"text_loss": 0.247748002409935
@@ -11773,18 +11773,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 5.821837393601409,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.091796875,
+ "f1_skip": 0.5,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.0009869772637455772,
- "loss": 0.0256,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0251,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 1999530.0,
"repeat_count": 0.0,
- "routers_loss": 0.045395996421575546,
+ "routers_loss": 0.044926248490810394,
"skip_count": 2.0,
"step": 1240,
"text_loss": 0.26001980900764465
@@ -11797,13 +11797,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11767578125,
+ "grad_norm": 0.1513671875,
"learning_rate": 0.000986906989611102,
- "loss": 0.0438,
+ "loss": 0.0446,
"macro_f1": 0.3272727429866791,
"num_tokens": 2002782.0,
"repeat_count": 0.0,
- "routers_loss": 0.020834850147366524,
+ "routers_loss": 0.025911526754498482,
"skip_count": 0.0,
"step": 1242,
"text_loss": 0.9009982943534851
@@ -11816,13 +11816,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009868365288929492,
- "loss": 0.0377,
+ "loss": 0.0371,
"macro_f1": 0.3333333432674408,
"num_tokens": 2005331.0,
"repeat_count": 0.0,
- "routers_loss": 0.005241698585450649,
+ "routers_loss": 0.0043760035187006,
"skip_count": 0.0,
"step": 1244,
"text_loss": 0.5547386407852173
@@ -11835,13 +11835,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.0009867658816181206,
- "loss": 0.038,
+ "loss": 0.0374,
"macro_f1": 0.3333333432674408,
"num_tokens": 2008115.0,
"repeat_count": 0.0,
- "routers_loss": 0.008387803100049496,
+ "routers_loss": 0.009227181784808636,
"skip_count": 0.0,
"step": 1246,
"text_loss": 1.0067731142044067
@@ -11854,13 +11854,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.126953125,
"learning_rate": 0.000986695047813688,
- "loss": 0.0256,
+ "loss": 0.0261,
"macro_f1": 0.3272727429866791,
"num_tokens": 2011137.0,
"repeat_count": 1.0,
- "routers_loss": 0.02261745184659958,
+ "routers_loss": 0.023822437971830368,
"skip_count": 0.0,
"step": 1248,
"text_loss": 0.30058956146240234
@@ -11873,32 +11873,32 @@
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009866240275067948,
- "loss": 0.0435,
+ "loss": 0.044,
"macro_f1": 0.47333335876464844,
"num_tokens": 2014159.0,
"repeat_count": 2.0,
- "routers_loss": 0.21678555011749268,
+ "routers_loss": 0.21523773670196533,
"skip_count": 3.0,
"step": 1250,
"text_loss": 0.39072203636169434
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 5.878191957734077,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009865528207246563,
- "loss": 0.0358,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0351,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 2017731.0,
"repeat_count": 0.0,
- "routers_loss": 0.06554054468870163,
+ "routers_loss": 0.06184682995080948,
"skip_count": 2.0,
"step": 1252,
"text_loss": 0.35751575231552124
@@ -11911,13 +11911,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.203125,
+ "grad_norm": 0.166015625,
"learning_rate": 0.000986481427494559,
- "loss": 0.0337,
+ "loss": 0.0336,
"macro_f1": 0.3333333432674408,
"num_tokens": 2020485.0,
"repeat_count": 0.0,
- "routers_loss": 0.007237187586724758,
+ "routers_loss": 0.007573372684419155,
"skip_count": 0.0,
"step": 1254,
"text_loss": 0.4061077833175659
@@ -11930,13 +11930,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1845703125,
+ "grad_norm": 0.1708984375,
"learning_rate": 0.000986409847843861,
- "loss": 0.0387,
+ "loss": 0.0382,
"macro_f1": 0.3272727429866791,
"num_tokens": 2024149.0,
"repeat_count": 1.0,
- "routers_loss": 0.08003793656826019,
+ "routers_loss": 0.07447971403598785,
"skip_count": 0.0,
"step": 1256,
"text_loss": 0.41876497864723206
@@ -11949,13 +11949,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000986338081799992,
- "loss": 0.0341,
+ "loss": 0.0351,
"macro_f1": 0.3333333432674408,
"num_tokens": 2026545.0,
"repeat_count": 0.0,
- "routers_loss": 0.006424390245229006,
+ "routers_loss": 0.006609147880226374,
"skip_count": 0.0,
"step": 1258,
"text_loss": 0.4673794209957123
@@ -11968,13 +11968,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009862661293904523,
- "loss": 0.0482,
+ "loss": 0.0498,
"macro_f1": 0.32098764181137085,
"num_tokens": 2029581.0,
"repeat_count": 0.0,
- "routers_loss": 0.10797854512929916,
+ "routers_loss": 0.10624702274799347,
"skip_count": 2.0,
"step": 1260,
"text_loss": 0.3483233153820038
@@ -11987,13 +11987,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.111328125,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009861939906428145,
- "loss": 0.053,
+ "loss": 0.0525,
"macro_f1": 0.3333333432674408,
"num_tokens": 2033936.0,
"repeat_count": 0.0,
- "routers_loss": 0.006734046153724194,
+ "routers_loss": 0.007944886572659016,
"skip_count": 0.0,
"step": 1262,
"text_loss": 0.16362667083740234
@@ -12006,13 +12006,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009861216655847225,
- "loss": 0.0373,
+ "loss": 0.0376,
"macro_f1": 0.6666666865348816,
"num_tokens": 2037876.0,
"repeat_count": 1.0,
- "routers_loss": 0.00564212491735816,
+ "routers_loss": 0.007004092447459698,
"skip_count": 0.0,
"step": 1264,
"text_loss": 0.43228110671043396
@@ -12025,13 +12025,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1044921875,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.0009860491542438912,
- "loss": 0.0472,
+ "loss": 0.047,
"macro_f1": 0.3272727429866791,
"num_tokens": 2040842.0,
"repeat_count": 0.0,
- "routers_loss": 0.026137735694646835,
+ "routers_loss": 0.026916226372122765,
"skip_count": 1.0,
"step": 1266,
"text_loss": 0.5901188850402832
@@ -12044,13 +12044,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.000985976456648107,
- "loss": 0.0343,
+ "loss": 0.0353,
"macro_f1": 0.3333333432674408,
"num_tokens": 2043890.0,
"repeat_count": 0.0,
- "routers_loss": 0.0069669694639742374,
+ "routers_loss": 0.007325216196477413,
"skip_count": 0.0,
"step": 1268,
"text_loss": 0.8780109882354736
@@ -12063,13 +12063,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.000985903572825228,
- "loss": 0.0323,
+ "loss": 0.0306,
"macro_f1": 0.4871794879436493,
"num_tokens": 2048848.0,
"repeat_count": 0.0,
- "routers_loss": 0.05618409812450409,
+ "routers_loss": 0.05007527023553848,
"skip_count": 2.0,
"step": 1270,
"text_loss": 0.5863722562789917
@@ -12084,11 +12084,11 @@
"f1_skip": 0.0,
"grad_norm": 0.173828125,
"learning_rate": 0.000985830502803183,
- "loss": 0.0391,
+ "loss": 0.0396,
"macro_f1": 0.3272727429866791,
"num_tokens": 2051561.0,
"repeat_count": 0.0,
- "routers_loss": 0.025900620967149734,
+ "routers_loss": 0.023995524272322655,
"skip_count": 0.0,
"step": 1272,
"text_loss": 0.7460709810256958
@@ -12101,13 +12101,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.0009857572466099732,
- "loss": 0.0426,
+ "loss": 0.0431,
"macro_f1": 0.3333333432674408,
"num_tokens": 2054752.0,
"repeat_count": 0.0,
- "routers_loss": 0.006236737594008446,
+ "routers_loss": 0.006928362417966127,
"skip_count": 0.0,
"step": 1274,
"text_loss": 0.5130293369293213
@@ -12120,13 +12120,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.171875,
+ "grad_norm": 0.162109375,
"learning_rate": 0.0009856838042736698,
- "loss": 0.0503,
+ "loss": 0.0501,
"macro_f1": 0.3333333432674408,
"num_tokens": 2058151.0,
"repeat_count": 0.0,
- "routers_loss": 0.006367063149809837,
+ "routers_loss": 0.006969396956264973,
"skip_count": 0.0,
"step": 1276,
"text_loss": 0.5911393761634827
@@ -12139,13 +12139,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.1357421875,
"learning_rate": 0.0009856101758224166,
- "loss": 0.0442,
+ "loss": 0.0441,
"macro_f1": 0.3333333432674408,
"num_tokens": 2061012.0,
"repeat_count": 0.0,
- "routers_loss": 0.003392914542928338,
+ "routers_loss": 0.003499418031424284,
"skip_count": 0.0,
"step": 1278,
"text_loss": 0.25347545742988586
@@ -12158,13 +12158,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.000985536361284428,
- "loss": 0.0231,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2064597.0,
"repeat_count": 0.0,
- "routers_loss": 0.007376343477517366,
+ "routers_loss": 0.007856054231524467,
"skip_count": 0.0,
"step": 1280,
"text_loss": 0.7476963400840759
@@ -12177,13 +12177,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009854623606879898,
- "loss": 0.0243,
+ "loss": 0.0245,
"macro_f1": 0.3272727429866791,
"num_tokens": 2067972.0,
"repeat_count": 0.0,
- "routers_loss": 0.02773376554250717,
+ "routers_loss": 0.02617792971432209,
"skip_count": 1.0,
"step": 1282,
"text_loss": 0.5775872468948364
@@ -12196,13 +12196,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.000985388174061459,
- "loss": 0.0363,
+ "loss": 0.0356,
"macro_f1": 0.32098767161369324,
"num_tokens": 2071812.0,
"repeat_count": 0.0,
- "routers_loss": 0.03535797819495201,
+ "routers_loss": 0.035979997366666794,
"skip_count": 1.0,
"step": 1284,
"text_loss": 0.2933400869369507
@@ -12215,13 +12215,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08447265625,
"learning_rate": 0.0009853138014332646,
- "loss": 0.0269,
+ "loss": 0.0273,
"macro_f1": 0.3333333432674408,
"num_tokens": 2074868.0,
"repeat_count": 0.0,
- "routers_loss": 0.004910993855446577,
+ "routers_loss": 0.005142854526638985,
"skip_count": 0.0,
"step": 1286,
"text_loss": 0.29085102677345276
@@ -12234,13 +12234,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.0009852392428319058,
- "loss": 0.0301,
+ "loss": 0.0306,
"macro_f1": 0.3333333432674408,
"num_tokens": 2078225.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032444109674543142,
+ "routers_loss": 0.0032799106556922197,
"skip_count": 0.0,
"step": 1288,
"text_loss": 0.7293626070022583
@@ -12253,13 +12253,13 @@
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009851644982859537,
- "loss": 0.0272,
+ "loss": 0.0273,
"macro_f1": 0.480392187833786,
"num_tokens": 2081495.0,
"repeat_count": 1.0,
- "routers_loss": 0.12451831251382828,
+ "routers_loss": 0.12224318832159042,
"skip_count": 3.0,
"step": 1290,
"text_loss": 0.26125892996788025
@@ -12272,13 +12272,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1435546875,
"learning_rate": 0.0009850895678240508,
- "loss": 0.0289,
+ "loss": 0.0283,
"macro_f1": 0.6666666865348816,
"num_tokens": 2084390.0,
"repeat_count": 1.0,
- "routers_loss": 0.011074979789555073,
+ "routers_loss": 0.010662888176739216,
"skip_count": 0.0,
"step": 1292,
"text_loss": 0.3510764539241791
@@ -12291,13 +12291,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.1689453125,
"learning_rate": 0.0009850144514749104,
- "loss": 0.0336,
+ "loss": 0.0332,
"macro_f1": 0.5492662787437439,
"num_tokens": 2087210.0,
"repeat_count": 0.0,
- "routers_loss": 0.01774786226451397,
+ "routers_loss": 0.01979079470038414,
"skip_count": 2.0,
"step": 1294,
"text_loss": 0.40202176570892334
@@ -12310,13 +12310,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.000984939149267317,
- "loss": 0.0251,
+ "loss": 0.0253,
"macro_f1": 0.6666666865348816,
"num_tokens": 2090777.0,
"repeat_count": 0.0,
- "routers_loss": 0.0052874404937028885,
+ "routers_loss": 0.005172552540898323,
"skip_count": 1.0,
"step": 1296,
"text_loss": 0.5275651216506958
@@ -12329,13 +12329,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10107421875,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009848636612301272,
- "loss": 0.031,
+ "loss": 0.0299,
"macro_f1": 0.3333333432674408,
"num_tokens": 2094248.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034106262028217316,
+ "routers_loss": 0.0029599082190543413,
"skip_count": 0.0,
"step": 1298,
"text_loss": 0.4517653286457062
@@ -12348,13 +12348,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2177734375,
+ "grad_norm": 0.23046875,
"learning_rate": 0.0009847879873922675,
"loss": 0.0357,
"macro_f1": 0.3333333432674408,
"num_tokens": 2097139.0,
"repeat_count": 0.0,
- "routers_loss": 0.010383229702711105,
+ "routers_loss": 0.011455860920250416,
"skip_count": 0.0,
"step": 1300,
"text_loss": 0.16888445615768433
@@ -12367,13 +12367,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.09619140625,
"learning_rate": 0.0009847121277827366,
- "loss": 0.0304,
+ "loss": 0.0301,
"macro_f1": 0.3333333432674408,
"num_tokens": 2100415.0,
"repeat_count": 0.0,
- "routers_loss": 0.0076674893498420715,
+ "routers_loss": 0.008091195486485958,
"skip_count": 0.0,
"step": 1302,
"text_loss": 0.40061676502227783
@@ -12386,13 +12386,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.109375,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.000984636082430604,
- "loss": 0.0287,
+ "loss": 0.0285,
"macro_f1": 0.3333333432674408,
"num_tokens": 2103285.0,
"repeat_count": 0.0,
- "routers_loss": 0.010486516170203686,
+ "routers_loss": 0.009593960829079151,
"skip_count": 0.0,
"step": 1304,
"text_loss": 0.7211073637008667
@@ -12405,13 +12405,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.107421875,
"learning_rate": 0.0009845598513650103,
- "loss": 0.0237,
+ "loss": 0.0231,
"macro_f1": 0.3333333432674408,
"num_tokens": 2106255.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023783023934811354,
+ "routers_loss": 0.0023068038281053305,
"skip_count": 0.0,
"step": 1306,
"text_loss": 0.7077119946479797
@@ -12424,13 +12424,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.181640625,
+ "grad_norm": 0.171875,
"learning_rate": 0.0009844834346151674,
- "loss": 0.044,
+ "loss": 0.043,
"macro_f1": 0.3333333432674408,
"num_tokens": 2109305.0,
"repeat_count": 0.0,
- "routers_loss": 0.006714595016092062,
+ "routers_loss": 0.007703019306063652,
"skip_count": 0.0,
"step": 1308,
"text_loss": 0.3534316122531891
@@ -12443,13 +12443,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009844068322103585,
- "loss": 0.0281,
+ "loss": 0.0287,
"macro_f1": 0.3272727429866791,
"num_tokens": 2112216.0,
"repeat_count": 0.0,
- "routers_loss": 0.022373953834176064,
+ "routers_loss": 0.023549847304821014,
"skip_count": 1.0,
"step": 1310,
"text_loss": 0.6792599558830261
@@ -12462,13 +12462,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009843300441799378,
- "loss": 0.0205,
+ "loss": 0.0211,
"macro_f1": 0.3333333432674408,
"num_tokens": 2114925.0,
"repeat_count": 0.0,
- "routers_loss": 0.007452849764376879,
+ "routers_loss": 0.007605871185660362,
"skip_count": 0.0,
"step": 1312,
"text_loss": 0.1571389138698578
@@ -12481,13 +12481,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009842530705533304,
- "loss": 0.0251,
+ "loss": 0.0253,
"macro_f1": 0.3272727429866791,
"num_tokens": 2117744.0,
"repeat_count": 0.0,
- "routers_loss": 0.016413308680057526,
+ "routers_loss": 0.014964760281145573,
"skip_count": 0.0,
"step": 1314,
"text_loss": 0.7840361595153809
@@ -12500,13 +12500,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.000984175911360033,
- "loss": 0.0243,
+ "loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 2120848.0,
"repeat_count": 0.0,
- "routers_loss": 0.004676427226513624,
+ "routers_loss": 0.004663798492401838,
"skip_count": 0.0,
"step": 1316,
"text_loss": 0.536246120929718
@@ -12519,13 +12519,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.000984098566629613,
- "loss": 0.0284,
+ "loss": 0.0288,
"macro_f1": 0.5492662787437439,
"num_tokens": 2123651.0,
"repeat_count": 0.0,
- "routers_loss": 0.024454625323414803,
+ "routers_loss": 0.022852955386042595,
"skip_count": 2.0,
"step": 1318,
"text_loss": 0.43372172117233276
@@ -12538,13 +12538,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0009840210363917087,
- "loss": 0.022,
+ "loss": 0.0216,
"macro_f1": 0.3333333432674408,
"num_tokens": 2128011.0,
"repeat_count": 0.0,
- "routers_loss": 0.013495884835720062,
+ "routers_loss": 0.012578422203660011,
"skip_count": 0.0,
"step": 1320,
"text_loss": 0.28190380334854126
@@ -12557,13 +12557,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.0009839433206760306,
- "loss": 0.0213,
+ "loss": 0.0204,
"macro_f1": 0.3333333432674408,
"num_tokens": 2131035.0,
"repeat_count": 0.0,
- "routers_loss": 0.006397814955562353,
+ "routers_loss": 0.006863643880933523,
"skip_count": 0.0,
"step": 1322,
"text_loss": 0.6340444087982178
@@ -12576,13 +12576,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1796875,
"learning_rate": 0.0009838654195123589,
- "loss": 0.0246,
+ "loss": 0.0243,
"macro_f1": 0.3333333432674408,
"num_tokens": 2133856.0,
"repeat_count": 0.0,
- "routers_loss": 0.00503434706479311,
+ "routers_loss": 0.00468854233622551,
"skip_count": 0.0,
"step": 1324,
"text_loss": 0.5138425827026367
@@ -12595,13 +12595,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009837873329305458,
- "loss": 0.0402,
+ "loss": 0.0396,
"macro_f1": 0.6666666865348816,
"num_tokens": 2136451.0,
"repeat_count": 1.0,
- "routers_loss": 0.005150494631379843,
+ "routers_loss": 0.005731126759201288,
"skip_count": 0.0,
"step": 1326,
"text_loss": 0.742124617099762
@@ -12614,13 +12614,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1533203125,
+ "grad_norm": 0.17578125,
"learning_rate": 0.000983709060960514,
- "loss": 0.041,
+ "loss": 0.0416,
"macro_f1": 0.3333333432674408,
"num_tokens": 2139496.0,
"repeat_count": 0.0,
- "routers_loss": 0.004570818971842527,
+ "routers_loss": 0.0056343949399888515,
"skip_count": 0.0,
"step": 1328,
"text_loss": 0.7317464351654053
@@ -12633,13 +12633,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09326171875,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.0009836306036322576,
- "loss": 0.0314,
+ "loss": 0.0312,
"macro_f1": 0.3333333432674408,
"num_tokens": 2143120.0,
"repeat_count": 0.0,
- "routers_loss": 0.005299333017319441,
+ "routers_loss": 0.005127966403961182,
"skip_count": 0.0,
"step": 1330,
"text_loss": 0.538652241230011
@@ -12652,13 +12652,13 @@
"f1_execute": 0.9130434989929199,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.111328125,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009835519609758415,
- "loss": 0.0303,
+ "loss": 0.0301,
"macro_f1": 0.590062141418457,
"num_tokens": 2145807.0,
"repeat_count": 3.0,
- "routers_loss": 0.168672576546669,
+ "routers_loss": 0.1673707216978073,
"skip_count": 4.0,
"step": 1332,
"text_loss": 0.3498198091983795
@@ -12671,32 +12671,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009834731330214017,
- "loss": 0.0302,
+ "loss": 0.0293,
"macro_f1": 0.3272727429866791,
"num_tokens": 2148397.0,
"repeat_count": 1.0,
- "routers_loss": 0.05187409743666649,
+ "routers_loss": 0.04026653990149498,
"skip_count": 0.0,
"step": 1334,
"text_loss": 0.8153424859046936
},
{
"acc_repeat": 1.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.800000011920929,
+ "avg_layers": 27.0,
"epoch": 6.272380393307896,
- "f1_execute": 0.9230769276618958,
+ "f1_execute": 0.8999999761581421,
"f1_repeat": 0.6666666865348816,
- "f1_skip": 0.9090909361839294,
- "grad_norm": 0.1669921875,
+ "f1_skip": 0.8000000715255737,
+ "grad_norm": 0.16015625,
"learning_rate": 0.0009833941197991455,
- "loss": 0.0339,
- "macro_f1": 0.8329448699951172,
+ "loss": 0.0329,
+ "macro_f1": 0.7888889312744141,
"num_tokens": 2152226.0,
"repeat_count": 2.0,
- "routers_loss": 0.05786697566509247,
+ "routers_loss": 0.05481519177556038,
"skip_count": 5.0,
"step": 1336,
"text_loss": 0.7802760004997253
@@ -12709,13 +12709,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009833149213393506,
- "loss": 0.0315,
+ "loss": 0.0304,
"macro_f1": 0.3272727429866791,
"num_tokens": 2156023.0,
"repeat_count": 0.0,
- "routers_loss": 0.017055779695510864,
+ "routers_loss": 0.01760484278202057,
"skip_count": 0.0,
"step": 1338,
"text_loss": 0.19721226394176483
@@ -12728,13 +12728,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.000983235537672366,
- "loss": 0.0249,
+ "loss": 0.0256,
"macro_f1": 0.3333333432674408,
"num_tokens": 2160037.0,
"repeat_count": 0.0,
- "routers_loss": 0.011614206247031689,
+ "routers_loss": 0.013206037692725658,
"skip_count": 0.0,
"step": 1340,
"text_loss": 0.5003817081451416
@@ -12747,13 +12747,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1640625,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.000983155968828612,
- "loss": 0.033,
+ "loss": 0.0315,
"macro_f1": 0.6666666865348816,
"num_tokens": 2163910.0,
"repeat_count": 1.0,
- "routers_loss": 0.012611300684511662,
+ "routers_loss": 0.01256406120955944,
"skip_count": 0.0,
"step": 1342,
"text_loss": 0.5996923446655273
@@ -12766,13 +12766,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.11962890625,
"learning_rate": 0.0009830762148385793,
- "loss": 0.0315,
+ "loss": 0.0313,
"macro_f1": 0.3272727429866791,
"num_tokens": 2166921.0,
"repeat_count": 0.0,
- "routers_loss": 0.018757276237010956,
+ "routers_loss": 0.015086234547197819,
"skip_count": 1.0,
"step": 1344,
"text_loss": 0.45356282591819763
@@ -12785,13 +12785,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08447265625,
"learning_rate": 0.0009829962757328297,
- "loss": 0.0229,
+ "loss": 0.0223,
"macro_f1": 0.32098764181137085,
"num_tokens": 2170135.0,
"repeat_count": 0.0,
- "routers_loss": 0.08197146654129028,
+ "routers_loss": 0.07909081131219864,
"skip_count": 2.0,
"step": 1346,
"text_loss": 0.2874644994735718
@@ -12804,13 +12804,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0009829161515419959,
- "loss": 0.0256,
+ "loss": 0.0246,
"macro_f1": 0.6666666865348816,
"num_tokens": 2173029.0,
"repeat_count": 0.0,
- "routers_loss": 0.014122758992016315,
+ "routers_loss": 0.013569854199886322,
"skip_count": 2.0,
"step": 1348,
"text_loss": 0.25533875823020935
@@ -12823,13 +12823,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06005859375,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0009828358422967823,
- "loss": 0.0221,
+ "loss": 0.0226,
"macro_f1": 0.32098764181137085,
"num_tokens": 2176605.0,
"repeat_count": 1.0,
- "routers_loss": 0.08215996623039246,
+ "routers_loss": 0.08111091703176498,
"skip_count": 1.0,
"step": 1350,
"text_loss": 0.32827726006507874
@@ -12842,13 +12842,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09375,
+ "grad_norm": 0.091796875,
"learning_rate": 0.0009827553480279627,
- "loss": 0.0312,
+ "loss": 0.03,
"macro_f1": 0.5427350401878357,
"num_tokens": 2179406.0,
"repeat_count": 0.0,
- "routers_loss": 0.026304977014660835,
+ "routers_loss": 0.026550088077783585,
"skip_count": 2.0,
"step": 1352,
"text_loss": 0.2966301143169403
@@ -12861,13 +12861,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009826746687663832,
- "loss": 0.0302,
+ "loss": 0.0301,
"macro_f1": 0.3333333432674408,
"num_tokens": 2182353.0,
"repeat_count": 0.0,
- "routers_loss": 0.003616038942709565,
+ "routers_loss": 0.003914554137736559,
"skip_count": 0.0,
"step": 1354,
"text_loss": 0.7596251964569092
@@ -12880,13 +12880,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.0849609375,
+ "grad_norm": 0.0859375,
"learning_rate": 0.0009825938045429602,
- "loss": 0.0323,
+ "loss": 0.0324,
"macro_f1": 0.5866667032241821,
"num_tokens": 2185786.0,
"repeat_count": 1.0,
- "routers_loss": 0.060399893671274185,
+ "routers_loss": 0.059612665325403214,
"skip_count": 3.0,
"step": 1356,
"text_loss": 0.12325898557901382
@@ -12899,13 +12899,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.10009765625,
"learning_rate": 0.0009825127553886807,
- "loss": 0.0384,
+ "loss": 0.0375,
"macro_f1": 0.3333333432674408,
"num_tokens": 2190157.0,
"repeat_count": 0.0,
- "routers_loss": 0.007164204493165016,
+ "routers_loss": 0.0071132429875433445,
"skip_count": 0.0,
"step": 1358,
"text_loss": 0.9287898540496826
@@ -12918,13 +12918,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009824315213346033,
- "loss": 0.0343,
+ "loss": 0.0348,
"macro_f1": 0.3333333432674408,
"num_tokens": 2193077.0,
"repeat_count": 0.0,
- "routers_loss": 0.010965060442686081,
+ "routers_loss": 0.009611099027097225,
"skip_count": 0.0,
"step": 1360,
"text_loss": 0.20427259802818298
@@ -12937,13 +12937,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009823501024118569,
- "loss": 0.0276,
+ "loss": 0.0285,
"macro_f1": 0.3333333432674408,
"num_tokens": 2196494.0,
"repeat_count": 0.0,
- "routers_loss": 0.00784136913716793,
+ "routers_loss": 0.006913455203175545,
"skip_count": 0.0,
"step": 1362,
"text_loss": 0.574759840965271
@@ -12956,13 +12956,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009822684986516411,
- "loss": 0.0251,
+ "loss": 0.0245,
"macro_f1": 0.3333333432674408,
"num_tokens": 2199839.0,
"repeat_count": 0.0,
- "routers_loss": 0.009101065807044506,
+ "routers_loss": 0.009208920411765575,
"skip_count": 0.0,
"step": 1364,
"text_loss": 0.42422571778297424
@@ -12970,37 +12970,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 6.413266803639566,
- "f1_execute": 0.9433962106704712,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.000982186710085227,
- "loss": 0.0206,
- "macro_f1": 0.31446540355682373,
+ "loss": 0.0208,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 2203212.0,
"repeat_count": 1.0,
- "routers_loss": 0.05967295169830322,
+ "routers_loss": 0.059975091367959976,
"skip_count": 1.0,
"step": 1366,
"text_loss": 0.29213017225265503
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 26.0,
+ "acc_skip": 0.25,
+ "avg_layers": 27.0,
"epoch": 6.42265923099501,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.1875,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.181640625,
"learning_rate": 0.0009821047367439561,
- "loss": 0.0356,
- "macro_f1": 0.542222261428833,
+ "loss": 0.0358,
+ "macro_f1": 0.44705885648727417,
"num_tokens": 2206240.0,
"repeat_count": 0.0,
- "routers_loss": 0.05016552656888962,
+ "routers_loss": 0.048244867473840714,
"skip_count": 4.0,
"step": 1368,
"text_loss": 0.3072395324707031
@@ -13013,13 +13013,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009820225786592405,
- "loss": 0.038,
+ "loss": 0.0375,
"macro_f1": 0.3272727429866791,
"num_tokens": 2209903.0,
"repeat_count": 1.0,
- "routers_loss": 0.02483060024678707,
+ "routers_loss": 0.026068156585097313,
"skip_count": 0.0,
"step": 1370,
"text_loss": 0.5961400270462036
@@ -13032,13 +13032,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.109375,
"learning_rate": 0.0009819402358625634,
- "loss": 0.0373,
+ "loss": 0.0366,
"macro_f1": 0.3272727429866791,
"num_tokens": 2213439.0,
"repeat_count": 0.0,
- "routers_loss": 0.01982821337878704,
+ "routers_loss": 0.022615568712353706,
"skip_count": 1.0,
"step": 1372,
"text_loss": 0.19375644624233246
@@ -13051,13 +13051,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.000981857708385479,
- "loss": 0.0353,
+ "loss": 0.0346,
"macro_f1": 0.3333333432674408,
"num_tokens": 2216457.0,
"repeat_count": 0.0,
- "routers_loss": 0.004753436427563429,
+ "routers_loss": 0.005855285096913576,
"skip_count": 0.0,
"step": 1374,
"text_loss": 0.5123368501663208
@@ -13070,13 +13070,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09912109375,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009817749962596114,
- "loss": 0.0246,
+ "loss": 0.0249,
"macro_f1": 0.3272727429866791,
"num_tokens": 2219975.0,
"repeat_count": 1.0,
- "routers_loss": 0.06541594862937927,
+ "routers_loss": 0.0651634931564331,
"skip_count": 0.0,
"step": 1376,
"text_loss": 0.5999220609664917
@@ -13089,13 +13089,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.10498046875,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009816920995166568,
- "loss": 0.0376,
+ "loss": 0.0371,
"macro_f1": 0.6666666865348816,
"num_tokens": 2222833.0,
"repeat_count": 1.0,
- "routers_loss": 0.01156456395983696,
+ "routers_loss": 0.011408994905650616,
"skip_count": 0.0,
"step": 1378,
"text_loss": 0.5323230624198914
@@ -13108,13 +13108,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2392578125,
+ "grad_norm": 0.205078125,
"learning_rate": 0.0009816090181883807,
- "loss": 0.033,
+ "loss": 0.0313,
"macro_f1": 0.32098764181137085,
"num_tokens": 2225842.0,
"repeat_count": 0.0,
- "routers_loss": 0.05175521597266197,
+ "routers_loss": 0.039720915257930756,
"skip_count": 2.0,
"step": 1380,
"text_loss": 0.23363439738750458
@@ -13127,13 +13127,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009815257523066204,
- "loss": 0.0251,
+ "loss": 0.0249,
"macro_f1": 0.3333333432674408,
"num_tokens": 2229430.0,
"repeat_count": 0.0,
- "routers_loss": 0.002684591803699732,
+ "routers_loss": 0.002765297656878829,
"skip_count": 0.0,
"step": 1382,
"text_loss": 0.718977689743042
@@ -13146,13 +13146,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.12890625,
+ "grad_norm": 0.130859375,
"learning_rate": 0.0009814423019032835,
- "loss": 0.0397,
+ "loss": 0.0396,
"macro_f1": 0.5492662787437439,
"num_tokens": 2232594.0,
"repeat_count": 2.0,
- "routers_loss": 0.054509978741407394,
+ "routers_loss": 0.05362323671579361,
"skip_count": 0.0,
"step": 1384,
"text_loss": 0.6392166614532471
@@ -13165,13 +13165,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009813586670103483,
"loss": 0.0426,
"macro_f1": 0.6603773832321167,
"num_tokens": 2236327.0,
"repeat_count": 1.0,
- "routers_loss": 0.04031623527407646,
+ "routers_loss": 0.031728316098451614,
"skip_count": 1.0,
"step": 1386,
"text_loss": 0.5951619148254395
@@ -13184,13 +13184,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.126953125,
"learning_rate": 0.0009812748476598638,
- "loss": 0.0308,
+ "loss": 0.031,
"macro_f1": 0.5492662787437439,
"num_tokens": 2239746.0,
"repeat_count": 0.0,
- "routers_loss": 0.039687711745500565,
+ "routers_loss": 0.03981253132224083,
"skip_count": 2.0,
"step": 1388,
"text_loss": 0.22756551206111908
@@ -13203,13 +13203,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.12353515625,
+ "grad_norm": 0.12451171875,
"learning_rate": 0.0009811908438839498,
- "loss": 0.0329,
+ "loss": 0.0331,
"macro_f1": 0.5492662787437439,
"num_tokens": 2242786.0,
"repeat_count": 0.0,
- "routers_loss": 0.04785723611712456,
+ "routers_loss": 0.04617162421345711,
"skip_count": 2.0,
"step": 1390,
"text_loss": 0.3233799934387207
@@ -13222,13 +13222,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1298828125,
+ "grad_norm": 0.154296875,
"learning_rate": 0.000981106655714797,
- "loss": 0.0359,
+ "loss": 0.0358,
"macro_f1": 0.3272727429866791,
"num_tokens": 2245696.0,
"repeat_count": 0.0,
- "routers_loss": 0.046765491366386414,
+ "routers_loss": 0.046828847378492355,
"skip_count": 1.0,
"step": 1392,
"text_loss": 0.24273279309272766
@@ -13241,13 +13241,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009810222831846656,
- "loss": 0.0303,
+ "loss": 0.0307,
"macro_f1": 0.5492662787437439,
"num_tokens": 2249326.0,
"repeat_count": 0.0,
- "routers_loss": 0.015151665546000004,
+ "routers_loss": 0.010921589098870754,
"skip_count": 2.0,
"step": 1394,
"text_loss": 0.3921460807323456
@@ -13260,13 +13260,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009809377263258882,
- "loss": 0.0321,
+ "loss": 0.0315,
"macro_f1": 0.32098767161369324,
"num_tokens": 2253393.0,
"repeat_count": 0.0,
- "routers_loss": 0.04431106895208359,
+ "routers_loss": 0.04564022272825241,
"skip_count": 1.0,
"step": 1396,
"text_loss": 0.582602858543396
@@ -13279,13 +13279,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.103515625,
"learning_rate": 0.000980852985170867,
- "loss": 0.0317,
+ "loss": 0.0328,
"macro_f1": 0.3272727429866791,
"num_tokens": 2256626.0,
"repeat_count": 0.0,
- "routers_loss": 0.012700649909675121,
+ "routers_loss": 0.013289985246956348,
"skip_count": 0.0,
"step": 1398,
"text_loss": 0.41031694412231445
@@ -13298,13 +13298,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1591796875,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009807680597520745,
- "loss": 0.0256,
+ "loss": 0.0264,
"macro_f1": 0.3333333432674408,
"num_tokens": 2259326.0,
"repeat_count": 0.0,
- "routers_loss": 0.005919010378420353,
+ "routers_loss": 0.0065213534981012344,
"skip_count": 0.0,
"step": 1400,
"text_loss": 0.2888098657131195
@@ -13317,13 +13317,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2255859375,
+ "grad_norm": 0.23046875,
"learning_rate": 0.0009806829501020546,
- "loss": 0.0372,
+ "loss": 0.0358,
"macro_f1": 0.3272727429866791,
"num_tokens": 2262344.0,
"repeat_count": 0.0,
- "routers_loss": 0.04717765748500824,
+ "routers_loss": 0.04199840500950813,
"skip_count": 1.0,
"step": 1402,
"text_loss": 0.31973034143447876
@@ -13336,13 +13336,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009805976562534215,
"loss": 0.0317,
"macro_f1": 0.6603773832321167,
"num_tokens": 2266354.0,
"repeat_count": 1.0,
- "routers_loss": 0.015415813773870468,
+ "routers_loss": 0.015434930101037025,
"skip_count": 1.0,
"step": 1404,
"text_loss": 0.508630633354187
@@ -13355,13 +13355,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009805121782388599,
"loss": 0.0339,
"macro_f1": 0.6533333659172058,
"num_tokens": 2269660.0,
"repeat_count": 2.0,
- "routers_loss": 0.06812979280948639,
+ "routers_loss": 0.0720924660563469,
"skip_count": 2.0,
"step": 1406,
"text_loss": 0.40927737951278687
@@ -13374,13 +13374,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05908203125,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009804265160911253,
- "loss": 0.0265,
+ "loss": 0.0266,
"macro_f1": 0.5492662787437439,
"num_tokens": 2273335.0,
"repeat_count": 0.0,
- "routers_loss": 0.025383235886693,
+ "routers_loss": 0.02400495670735836,
"skip_count": 2.0,
"step": 1408,
"text_loss": 0.1777762621641159
@@ -13393,13 +13393,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1806640625,
+ "grad_norm": 0.2314453125,
"learning_rate": 0.0009803406698430433,
- "loss": 0.0367,
+ "loss": 0.0371,
"macro_f1": 0.3272727429866791,
"num_tokens": 2277107.0,
"repeat_count": 0.0,
- "routers_loss": 0.026493225246667862,
+ "routers_loss": 0.02560107782483101,
"skip_count": 1.0,
"step": 1410,
"text_loss": 0.17955881357192993
@@ -13412,13 +13412,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009802546395275104,
- "loss": 0.0342,
+ "loss": 0.0349,
"macro_f1": 0.3333333432674408,
"num_tokens": 2281638.0,
"repeat_count": 0.0,
- "routers_loss": 0.006616846192628145,
+ "routers_loss": 0.006655813194811344,
"skip_count": 0.0,
"step": 1412,
"text_loss": 0.20882295072078705
@@ -13431,32 +13431,32 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.000980168425177494,
- "loss": 0.0328,
+ "loss": 0.0342,
"macro_f1": 0.8200000524520874,
"num_tokens": 2284876.0,
"repeat_count": 1.0,
- "routers_loss": 0.060631848871707916,
+ "routers_loss": 0.06325097382068634,
"skip_count": 3.0,
"step": 1414,
"text_loss": 0.26035264134407043
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 6.648077487525683,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1494140625,
+ "grad_norm": 0.138671875,
"learning_rate": 0.000980082026826031,
- "loss": 0.0317,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0315,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2288938.0,
"repeat_count": 1.0,
- "routers_loss": 0.011199389584362507,
+ "routers_loss": 0.013436575420200825,
"skip_count": 0.0,
"step": 1416,
"text_loss": 0.5502325892448425
@@ -13469,13 +13469,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0009799954445062296,
- "loss": 0.0192,
+ "loss": 0.0193,
"macro_f1": 0.6603773832321167,
"num_tokens": 2292317.0,
"repeat_count": 1.0,
- "routers_loss": 0.01120354700833559,
+ "routers_loss": 0.011264479719102383,
"skip_count": 1.0,
"step": 1418,
"text_loss": 0.48075684905052185
@@ -13488,13 +13488,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.16796875,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009799086782512686,
- "loss": 0.0294,
+ "loss": 0.0292,
"macro_f1": 0.5492662787437439,
"num_tokens": 2295935.0,
"repeat_count": 0.0,
- "routers_loss": 0.030204148963093758,
+ "routers_loss": 0.02833271212875843,
"skip_count": 2.0,
"step": 1420,
"text_loss": 0.18221206963062286
@@ -13507,13 +13507,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.09375,
"learning_rate": 0.0009798217280943967,
- "loss": 0.0348,
+ "loss": 0.0356,
"macro_f1": 0.6666666865348816,
"num_tokens": 2298927.0,
"repeat_count": 0.0,
- "routers_loss": 0.008244800381362438,
+ "routers_loss": 0.009208574891090393,
"skip_count": 1.0,
"step": 1422,
"text_loss": 0.48686322569847107
@@ -13526,32 +13526,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0009797345940689335,
- "loss": 0.0269,
+ "loss": 0.0267,
"macro_f1": 0.3272727429866791,
"num_tokens": 2301541.0,
"repeat_count": 0.0,
- "routers_loss": 0.015340043231844902,
+ "routers_loss": 0.015011847950518131,
"skip_count": 0.0,
"step": 1424,
"text_loss": 0.49446266889572144
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6000000238418579,
- "avg_layers": 25.0,
+ "acc_skip": 0.4000000059604645,
+ "avg_layers": 26.0,
"epoch": 6.695039624302906,
- "f1_execute": 0.9583333134651184,
+ "f1_execute": 0.9387754797935486,
"f1_repeat": 0.0,
- "f1_skip": 0.75,
- "grad_norm": 0.1318359375,
+ "f1_skip": 0.5714285969734192,
+ "grad_norm": 0.1337890625,
"learning_rate": 0.0009796472762082687,
- "loss": 0.0341,
- "macro_f1": 0.5694444179534912,
+ "loss": 0.0338,
+ "macro_f1": 0.5034013986587524,
"num_tokens": 2304589.0,
"repeat_count": 0.0,
- "routers_loss": 0.058681465685367584,
+ "routers_loss": 0.05912091210484505,
"skip_count": 5.0,
"step": 1426,
"text_loss": 0.23945684731006622
@@ -13564,32 +13564,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.09765625,
"learning_rate": 0.000979559774545863,
- "loss": 0.0423,
+ "loss": 0.0405,
"macro_f1": 0.3272727429866791,
"num_tokens": 2307860.0,
"repeat_count": 0.0,
- "routers_loss": 0.020810559391975403,
+ "routers_loss": 0.021242303773760796,
"skip_count": 1.0,
"step": 1428,
"text_loss": 0.531273365020752
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 6.713824479013795,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.09033203125,
+ "f1_skip": 0.0,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.000979472089115247,
- "loss": 0.0268,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0276,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 2311581.0,
"repeat_count": 0.0,
- "routers_loss": 0.030001837760210037,
+ "routers_loss": 0.02768544852733612,
"skip_count": 2.0,
"step": 1430,
"text_loss": 0.2497459501028061
@@ -13602,13 +13602,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1318359375,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.000979384219950022,
- "loss": 0.034,
+ "loss": 0.0346,
"macro_f1": 0.3333333432674408,
"num_tokens": 2314639.0,
"repeat_count": 0.0,
- "routers_loss": 0.010381575673818588,
+ "routers_loss": 0.008678150363266468,
"skip_count": 0.0,
"step": 1432,
"text_loss": 0.6579355001449585
@@ -13621,32 +13621,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.08056640625,
"learning_rate": 0.0009792961670838595,
- "loss": 0.0365,
+ "loss": 0.0362,
"macro_f1": 0.3272727429866791,
"num_tokens": 2317927.0,
"repeat_count": 1.0,
- "routers_loss": 0.03234704211354256,
+ "routers_loss": 0.03325597569346428,
"skip_count": 0.0,
"step": 1434,
"text_loss": 0.5209436416625977
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 6.742001761080129,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9818181991577148,
+ "f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009792079305505016,
- "loss": 0.0303,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0306,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2321065.0,
"repeat_count": 1.0,
- "routers_loss": 0.015481291338801384,
+ "routers_loss": 0.019228918477892876,
"skip_count": 0.0,
"step": 1436,
"text_loss": 0.41087067127227783
@@ -13659,13 +13659,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.000979119510383761,
- "loss": 0.0366,
+ "loss": 0.0371,
"macro_f1": 0.3333333432674408,
"num_tokens": 2323714.0,
"repeat_count": 0.0,
- "routers_loss": 0.018170451745390892,
+ "routers_loss": 0.017071325331926346,
"skip_count": 0.0,
"step": 1438,
"text_loss": 0.21490029990673065
@@ -13678,13 +13678,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.162109375,
+ "grad_norm": 0.2060546875,
"learning_rate": 0.00097903090661752,
- "loss": 0.0306,
+ "loss": 0.0309,
"macro_f1": 0.3333333432674408,
"num_tokens": 2326454.0,
"repeat_count": 0.0,
- "routers_loss": 0.010385681875050068,
+ "routers_loss": 0.00991755723953247,
"skip_count": 0.0,
"step": 1440,
"text_loss": 0.23847346007823944
@@ -13697,13 +13697,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.232421875,
"learning_rate": 0.000978942119285732,
- "loss": 0.0407,
+ "loss": 0.0404,
"macro_f1": 0.3272727429866791,
"num_tokens": 2329462.0,
"repeat_count": 0.0,
- "routers_loss": 0.04976538568735123,
+ "routers_loss": 0.04908733069896698,
"skip_count": 1.0,
"step": 1442,
"text_loss": 0.23343028128147125
@@ -13716,13 +13716,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009788531484224204,
- "loss": 0.0255,
+ "loss": 0.0264,
"macro_f1": 0.3333333432674408,
"num_tokens": 2332146.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030266831163316965,
+ "routers_loss": 0.0032628148328512907,
"skip_count": 0.0,
"step": 1444,
"text_loss": 0.47423800826072693
@@ -13730,18 +13730,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.3333333432674408,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 6.788963897857353,
- "f1_execute": 0.9600000381469727,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 0.5,
- "grad_norm": 0.107421875,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009787639940616788,
- "loss": 0.0411,
- "macro_f1": 0.8200000524520874,
+ "loss": 0.0405,
+ "macro_f1": 0.7018141150474548,
"num_tokens": 2335738.0,
"repeat_count": 1.0,
- "routers_loss": 0.13420957326889038,
+ "routers_loss": 0.14336998760700226,
"skip_count": 3.0,
"step": 1446,
"text_loss": 0.21837592124938965
@@ -13754,13 +13754,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1953125,
+ "grad_norm": 0.189453125,
"learning_rate": 0.0009786746562376717,
- "loss": 0.0251,
+ "loss": 0.0241,
"macro_f1": 0.6666666865348816,
"num_tokens": 2338488.0,
"repeat_count": 0.0,
- "routers_loss": 0.012779864482581615,
+ "routers_loss": 0.010542908683419228,
"skip_count": 1.0,
"step": 1448,
"text_loss": 1.0614757537841797
@@ -13773,13 +13773,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009785851349846334,
- "loss": 0.0266,
+ "loss": 0.0268,
"macro_f1": 0.3333333432674408,
"num_tokens": 2342074.0,
"repeat_count": 0.0,
- "routers_loss": 0.005545398220419884,
+ "routers_loss": 0.005998016335070133,
"skip_count": 0.0,
"step": 1450,
"text_loss": 0.4269719421863556
@@ -13792,13 +13792,13 @@
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009784954303368686,
- "loss": 0.0395,
+ "loss": 0.0384,
"macro_f1": 0.44705885648727417,
"num_tokens": 2345838.0,
"repeat_count": 0.0,
- "routers_loss": 0.0899835154414177,
+ "routers_loss": 0.0959126204252243,
"skip_count": 3.0,
"step": 1452,
"text_loss": 0.3315916955471039
@@ -13811,13 +13811,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.1005859375,
"learning_rate": 0.0009784055423287521,
"loss": 0.0218,
"macro_f1": 0.3333333432674408,
"num_tokens": 2348939.0,
"repeat_count": 0.0,
- "routers_loss": 0.002738836221396923,
+ "routers_loss": 0.0025467623490840197,
"skip_count": 0.0,
"step": 1454,
"text_loss": 0.6162732839584351
@@ -13830,13 +13830,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12060546875,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009783154709947293,
- "loss": 0.0266,
+ "loss": 0.0256,
"macro_f1": 0.3272727429866791,
"num_tokens": 2352232.0,
"repeat_count": 0.0,
- "routers_loss": 0.020522192120552063,
+ "routers_loss": 0.01860538125038147,
"skip_count": 1.0,
"step": 1456,
"text_loss": 0.23928768932819366
@@ -13844,18 +13844,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 6.84531846199002,
- "f1_execute": 0.9629629850387573,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009782252163693158,
- "loss": 0.0197,
- "macro_f1": 0.32098767161369324,
+ "loss": 0.0201,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2355159.0,
"repeat_count": 0.0,
- "routers_loss": 0.04245268926024437,
+ "routers_loss": 0.04412713274359703,
"skip_count": 1.0,
"step": 1458,
"text_loss": 0.3371323347091675
@@ -13868,13 +13868,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.224609375,
+ "grad_norm": 0.21484375,
"learning_rate": 0.0009781347784870973,
- "loss": 0.0376,
+ "loss": 0.0379,
"macro_f1": 0.3333333432674408,
"num_tokens": 2358175.0,
"repeat_count": 0.0,
- "routers_loss": 0.009142685681581497,
+ "routers_loss": 0.006809141952544451,
"skip_count": 0.0,
"step": 1460,
"text_loss": 0.547267735004425
@@ -13887,13 +13887,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009780441573827296,
- "loss": 0.0295,
+ "loss": 0.03,
"macro_f1": 0.3076923191547394,
"num_tokens": 2360991.0,
"repeat_count": 0.0,
- "routers_loss": 0.08038893342018127,
+ "routers_loss": 0.08924390375614166,
"skip_count": 4.0,
"step": 1462,
"text_loss": 0.7026563882827759
@@ -13906,13 +13906,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.1865234375,
"learning_rate": 0.000977953353090939,
- "loss": 0.027,
+ "loss": 0.0272,
"macro_f1": 0.3333333432674408,
"num_tokens": 2363894.0,
"repeat_count": 0.0,
- "routers_loss": 0.02107175625860691,
+ "routers_loss": 0.021858472377061844,
"skip_count": 0.0,
"step": 1464,
"text_loss": 0.2718065083026886
@@ -13925,13 +13925,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0009778623656465219,
- "loss": 0.0349,
+ "loss": 0.0338,
"macro_f1": 0.32098764181137085,
"num_tokens": 2367265.0,
"repeat_count": 0.0,
- "routers_loss": 0.042030055075883865,
+ "routers_loss": 0.044781096279621124,
"skip_count": 0.0,
"step": 1466,
"text_loss": 0.5008095502853394
@@ -13944,13 +13944,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009777711950843448,
- "loss": 0.022,
+ "loss": 0.0212,
"macro_f1": 0.3333333432674408,
"num_tokens": 2370186.0,
"repeat_count": 0.0,
- "routers_loss": 0.004230673424899578,
+ "routers_loss": 0.0040459707379341125,
"skip_count": 0.0,
"step": 1468,
"text_loss": 0.5242461562156677
@@ -13963,13 +13963,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009776798414393446,
- "loss": 0.0284,
+ "loss": 0.0279,
"macro_f1": 0.6598639488220215,
"num_tokens": 2373314.0,
"repeat_count": 1.0,
- "routers_loss": 0.06986775249242783,
+ "routers_loss": 0.0708528608083725,
"skip_count": 3.0,
"step": 1470,
"text_loss": 0.2821732461452484
@@ -13982,13 +13982,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.142578125,
+ "grad_norm": 0.1328125,
"learning_rate": 0.0009775883047465279,
- "loss": 0.0431,
+ "loss": 0.0414,
"macro_f1": 0.31446540355682373,
"num_tokens": 2376435.0,
"repeat_count": 1.0,
- "routers_loss": 0.0439564548432827,
+ "routers_loss": 0.0290578193962574,
"skip_count": 1.0,
"step": 1472,
"text_loss": 0.8438440561294556
@@ -14001,13 +14001,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.10546875,
"learning_rate": 0.000977496585040972,
- "loss": 0.0376,
+ "loss": 0.0373,
"macro_f1": 0.3333333432674408,
"num_tokens": 2380244.0,
"repeat_count": 0.0,
- "routers_loss": 0.011889892630279064,
+ "routers_loss": 0.010360375046730042,
"skip_count": 0.0,
"step": 1474,
"text_loss": 0.4356135427951813
@@ -14020,13 +14020,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.000977404682357824,
- "loss": 0.0295,
+ "loss": 0.0294,
"macro_f1": 0.3272727429866791,
"num_tokens": 2383498.0,
"repeat_count": 0.0,
- "routers_loss": 0.022536326199769974,
+ "routers_loss": 0.023518972098827362,
"skip_count": 0.0,
"step": 1476,
"text_loss": 0.25195425748825073
@@ -14039,13 +14039,13 @@
"f1_execute": 0.9743589162826538,
"f1_repeat": 0.888888955116272,
"f1_skip": 1.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.000977312596732301,
- "loss": 0.0388,
+ "loss": 0.0375,
"macro_f1": 0.9544159770011902,
"num_tokens": 2386414.0,
"repeat_count": 5.0,
- "routers_loss": 0.07959948480129242,
+ "routers_loss": 0.08190606534481049,
"skip_count": 4.0,
"step": 1478,
"text_loss": 0.6586798429489136
@@ -14058,13 +14058,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.10546875,
"learning_rate": 0.0009772203281996905,
- "loss": 0.0341,
+ "loss": 0.0336,
"macro_f1": 1.0,
"num_tokens": 2389399.0,
"repeat_count": 1.0,
- "routers_loss": 0.019112225621938705,
+ "routers_loss": 0.016441475600004196,
"skip_count": 2.0,
"step": 1480,
"text_loss": 0.3671986758708954
@@ -14077,13 +14077,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0888671875,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009771278767953502,
- "loss": 0.0345,
+ "loss": 0.0357,
"macro_f1": 0.3333333432674408,
"num_tokens": 2392400.0,
"repeat_count": 0.0,
- "routers_loss": 0.018750866875052452,
+ "routers_loss": 0.019211363047361374,
"skip_count": 0.0,
"step": 1482,
"text_loss": 0.27418580651283264
@@ -14096,32 +14096,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09228515625,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009770352425547072,
- "loss": 0.0291,
+ "loss": 0.0292,
"macro_f1": 0.3333333432674408,
"num_tokens": 2395123.0,
"repeat_count": 0.0,
- "routers_loss": 0.015407348051667213,
+ "routers_loss": 0.015800386667251587,
"skip_count": 0.0,
"step": 1484,
"text_loss": 0.19896622002124786
},
{
- "acc_repeat": 0.6666666865348816,
+ "acc_repeat": 0.3333333432674408,
"acc_skip": 0.0,
- "avg_layers": 30.0,
+ "avg_layers": 29.0,
"epoch": 6.976812444966246,
- "f1_execute": 0.9803921580314636,
- "f1_repeat": 0.800000011920929,
+ "f1_execute": 0.9615384340286255,
+ "f1_repeat": 0.5,
"f1_skip": 0.0,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009769424255132596,
- "loss": 0.0258,
- "macro_f1": 0.5934640765190125,
+ "loss": 0.0256,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 2397359.0,
"repeat_count": 3.0,
- "routers_loss": 0.06514479219913483,
+ "routers_loss": 0.06670158356428146,
"skip_count": 0.0,
"step": 1486,
"text_loss": 0.4229799509048462
@@ -14134,13 +14134,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.111328125,
+ "grad_norm": 0.1162109375,
"learning_rate": 0.0009768494257065747,
- "loss": 0.0217,
+ "loss": 0.0218,
"macro_f1": 0.3272727429866791,
"num_tokens": 2400387.0,
"repeat_count": 0.0,
- "routers_loss": 0.013567833229899406,
+ "routers_loss": 0.011144762858748436,
"skip_count": 1.0,
"step": 1488,
"text_loss": 0.4264226257801056
@@ -14153,13 +14153,13 @@
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12255859375,
+ "grad_norm": 0.12353515625,
"learning_rate": 0.0009767562431702904,
- "loss": 0.0389,
+ "loss": 0.0387,
"macro_f1": 0.3006536364555359,
"num_tokens": 2403241.0,
"repeat_count": 2.0,
- "routers_loss": 0.13762018084526062,
+ "routers_loss": 0.12339717149734497,
"skip_count": 3.0,
"step": 1490,
"text_loss": 0.2850193977355957
@@ -14172,13 +14172,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0009766628779401142,
- "loss": 0.0214,
+ "loss": 0.0215,
"macro_f1": 0.6666666865348816,
"num_tokens": 2406087.0,
"repeat_count": 0.0,
- "routers_loss": 0.008640666492283344,
+ "routers_loss": 0.008174685761332512,
"skip_count": 1.0,
"step": 1492,
"text_loss": 0.6756544709205627
@@ -14191,13 +14191,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05712890625,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.000976569330051824,
- "loss": 0.0182,
+ "loss": 0.0186,
"macro_f1": 0.3333333432674408,
"num_tokens": 2409312.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018257038900628686,
+ "routers_loss": 0.0021256296895444393,
"skip_count": 0.0,
"step": 1494,
"text_loss": 0.4789894223213196
@@ -14210,13 +14210,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0009764755995412677,
"loss": 0.0193,
"macro_f1": 0.3333333432674408,
"num_tokens": 2412758.0,
"repeat_count": 0.0,
- "routers_loss": 0.003656312357634306,
+ "routers_loss": 0.003944927826523781,
"skip_count": 0.0,
"step": 1496,
"text_loss": 0.5157490968704224
@@ -14229,13 +14229,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009763816864443627,
- "loss": 0.0246,
+ "loss": 0.0239,
"macro_f1": 0.3272727429866791,
"num_tokens": 2416079.0,
"repeat_count": 1.0,
- "routers_loss": 0.044268425554037094,
+ "routers_loss": 0.03893325850367546,
"skip_count": 0.0,
"step": 1498,
"text_loss": 0.28045418858528137
@@ -14248,13 +14248,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1279296875,
"learning_rate": 0.0009762875907970968,
- "loss": 0.0207,
+ "loss": 0.0199,
"macro_f1": 0.3333333432674408,
"num_tokens": 2420340.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018966116476804018,
+ "routers_loss": 0.0017725443467497826,
"skip_count": 0.0,
"step": 1500,
"text_loss": 0.35550856590270996
@@ -14267,32 +14267,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0009761933126355277,
- "loss": 0.0249,
+ "loss": 0.0245,
"macro_f1": 0.3272727429866791,
"num_tokens": 2424735.0,
"repeat_count": 0.0,
- "routers_loss": 0.01729201152920723,
+ "routers_loss": 0.01393749937415123,
"skip_count": 1.0,
"step": 1502,
"text_loss": 0.38840189576148987
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 7.06105077781039,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.11962890625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009760988519957828,
- "loss": 0.0248,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0249,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 2428132.0,
"repeat_count": 0.0,
- "routers_loss": 0.01693531684577465,
+ "routers_loss": 0.01687910407781601,
"skip_count": 2.0,
"step": 1504,
"text_loss": 0.3031681478023529
@@ -14305,13 +14305,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0009760042089140598,
- "loss": 0.0197,
+ "loss": 0.0193,
"macro_f1": 0.3144654333591461,
"num_tokens": 2431592.0,
"repeat_count": 1.0,
- "routers_loss": 0.04939094930887222,
+ "routers_loss": 0.04704280197620392,
"skip_count": 2.0,
"step": 1506,
"text_loss": 0.16355200111865997
@@ -14324,13 +14324,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009759093834266259,
- "loss": 0.0213,
+ "loss": 0.0206,
"macro_f1": 0.3333333432674408,
"num_tokens": 2434236.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016892930725589395,
+ "routers_loss": 0.0016075772000476718,
"skip_count": 0.0,
"step": 1508,
"text_loss": 0.6080073118209839
@@ -14343,13 +14343,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009758143755698186,
- "loss": 0.0147,
+ "loss": 0.015,
"macro_f1": 0.3333333432674408,
"num_tokens": 2437170.0,
"repeat_count": 0.0,
- "routers_loss": 0.008671467192471027,
+ "routers_loss": 0.008451299741864204,
"skip_count": 0.0,
"step": 1510,
"text_loss": 0.22100484371185303
@@ -14362,13 +14362,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009757191853800449,
- "loss": 0.0228,
+ "loss": 0.0227,
"macro_f1": 0.5866667032241821,
"num_tokens": 2441187.0,
"repeat_count": 1.0,
- "routers_loss": 0.042682576924562454,
+ "routers_loss": 0.046565692871809006,
"skip_count": 3.0,
"step": 1512,
"text_loss": 0.25098952651023865
@@ -14381,13 +14381,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.11279296875,
"learning_rate": 0.000975623812893782,
- "loss": 0.028,
+ "loss": 0.0276,
"macro_f1": 0.3272727429866791,
"num_tokens": 2444664.0,
"repeat_count": 0.0,
- "routers_loss": 0.02905822917819023,
+ "routers_loss": 0.02872578240931034,
"skip_count": 1.0,
"step": 1514,
"text_loss": 0.4952253997325897
@@ -14400,13 +14400,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09326171875,
+ "grad_norm": 0.1142578125,
"learning_rate": 0.0009755282581475768,
- "loss": 0.0223,
+ "loss": 0.0233,
"macro_f1": 0.3333333432674408,
"num_tokens": 2447748.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018810008186846972,
+ "routers_loss": 0.002055214950814843,
"skip_count": 0.0,
"step": 1516,
"text_loss": 0.7465500831604004
@@ -14419,13 +14419,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.10302734375,
"learning_rate": 0.000975432521178046,
- "loss": 0.0219,
+ "loss": 0.0216,
"macro_f1": 0.3272727429866791,
"num_tokens": 2450834.0,
"repeat_count": 1.0,
- "routers_loss": 0.04308714717626572,
+ "routers_loss": 0.04498551785945892,
"skip_count": 0.0,
"step": 1518,
"text_loss": 0.28144413232803345
@@ -14438,13 +14438,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009753366020218763,
- "loss": 0.0232,
+ "loss": 0.0234,
"macro_f1": 0.3333333432674408,
"num_tokens": 2454233.0,
"repeat_count": 0.0,
- "routers_loss": 0.003754811594262719,
+ "routers_loss": 0.003669742727652192,
"skip_count": 0.0,
"step": 1520,
"text_loss": 0.5667551755905151
@@ -14457,32 +14457,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009752405007158238,
- "loss": 0.0246,
+ "loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 2457331.0,
"repeat_count": 0.0,
- "routers_loss": 0.010853761807084084,
+ "routers_loss": 0.010455607436597347,
"skip_count": 0.0,
"step": 1522,
"text_loss": 0.19575810432434082
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.5,
"acc_skip": 1.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 7.154975051364837,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0009751442172967151,
- "loss": 0.0196,
- "macro_f1": 1.0,
+ "loss": 0.0193,
+ "macro_f1": 0.8823530077934265,
"num_tokens": 2459935.0,
"repeat_count": 2.0,
- "routers_loss": 0.015100379474461079,
+ "routers_loss": 0.025189083069562912,
"skip_count": 1.0,
"step": 1524,
"text_loss": 0.45453405380249023
@@ -14495,13 +14495,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.000975047751801446,
- "loss": 0.0189,
+ "loss": 0.0187,
"macro_f1": 0.3272727429866791,
"num_tokens": 2463008.0,
"repeat_count": 0.0,
- "routers_loss": 0.011991916224360466,
+ "routers_loss": 0.012297490611672401,
"skip_count": 0.0,
"step": 1526,
"text_loss": 0.31437572836875916
@@ -14514,32 +14514,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009749511042669823,
- "loss": 0.0226,
+ "loss": 0.0233,
"macro_f1": 0.3333333432674408,
"num_tokens": 2466475.0,
"repeat_count": 0.0,
- "routers_loss": 0.008201062679290771,
+ "routers_loss": 0.011026266030967236,
"skip_count": 0.0,
"step": 1528,
"text_loss": 0.46604859828948975
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 7.183152333431171,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.1181640625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1376953125,
"learning_rate": 0.0009748542747303595,
- "loss": 0.0174,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0182,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2469320.0,
"repeat_count": 0.0,
- "routers_loss": 0.008513177745044231,
+ "routers_loss": 0.011934996582567692,
"skip_count": 1.0,
"step": 1530,
"text_loss": 0.7764923572540283
@@ -14552,13 +14552,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.0966796875,
"learning_rate": 0.0009747572632286827,
- "loss": 0.02,
+ "loss": 0.0203,
"macro_f1": 0.3333333432674408,
"num_tokens": 2472468.0,
"repeat_count": 0.0,
- "routers_loss": 0.004850955214351416,
+ "routers_loss": 0.005786920432001352,
"skip_count": 0.0,
"step": 1532,
"text_loss": 0.3555782437324524
@@ -14571,32 +14571,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.087890625,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009746600697991271,
- "loss": 0.0206,
+ "loss": 0.02,
"macro_f1": 0.6666666865348816,
"num_tokens": 2475736.0,
"repeat_count": 1.0,
- "routers_loss": 0.0027650354895740747,
+ "routers_loss": 0.0026990731712430716,
"skip_count": 0.0,
"step": 1534,
"text_loss": 0.49561792612075806
},
{
"acc_repeat": 1.0,
- "acc_skip": 0.0,
- "avg_layers": 29.0,
+ "acc_skip": 0.5,
+ "avg_layers": 28.0,
"epoch": 7.2113296154975055,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
- "f1_skip": 0.0,
- "grad_norm": 0.0615234375,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.0556640625,
"learning_rate": 0.0009745626944789375,
- "loss": 0.0209,
- "macro_f1": 0.6538461446762085,
+ "loss": 0.0204,
+ "macro_f1": 0.8823530077934265,
"num_tokens": 2478887.0,
"repeat_count": 1.0,
- "routers_loss": 0.023268593475222588,
+ "routers_loss": 0.020221207290887833,
"skip_count": 2.0,
"step": 1536,
"text_loss": 0.5375416278839111
@@ -14609,13 +14609,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11669921875,
+ "grad_norm": 0.12158203125,
"learning_rate": 0.0009744651373054279,
"loss": 0.0286,
"macro_f1": 0.3272727429866791,
"num_tokens": 2481293.0,
"repeat_count": 0.0,
- "routers_loss": 0.031235001981258392,
+ "routers_loss": 0.03131086751818657,
"skip_count": 1.0,
"step": 1538,
"text_loss": 0.5241039395332336
@@ -14628,13 +14628,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009743673983159828,
- "loss": 0.023,
+ "loss": 0.0241,
"macro_f1": 0.6122449040412903,
"num_tokens": 2484403.0,
"repeat_count": 0.0,
- "routers_loss": 0.042398080229759216,
+ "routers_loss": 0.04448170214891434,
"skip_count": 4.0,
"step": 1540,
"text_loss": 0.7465724349021912
@@ -14647,13 +14647,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009742694775480557,
- "loss": 0.0268,
+ "loss": 0.0265,
"macro_f1": 0.6666666865348816,
"num_tokens": 2487952.0,
"repeat_count": 0.0,
- "routers_loss": 0.007361465133726597,
+ "routers_loss": 0.007171491626650095,
"skip_count": 1.0,
"step": 1542,
"text_loss": 0.2877117097377777
@@ -14666,13 +14666,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009741713750391703,
- "loss": 0.0166,
+ "loss": 0.0171,
"macro_f1": 0.6666666865348816,
"num_tokens": 2490815.0,
"repeat_count": 1.0,
- "routers_loss": 0.0052334014326334,
+ "routers_loss": 0.004559285007417202,
"skip_count": 0.0,
"step": 1544,
"text_loss": 0.6097800135612488
@@ -14685,13 +14685,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.06787109375,
"learning_rate": 0.0009740730908269193,
"loss": 0.0174,
"macro_f1": 0.3333333432674408,
"num_tokens": 2494727.0,
"repeat_count": 0.0,
- "routers_loss": 0.004993532784283161,
+ "routers_loss": 0.005271553061902523,
"skip_count": 0.0,
"step": 1546,
"text_loss": 0.5431114435195923
@@ -14704,13 +14704,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009739746249489658,
- "loss": 0.0248,
+ "loss": 0.0239,
"macro_f1": 0.3333333432674408,
"num_tokens": 2499266.0,
"repeat_count": 0.0,
- "routers_loss": 0.001611889572814107,
+ "routers_loss": 0.0015409323386847973,
"skip_count": 0.0,
"step": 1548,
"text_loss": 0.4702678322792053
@@ -14723,13 +14723,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.1171875,
"learning_rate": 0.0009738759774430417,
- "loss": 0.0209,
+ "loss": 0.0216,
"macro_f1": 0.32098764181137085,
"num_tokens": 2502273.0,
"repeat_count": 1.0,
- "routers_loss": 0.03059260919690132,
+ "routers_loss": 0.030183158814907074,
"skip_count": 1.0,
"step": 1550,
"text_loss": 0.3239189088344574
@@ -14742,32 +14742,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056396484375,
+ "grad_norm": 0.0498046875,
"learning_rate": 0.0009737771483469493,
- "loss": 0.0195,
+ "loss": 0.0196,
"macro_f1": 0.3333333432674408,
"num_tokens": 2507624.0,
"repeat_count": 0.0,
- "routers_loss": 0.00508903618901968,
+ "routers_loss": 0.005410848651081324,
"skip_count": 0.0,
"step": 1552,
"text_loss": 0.4014642834663391
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 7.295861461696507,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
+ "f1_skip": 1.0,
"grad_norm": 0.07763671875,
"learning_rate": 0.0009736781376985598,
- "loss": 0.0174,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0168,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 2510366.0,
"repeat_count": 0.0,
- "routers_loss": 0.007860450074076653,
+ "routers_loss": 0.0066976165398955345,
"skip_count": 1.0,
"step": 1554,
"text_loss": 0.5924848914146423
@@ -14780,13 +14780,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11669921875,
+ "grad_norm": 0.13671875,
"learning_rate": 0.0009735789455358144,
- "loss": 0.0217,
+ "loss": 0.022,
"macro_f1": 0.3333333432674408,
"num_tokens": 2513317.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027370608877390623,
+ "routers_loss": 0.002763477386906743,
"skip_count": 0.0,
"step": 1556,
"text_loss": 0.3222943842411041
@@ -14799,13 +14799,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.11767578125,
"learning_rate": 0.0009734795718967237,
- "loss": 0.0276,
+ "loss": 0.0283,
"macro_f1": 0.32098764181137085,
"num_tokens": 2516628.0,
"repeat_count": 0.0,
- "routers_loss": 0.061584725975990295,
+ "routers_loss": 0.061566028743982315,
"skip_count": 2.0,
"step": 1558,
"text_loss": 0.3249334692955017
@@ -14818,13 +14818,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009733800168193679,
"loss": 0.0228,
"macro_f1": 1.0,
"num_tokens": 2519424.0,
"repeat_count": 2.0,
- "routers_loss": 0.01694316789507866,
+ "routers_loss": 0.017976421862840652,
"skip_count": 4.0,
"step": 1560,
"text_loss": 0.3341919481754303
@@ -14837,13 +14837,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1826171875,
"learning_rate": 0.0009732802803418966,
- "loss": 0.0234,
+ "loss": 0.023,
"macro_f1": 0.3333333432674408,
"num_tokens": 2522922.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023331891279667616,
+ "routers_loss": 0.002525332849472761,
"skip_count": 0.0,
"step": 1562,
"text_loss": 0.3176332712173462
@@ -14856,13 +14856,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.07861328125,
"learning_rate": 0.0009731803625025292,
- "loss": 0.0203,
+ "loss": 0.0196,
"macro_f1": 0.3272727429866791,
"num_tokens": 2525811.0,
"repeat_count": 0.0,
- "routers_loss": 0.021300682798027992,
+ "routers_loss": 0.015524424612522125,
"skip_count": 1.0,
"step": 1564,
"text_loss": 0.532774031162262
@@ -14875,13 +14875,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.10205078125,
"learning_rate": 0.0009730802633395541,
- "loss": 0.026,
+ "loss": 0.0257,
"macro_f1": 0.6603773832321167,
"num_tokens": 2529157.0,
"repeat_count": 1.0,
- "routers_loss": 0.08335043489933014,
+ "routers_loss": 0.08138631284236908,
"skip_count": 1.0,
"step": 1566,
"text_loss": 0.529487133026123
@@ -14894,13 +14894,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009729799828913298,
- "loss": 0.0224,
+ "loss": 0.0223,
"macro_f1": 0.3333333432674408,
"num_tokens": 2532249.0,
"repeat_count": 0.0,
- "routers_loss": 0.003535634372383356,
+ "routers_loss": 0.0035867292899638414,
"skip_count": 0.0,
"step": 1568,
"text_loss": 0.503160297870636
@@ -14913,13 +14913,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009728795211962838,
"loss": 0.0259,
"macro_f1": 0.5492662787437439,
"num_tokens": 2535904.0,
"repeat_count": 0.0,
- "routers_loss": 0.025729363784193993,
+ "routers_loss": 0.02987455204129219,
"skip_count": 2.0,
"step": 1570,
"text_loss": 0.9170270562171936
@@ -14932,13 +14932,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.11865234375,
"learning_rate": 0.0009727788782929131,
- "loss": 0.0287,
+ "loss": 0.0273,
"macro_f1": 0.3272727429866791,
"num_tokens": 2538943.0,
"repeat_count": 1.0,
- "routers_loss": 0.059166863560676575,
+ "routers_loss": 0.04676021635532379,
"skip_count": 0.0,
"step": 1572,
"text_loss": 0.29146310687065125
@@ -14951,13 +14951,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0009726780542197844,
- "loss": 0.0173,
+ "loss": 0.0169,
"macro_f1": 0.3333333432674408,
"num_tokens": 2541805.0,
"repeat_count": 0.0,
- "routers_loss": 0.002580022206529975,
+ "routers_loss": 0.002127803163602948,
"skip_count": 0.0,
"step": 1574,
"text_loss": 1.0126502513885498
@@ -14970,13 +14970,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.138671875,
+ "grad_norm": 0.142578125,
"learning_rate": 0.0009725770490155338,
- "loss": 0.0257,
+ "loss": 0.0262,
"macro_f1": 0.3333333432674408,
"num_tokens": 2546213.0,
"repeat_count": 0.0,
- "routers_loss": 0.007746981456875801,
+ "routers_loss": 0.007609677035361528,
"skip_count": 0.0,
"step": 1576,
"text_loss": 0.190168559551239
@@ -14989,13 +14989,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.083984375,
"learning_rate": 0.0009724758627188665,
- "loss": 0.0344,
+ "loss": 0.0356,
"macro_f1": 0.3272727429866791,
"num_tokens": 2549554.0,
"repeat_count": 0.0,
- "routers_loss": 0.027308562770485878,
+ "routers_loss": 0.033554721623659134,
"skip_count": 1.0,
"step": 1578,
"text_loss": 0.2977406084537506
@@ -15008,13 +15008,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.14453125,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009723744953685572,
- "loss": 0.0277,
+ "loss": 0.028,
"macro_f1": 0.3272727429866791,
"num_tokens": 2552785.0,
"repeat_count": 1.0,
- "routers_loss": 0.029863199219107628,
+ "routers_loss": 0.027864238247275352,
"skip_count": 0.0,
"step": 1580,
"text_loss": 0.2700682580471039
@@ -15027,13 +15027,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.19921875,
"learning_rate": 0.0009722729470034503,
- "loss": 0.0218,
+ "loss": 0.0224,
"macro_f1": 0.3333333432674408,
"num_tokens": 2556550.0,
"repeat_count": 0.0,
- "routers_loss": 0.004019706044346094,
+ "routers_loss": 0.004798175301402807,
"skip_count": 0.0,
"step": 1582,
"text_loss": 0.6559903025627136
@@ -15046,32 +15046,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07177734375,
+ "grad_norm": 0.078125,
"learning_rate": 0.0009721712176624591,
- "loss": 0.0239,
+ "loss": 0.0242,
"macro_f1": 0.3333333432674408,
"num_tokens": 2559862.0,
"repeat_count": 0.0,
- "routers_loss": 0.014162382110953331,
+ "routers_loss": 0.013764148578047752,
"skip_count": 0.0,
"step": 1584,
"text_loss": 0.2257535308599472
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.5,
+ "avg_layers": 27.0,
"epoch": 7.446140299383622,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.10986328125,
"learning_rate": 0.0009720693073845667,
- "loss": 0.0338,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.032,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 2562766.0,
"repeat_count": 0.0,
- "routers_loss": 0.023485012352466583,
+ "routers_loss": 0.01937069371342659,
"skip_count": 2.0,
"step": 1586,
"text_loss": 0.178413525223732
@@ -15079,37 +15079,37 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 7.455532726739067,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.150390625,
"learning_rate": 0.0009719672162088252,
- "loss": 0.0308,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0306,
+ "macro_f1": 0.32098767161369324,
"num_tokens": 2566583.0,
"repeat_count": 1.0,
- "routers_loss": 0.05822715163230896,
+ "routers_loss": 0.06224144622683525,
"skip_count": 0.0,
"step": 1588,
"text_loss": 0.3992367684841156
},
{
- "acc_repeat": 0.5,
- "acc_skip": 0.5,
+ "acc_repeat": 1.0,
+ "acc_skip": 0.75,
"avg_layers": 27.0,
"epoch": 7.464925154094511,
- "f1_execute": 0.936170220375061,
- "f1_repeat": 0.6666666865348816,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.189453125,
+ "f1_execute": 0.9777777791023254,
+ "f1_repeat": 1.0,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.185546875,
"learning_rate": 0.0009718649441743559,
- "loss": 0.0243,
- "macro_f1": 0.7565011978149414,
+ "loss": 0.0239,
+ "macro_f1": 0.9449735879898071,
"num_tokens": 2569516.0,
"repeat_count": 2.0,
- "routers_loss": 0.07448136061429977,
+ "routers_loss": 0.06937911361455917,
"skip_count": 4.0,
"step": 1590,
"text_loss": 0.1945122629404068
@@ -15122,13 +15122,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.00097176249132035,
- "loss": 0.0228,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2572418.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038424162194132805,
+ "routers_loss": 0.0034326619934290648,
"skip_count": 0.0,
"step": 1592,
"text_loss": 0.6259906888008118
@@ -15141,13 +15141,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.08642578125,
"learning_rate": 0.0009716598576860676,
- "loss": 0.0277,
+ "loss": 0.0278,
"macro_f1": 0.6666666865348816,
"num_tokens": 2575235.0,
"repeat_count": 1.0,
- "routers_loss": 0.005674343090504408,
+ "routers_loss": 0.004557516425848007,
"skip_count": 0.0,
"step": 1594,
"text_loss": 0.6638736724853516
@@ -15160,13 +15160,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.193359375,
"learning_rate": 0.0009715570433108378,
- "loss": 0.0209,
+ "loss": 0.0198,
"macro_f1": 1.0,
"num_tokens": 2578157.0,
"repeat_count": 1.0,
- "routers_loss": 0.015544800087809563,
+ "routers_loss": 0.015363055281341076,
"skip_count": 1.0,
"step": 1596,
"text_loss": 0.6530464887619019
@@ -15179,13 +15179,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009714540482340595,
- "loss": 0.0279,
+ "loss": 0.0268,
"macro_f1": 0.6666666865348816,
"num_tokens": 2581801.0,
"repeat_count": 1.0,
- "routers_loss": 0.013199405744671822,
+ "routers_loss": 0.01257144846022129,
"skip_count": 0.0,
"step": 1598,
"text_loss": 0.5916110277175903
@@ -15198,13 +15198,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059326171875,
+ "grad_norm": 0.058837890625,
"learning_rate": 0.0009713508724952006,
- "loss": 0.0178,
+ "loss": 0.0177,
"macro_f1": 0.3333333432674408,
"num_tokens": 2585204.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032487998250871897,
+ "routers_loss": 0.003175645601004362,
"skip_count": 0.0,
"step": 1600,
"text_loss": 0.27901601791381836
@@ -15217,13 +15217,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12255859375,
+ "grad_norm": 0.12353515625,
"learning_rate": 0.0009712475161337981,
- "loss": 0.0253,
+ "loss": 0.0261,
"macro_f1": 0.3333333432674408,
"num_tokens": 2588286.0,
"repeat_count": 0.0,
- "routers_loss": 0.0041928659193217754,
+ "routers_loss": 0.004122321493923664,
"skip_count": 0.0,
"step": 1602,
"text_loss": 0.42420244216918945
@@ -15236,13 +15236,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009711439791894585,
- "loss": 0.0343,
+ "loss": 0.0341,
"macro_f1": 0.6666666865348816,
"num_tokens": 2591476.0,
"repeat_count": 0.0,
- "routers_loss": 0.011576149612665176,
+ "routers_loss": 0.011215819045901299,
"skip_count": 1.0,
"step": 1604,
"text_loss": 0.5549933910369873
@@ -15255,13 +15255,13 @@
"f1_execute": 0.9599999785423279,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009710402617018574,
- "loss": 0.0179,
+ "loss": 0.0172,
"macro_f1": 0.8200000524520874,
"num_tokens": 2594336.0,
"repeat_count": 1.0,
- "routers_loss": 0.03026912547647953,
+ "routers_loss": 0.02916567400097847,
"skip_count": 2.0,
"step": 1606,
"text_loss": 0.3263779282569885
@@ -15276,11 +15276,11 @@
"f1_skip": 1.0,
"grad_norm": 0.068359375,
"learning_rate": 0.0009709363637107393,
- "loss": 0.021,
+ "loss": 0.0209,
"macro_f1": 0.6666666865348816,
"num_tokens": 2597462.0,
"repeat_count": 0.0,
- "routers_loss": 0.014957098290324211,
+ "routers_loss": 0.015897957608103752,
"skip_count": 1.0,
"step": 1608,
"text_loss": 0.20917139947414398
@@ -15293,13 +15293,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009708322852559184,
- "loss": 0.0226,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2601543.0,
"repeat_count": 0.0,
- "routers_loss": 0.00254683755338192,
+ "routers_loss": 0.002211357234045863,
"skip_count": 0.0,
"step": 1610,
"text_loss": 0.450550377368927
@@ -15312,13 +15312,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1748046875,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.0009707280263772776,
- "loss": 0.0286,
+ "loss": 0.0277,
"macro_f1": 0.6666666865348816,
"num_tokens": 2604462.0,
"repeat_count": 0.0,
- "routers_loss": 0.018759876489639282,
+ "routers_loss": 0.01615734025835991,
"skip_count": 2.0,
"step": 1612,
"text_loss": 0.6908381581306458
@@ -15337,7 +15337,7 @@
"macro_f1": 0.5492662787437439,
"num_tokens": 2607484.0,
"repeat_count": 0.0,
- "routers_loss": 0.022694367915391922,
+ "routers_loss": 0.022048067301511765,
"skip_count": 2.0,
"step": 1614,
"text_loss": 0.36691340804100037
@@ -15350,13 +15350,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.10546875,
"learning_rate": 0.0009705189675084138,
- "loss": 0.0181,
+ "loss": 0.0176,
"macro_f1": 0.6666666865348816,
"num_tokens": 2610204.0,
"repeat_count": 0.0,
- "routers_loss": 0.010102321393787861,
+ "routers_loss": 0.008503952994942665,
"skip_count": 1.0,
"step": 1616,
"text_loss": 0.5226598381996155
@@ -15369,13 +15369,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08984375,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009704141675983029,
- "loss": 0.0252,
+ "loss": 0.0248,
"macro_f1": 0.3333333432674408,
"num_tokens": 2613128.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020994991064071655,
+ "routers_loss": 0.0019020626787096262,
"skip_count": 0.0,
"step": 1618,
"text_loss": 0.6465088725090027
@@ -15388,13 +15388,13 @@
"f1_execute": 0.9333333373069763,
"f1_repeat": 0.0,
"f1_skip": 0.7272727489471436,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.107421875,
"learning_rate": 0.0009703091874245956,
- "loss": 0.0323,
+ "loss": 0.032,
"macro_f1": 0.5535354018211365,
"num_tokens": 2616360.0,
"repeat_count": 0.0,
- "routers_loss": 0.11748704314231873,
+ "routers_loss": 0.11837691068649292,
"skip_count": 7.0,
"step": 1620,
"text_loss": 0.2987039089202881
@@ -15407,32 +15407,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009702040270275204,
- "loss": 0.018,
+ "loss": 0.0181,
"macro_f1": 0.3333333432674408,
"num_tokens": 2619606.0,
"repeat_count": 0.0,
- "routers_loss": 0.007642311509698629,
+ "routers_loss": 0.0065958453342318535,
"skip_count": 0.0,
"step": 1622,
"text_loss": 0.6262096166610718
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 7.62459641913707,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.10595703125,
+ "f1_skip": 1.0,
+ "grad_norm": 0.103515625,
"learning_rate": 0.000970098686447375,
- "loss": 0.0258,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0257,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 2622499.0,
"repeat_count": 0.0,
- "routers_loss": 0.016890225932002068,
+ "routers_loss": 0.013632026500999928,
"skip_count": 1.0,
"step": 1624,
"text_loss": 0.2392602562904358
@@ -15445,13 +15445,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1240234375,
+ "grad_norm": 0.125,
"learning_rate": 0.0009699931657245264,
- "loss": 0.0242,
+ "loss": 0.0245,
"macro_f1": 0.5492662787437439,
"num_tokens": 2626002.0,
"repeat_count": 0.0,
- "routers_loss": 0.010900186374783516,
+ "routers_loss": 0.012147823348641396,
"skip_count": 2.0,
"step": 1626,
"text_loss": 0.4742976129055023
@@ -15464,13 +15464,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0810546875,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009698874648994098,
- "loss": 0.0279,
+ "loss": 0.0285,
"macro_f1": 1.0,
"num_tokens": 2629847.0,
"repeat_count": 1.0,
- "routers_loss": 0.011229799129068851,
+ "routers_loss": 0.010692884214222431,
"skip_count": 3.0,
"step": 1628,
"text_loss": 0.5090685486793518
@@ -15483,13 +15483,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.0009697815840125304,
- "loss": 0.0275,
+ "loss": 0.0265,
"macro_f1": 0.3333333432674408,
"num_tokens": 2633529.0,
"repeat_count": 0.0,
- "routers_loss": 0.0105878422036767,
+ "routers_loss": 0.011442207731306553,
"skip_count": 0.0,
"step": 1630,
"text_loss": 0.1874329298734665
@@ -15502,13 +15502,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.2021484375,
+ "grad_norm": 0.2119140625,
"learning_rate": 0.0009696755231044618,
- "loss": 0.0209,
+ "loss": 0.0207,
"macro_f1": 0.3333333432674408,
"num_tokens": 2636321.0,
"repeat_count": 0.0,
- "routers_loss": 0.002953991526737809,
+ "routers_loss": 0.0026681360322982073,
"skip_count": 0.0,
"step": 1632,
"text_loss": 0.7650400400161743
@@ -15521,13 +15521,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10888671875,
+ "grad_norm": 0.10498046875,
"learning_rate": 0.0009695692822158466,
- "loss": 0.0241,
+ "loss": 0.0242,
"macro_f1": 0.3272727429866791,
"num_tokens": 2638840.0,
"repeat_count": 1.0,
- "routers_loss": 0.04717390984296799,
+ "routers_loss": 0.033965807408094406,
"skip_count": 0.0,
"step": 1634,
"text_loss": 0.6175784468650818
@@ -15540,13 +15540,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0009694628613873968,
- "loss": 0.0179,
+ "loss": 0.018,
"macro_f1": 0.3333333432674408,
"num_tokens": 2641886.0,
"repeat_count": 0.0,
- "routers_loss": 0.0073657832108438015,
+ "routers_loss": 0.007568214554339647,
"skip_count": 0.0,
"step": 1636,
"text_loss": 0.43139931559562683
@@ -15559,13 +15559,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.193359375,
"learning_rate": 0.0009693562606598929,
- "loss": 0.0259,
+ "loss": 0.025,
"macro_f1": 0.3333333432674408,
"num_tokens": 2645028.0,
"repeat_count": 0.0,
- "routers_loss": 0.005212752148509026,
+ "routers_loss": 0.004973865579813719,
"skip_count": 0.0,
"step": 1638,
"text_loss": 0.6430339217185974
@@ -15578,13 +15578,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0009692494800741844,
- "loss": 0.0304,
+ "loss": 0.0313,
"macro_f1": 0.3272727429866791,
"num_tokens": 2648209.0,
"repeat_count": 1.0,
- "routers_loss": 0.04311618581414223,
+ "routers_loss": 0.049863800406455994,
"skip_count": 0.0,
"step": 1640,
"text_loss": 0.28138160705566406
@@ -15597,13 +15597,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08251953125,
+ "grad_norm": 0.08544921875,
"learning_rate": 0.0009691425196711901,
- "loss": 0.039,
+ "loss": 0.0398,
"macro_f1": 0.3272727429866791,
"num_tokens": 2651171.0,
"repeat_count": 0.0,
- "routers_loss": 0.02027471922338009,
+ "routers_loss": 0.02112230286002159,
"skip_count": 0.0,
"step": 1642,
"text_loss": 0.3745322525501251
@@ -15616,13 +15616,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009690353794918971,
- "loss": 0.0279,
+ "loss": 0.0275,
"macro_f1": 0.3333333432674408,
"num_tokens": 2654093.0,
"repeat_count": 0.0,
- "routers_loss": 0.003074956126511097,
+ "routers_loss": 0.0024304776452481747,
"skip_count": 0.0,
"step": 1644,
"text_loss": 0.4275154173374176
@@ -15635,13 +15635,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.0771484375,
"learning_rate": 0.000968928059577362,
- "loss": 0.0241,
+ "loss": 0.0244,
"macro_f1": 0.6666666865348816,
"num_tokens": 2657079.0,
"repeat_count": 0.0,
- "routers_loss": 0.009374706074595451,
+ "routers_loss": 0.009320619516074657,
"skip_count": 1.0,
"step": 1646,
"text_loss": 0.46650025248527527
@@ -15654,13 +15654,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1162109375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009688205599687099,
- "loss": 0.0218,
+ "loss": 0.0209,
"macro_f1": 0.3272727429866791,
"num_tokens": 2660951.0,
"repeat_count": 0.0,
- "routers_loss": 0.01204691268503666,
+ "routers_loss": 0.011913162656128407,
"skip_count": 0.0,
"step": 1648,
"text_loss": 0.46644100546836853
@@ -15673,13 +15673,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009687128807071347,
"loss": 0.0284,
"macro_f1": 0.3333333432674408,
"num_tokens": 2663823.0,
"repeat_count": 0.0,
- "routers_loss": 0.01376053225249052,
+ "routers_loss": 0.013754756189882755,
"skip_count": 0.0,
"step": 1650,
"text_loss": 0.40808847546577454
@@ -15692,13 +15692,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.103515625,
"learning_rate": 0.0009686050218338996,
- "loss": 0.0285,
+ "loss": 0.0286,
"macro_f1": 0.3333333432674408,
"num_tokens": 2667079.0,
"repeat_count": 0.0,
- "routers_loss": 0.009346984326839447,
+ "routers_loss": 0.009099726565182209,
"skip_count": 0.0,
"step": 1652,
"text_loss": 0.2389989197254181
@@ -15711,13 +15711,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.08837890625,
"learning_rate": 0.0009684969833903359,
- "loss": 0.0291,
+ "loss": 0.0283,
"macro_f1": 0.6666666865348816,
"num_tokens": 2670162.0,
"repeat_count": 0.0,
- "routers_loss": 0.002724624238908291,
+ "routers_loss": 0.0034928603563457727,
"skip_count": 1.0,
"step": 1654,
"text_loss": 0.6930749416351318
@@ -15730,13 +15730,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009683887654178445,
- "loss": 0.0271,
+ "loss": 0.0261,
"macro_f1": 0.6666666865348816,
"num_tokens": 2673031.0,
"repeat_count": 0.0,
- "routers_loss": 0.00823777075856924,
+ "routers_loss": 0.008340462110936642,
"skip_count": 1.0,
"step": 1656,
"text_loss": 0.277752548456192
@@ -15749,32 +15749,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07373046875,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009682803679578947,
- "loss": 0.0262,
+ "loss": 0.0259,
"macro_f1": 0.3333333432674408,
"num_tokens": 2676092.0,
"repeat_count": 0.0,
- "routers_loss": 0.004393119364976883,
+ "routers_loss": 0.004337446764111519,
"skip_count": 0.0,
"step": 1658,
"text_loss": 0.5176776051521301
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 7.7936601115350745,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.1513671875,
+ "f1_skip": 0.0,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009681717910520244,
- "loss": 0.024,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0242,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 2679479.0,
"repeat_count": 0.0,
- "routers_loss": 0.031827569007873535,
+ "routers_loss": 0.034611742943525314,
"skip_count": 2.0,
"step": 1660,
"text_loss": 0.21485982835292816
@@ -15789,11 +15789,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.07958984375,
"learning_rate": 0.0009680630347418406,
- "loss": 0.0216,
+ "loss": 0.022,
"macro_f1": 0.5492662787437439,
"num_tokens": 2683289.0,
"repeat_count": 0.0,
- "routers_loss": 0.03329647704958916,
+ "routers_loss": 0.03297121450304985,
"skip_count": 2.0,
"step": 1662,
"text_loss": 0.33801013231277466
@@ -15806,13 +15806,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1826171875,
+ "grad_norm": 0.1728515625,
"learning_rate": 0.000967954099069019,
- "loss": 0.0415,
+ "loss": 0.0411,
"macro_f1": 0.32098764181137085,
"num_tokens": 2685879.0,
"repeat_count": 1.0,
- "routers_loss": 0.047317031770944595,
+ "routers_loss": 0.04551183059811592,
"skip_count": 1.0,
"step": 1664,
"text_loss": 0.41123488545417786
@@ -15827,11 +15827,11 @@
"f1_skip": 0.0,
"grad_norm": 0.1240234375,
"learning_rate": 0.0009678449840753038,
- "loss": 0.0325,
+ "loss": 0.0324,
"macro_f1": 0.32098764181137085,
"num_tokens": 2688910.0,
"repeat_count": 0.0,
- "routers_loss": 0.05649980902671814,
+ "routers_loss": 0.05866450071334839,
"skip_count": 2.0,
"step": 1666,
"text_loss": 0.1740892380475998
@@ -15844,13 +15844,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009677356898025082,
- "loss": 0.0229,
+ "loss": 0.023,
"macro_f1": 0.3333333432674408,
"num_tokens": 2691680.0,
"repeat_count": 0.0,
- "routers_loss": 0.01004624180495739,
+ "routers_loss": 0.009243223816156387,
"skip_count": 0.0,
"step": 1668,
"text_loss": 0.2512350380420685
@@ -15863,13 +15863,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.09619140625,
"learning_rate": 0.000967626216292514,
- "loss": 0.0194,
+ "loss": 0.0195,
"macro_f1": 0.3333333432674408,
"num_tokens": 2694895.0,
"repeat_count": 0.0,
- "routers_loss": 0.0054973396472632885,
+ "routers_loss": 0.005576452240347862,
"skip_count": 0.0,
"step": 1670,
"text_loss": 0.43294376134872437
@@ -15882,13 +15882,13 @@
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.09130859375,
"learning_rate": 0.0009675165635872715,
- "loss": 0.031,
+ "loss": 0.0306,
"macro_f1": 0.44705885648727417,
"num_tokens": 2697806.0,
"repeat_count": 0.0,
- "routers_loss": 0.05615650862455368,
+ "routers_loss": 0.05372785031795502,
"skip_count": 3.0,
"step": 1672,
"text_loss": 0.1614082306623459
@@ -15901,13 +15901,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009674067317288,
- "loss": 0.0301,
+ "loss": 0.0296,
"macro_f1": 0.6666666865348816,
"num_tokens": 2700529.0,
"repeat_count": 1.0,
- "routers_loss": 0.012819192372262478,
+ "routers_loss": 0.018131591379642487,
"skip_count": 0.0,
"step": 1674,
"text_loss": 0.2093173861503601
@@ -15920,13 +15920,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.08203125,
"learning_rate": 0.0009672967207591869,
- "loss": 0.0253,
+ "loss": 0.0257,
"macro_f1": 0.3272727429866791,
"num_tokens": 2703650.0,
"repeat_count": 0.0,
- "routers_loss": 0.07059332728385925,
+ "routers_loss": 0.0673515796661377,
"skip_count": 1.0,
"step": 1676,
"text_loss": 0.3029400110244751
@@ -15939,13 +15939,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009671865307205892,
- "loss": 0.0198,
+ "loss": 0.021,
"macro_f1": 0.32098767161369324,
"num_tokens": 2707615.0,
"repeat_count": 0.0,
- "routers_loss": 0.029778441414237022,
+ "routers_loss": 0.03821169584989548,
"skip_count": 1.0,
"step": 1678,
"text_loss": 0.2262786477804184
@@ -15958,13 +15958,13 @@
"f1_execute": 0.9756097793579102,
"f1_repeat": 1.0,
"f1_skip": 0.9090909361839294,
- "grad_norm": 0.1416015625,
+ "grad_norm": 0.1396484375,
"learning_rate": 0.0009670761616552315,
- "loss": 0.0474,
+ "loss": 0.0465,
"macro_f1": 0.9615669250488281,
"num_tokens": 2710894.0,
"repeat_count": 2.0,
- "routers_loss": 0.04371272772550583,
+ "routers_loss": 0.042625464498996735,
"skip_count": 6.0,
"step": 1680,
"text_loss": 0.29623574018478394
@@ -15977,13 +15977,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.189453125,
+ "grad_norm": 0.169921875,
"learning_rate": 0.0009669656136054074,
- "loss": 0.0293,
+ "loss": 0.0289,
"macro_f1": 0.3333333432674408,
"num_tokens": 2714330.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033591394312679768,
+ "routers_loss": 0.0037571541033685207,
"skip_count": 0.0,
"step": 1682,
"text_loss": 0.7510389089584351
@@ -15996,13 +15996,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.07421875,
"learning_rate": 0.0009668548866134795,
- "loss": 0.0259,
+ "loss": 0.0256,
"macro_f1": 0.3333333432674408,
"num_tokens": 2717176.0,
"repeat_count": 0.0,
- "routers_loss": 0.005085585173219442,
+ "routers_loss": 0.004142968449741602,
"skip_count": 0.0,
"step": 1684,
"text_loss": 0.3273485600948334
@@ -16015,13 +16015,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0712890625,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009667439807218783,
- "loss": 0.0243,
+ "loss": 0.0233,
"macro_f1": 0.6666666865348816,
"num_tokens": 2720628.0,
"repeat_count": 0.0,
- "routers_loss": 0.008569681085646152,
+ "routers_loss": 0.008753842674195766,
"skip_count": 2.0,
"step": 1686,
"text_loss": 0.4314708709716797
@@ -16034,32 +16034,32 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009666328959731033,
- "loss": 0.022,
+ "loss": 0.0211,
"macro_f1": 0.6603773832321167,
"num_tokens": 2723739.0,
"repeat_count": 1.0,
- "routers_loss": 0.024587804451584816,
+ "routers_loss": 0.022674910724163055,
"skip_count": 1.0,
"step": 1688,
"text_loss": 0.25734150409698486
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.3333333432674408,
- "avg_layers": 27.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 26.0,
"epoch": 7.934546521866745,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
- "f1_skip": 0.5,
- "grad_norm": 0.169921875,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009665216324097222,
- "loss": 0.0332,
- "macro_f1": 0.4871794879436493,
+ "loss": 0.0324,
+ "macro_f1": 0.5934640765190125,
"num_tokens": 2726644.0,
"repeat_count": 0.0,
- "routers_loss": 0.037516288459300995,
+ "routers_loss": 0.03932750225067139,
"skip_count": 3.0,
"step": 1690,
"text_loss": 0.24511034786701202
@@ -16072,13 +16072,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.09765625,
"learning_rate": 0.0009664101900743714,
- "loss": 0.0262,
+ "loss": 0.0255,
"macro_f1": 0.3272727429866791,
"num_tokens": 2729662.0,
"repeat_count": 0.0,
- "routers_loss": 0.01287431176751852,
+ "routers_loss": 0.012672754004597664,
"skip_count": 1.0,
"step": 1692,
"text_loss": 0.39431414008140564
@@ -16091,13 +16091,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.07763671875,
+ "grad_norm": 0.076171875,
"learning_rate": 0.000966298569009756,
- "loss": 0.0227,
+ "loss": 0.0231,
"macro_f1": 0.5492662787437439,
"num_tokens": 2732578.0,
"repeat_count": 0.0,
- "routers_loss": 0.015499880537390709,
+ "routers_loss": 0.01548632513731718,
"skip_count": 2.0,
"step": 1694,
"text_loss": 0.12439999729394913
@@ -16110,13 +16110,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009661867692586494,
- "loss": 0.0144,
+ "loss": 0.0153,
"macro_f1": 0.32098764181137085,
"num_tokens": 2735887.0,
"repeat_count": 0.0,
- "routers_loss": 0.049878787249326706,
+ "routers_loss": 0.05622401833534241,
"skip_count": 2.0,
"step": 1696,
"text_loss": 0.29024389386177063
@@ -16129,13 +16129,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10009765625,
+ "grad_norm": 0.087890625,
"learning_rate": 0.0009660747908638933,
- "loss": 0.0206,
+ "loss": 0.0205,
"macro_f1": 0.3272727429866791,
"num_tokens": 2739293.0,
"repeat_count": 0.0,
- "routers_loss": 0.04108169302344322,
+ "routers_loss": 0.041060201823711395,
"skip_count": 1.0,
"step": 1698,
"text_loss": 0.39461007714271545
@@ -16148,13 +16148,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1728515625,
+ "grad_norm": 0.1767578125,
"learning_rate": 0.0009659626338683981,
- "loss": 0.0367,
+ "loss": 0.0369,
"macro_f1": 0.3333333432674408,
"num_tokens": 2742468.0,
"repeat_count": 0.0,
- "routers_loss": 0.007651917636394501,
+ "routers_loss": 0.007251353468745947,
"skip_count": 0.0,
"step": 1700,
"text_loss": 0.2751767635345459
@@ -16167,13 +16167,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.07763671875,
"learning_rate": 0.0009658502983151427,
- "loss": 0.0182,
+ "loss": 0.0186,
"macro_f1": 0.3272727429866791,
"num_tokens": 2745123.0,
"repeat_count": 0.0,
- "routers_loss": 0.015448091551661491,
+ "routers_loss": 0.012847424484789371,
"skip_count": 1.0,
"step": 1702,
"text_loss": 0.4756404757499695
@@ -16186,13 +16186,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1279296875,
+ "grad_norm": 0.11767578125,
"learning_rate": 0.0009657377842471742,
- "loss": 0.0324,
+ "loss": 0.0313,
"macro_f1": 0.6666666865348816,
"num_tokens": 2748016.0,
"repeat_count": 0.0,
- "routers_loss": 0.009139287285506725,
+ "routers_loss": 0.007060411386191845,
"skip_count": 1.0,
"step": 1704,
"text_loss": 0.9571210145950317
@@ -16205,13 +16205,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.10009765625,
"learning_rate": 0.0009656250917076081,
- "loss": 0.0191,
+ "loss": 0.0188,
"macro_f1": 0.5492662787437439,
"num_tokens": 2750717.0,
"repeat_count": 0.0,
- "routers_loss": 0.015412120148539543,
+ "routers_loss": 0.016748681664466858,
"skip_count": 2.0,
"step": 1706,
"text_loss": 0.14542843401432037
@@ -16224,13 +16224,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.060302734375,
"learning_rate": 0.0009655122207396285,
- "loss": 0.0175,
+ "loss": 0.017,
"macro_f1": 0.3333333432674408,
"num_tokens": 2753635.0,
"repeat_count": 0.0,
- "routers_loss": 0.012735052965581417,
+ "routers_loss": 0.013607042841613293,
"skip_count": 0.0,
"step": 1708,
"text_loss": 0.21836471557617188
@@ -16243,13 +16243,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07177734375,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009653991713864878,
- "loss": 0.0192,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2756643.0,
"repeat_count": 0.0,
- "routers_loss": 0.00114025070797652,
+ "routers_loss": 0.0012097888393327594,
"skip_count": 0.0,
"step": 1710,
"text_loss": 0.635187029838562
@@ -16262,13 +16262,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1396484375,
+ "grad_norm": 0.1171875,
"learning_rate": 0.0009652859436915066,
- "loss": 0.0243,
+ "loss": 0.0231,
"macro_f1": 0.3333333432674408,
"num_tokens": 2759432.0,
"repeat_count": 0.0,
- "routers_loss": 0.006401443853974342,
+ "routers_loss": 0.006196760106831789,
"skip_count": 0.0,
"step": 1712,
"text_loss": 0.5629420876502991
@@ -16281,13 +16281,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0009651725376980743,
- "loss": 0.0185,
+ "loss": 0.0177,
"macro_f1": 0.3333333432674408,
"num_tokens": 2762538.0,
"repeat_count": 0.0,
- "routers_loss": 0.004316259175539017,
+ "routers_loss": 0.0042513771913945675,
"skip_count": 0.0,
"step": 1714,
"text_loss": 0.39522525668144226
@@ -16300,13 +16300,13 @@
"f1_execute": 0.9583333134651184,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.125,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009650589534496479,
- "loss": 0.0201,
+ "loss": 0.0194,
"macro_f1": 0.8194444179534912,
"num_tokens": 2765571.0,
"repeat_count": 2.0,
- "routers_loss": 0.043461959809064865,
+ "routers_loss": 0.03596706688404083,
"skip_count": 3.0,
"step": 1716,
"text_loss": 0.6252416968345642
@@ -16319,13 +16319,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.04833984375,
"learning_rate": 0.0009649451909897532,
"loss": 0.0178,
"macro_f1": 0.3333333432674408,
"num_tokens": 2769206.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024530428927391768,
+ "routers_loss": 0.0025788163766264915,
"skip_count": 0.0,
"step": 1718,
"text_loss": 0.8851634860038757
@@ -16338,13 +16338,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.0009648312503619843,
- "loss": 0.026,
+ "loss": 0.0265,
"macro_f1": 0.3333333432674408,
"num_tokens": 2772488.0,
"repeat_count": 0.0,
- "routers_loss": 0.0046626063995063305,
+ "routers_loss": 0.004443451762199402,
"skip_count": 0.0,
"step": 1720,
"text_loss": 0.8568580746650696
@@ -16357,13 +16357,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.1513671875,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0009647171316100034,
- "loss": 0.0257,
+ "loss": 0.0265,
"macro_f1": 0.9265305995941162,
"num_tokens": 2776482.0,
"repeat_count": 1.0,
- "routers_loss": 0.02480102889239788,
+ "routers_loss": 0.022948263213038445,
"skip_count": 3.0,
"step": 1722,
"text_loss": 0.13431036472320557
@@ -16376,13 +16376,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.126953125,
+ "grad_norm": 0.1572265625,
"learning_rate": 0.0009646028347775409,
- "loss": 0.02,
+ "loss": 0.0204,
"macro_f1": 0.6666666865348816,
"num_tokens": 2778966.0,
"repeat_count": 0.0,
- "routers_loss": 0.012629947625100613,
+ "routers_loss": 0.011328035034239292,
"skip_count": 1.0,
"step": 1724,
"text_loss": 0.2085491120815277
@@ -16395,13 +16395,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08447265625,
+ "grad_norm": 0.08984375,
"learning_rate": 0.0009644883599083958,
"loss": 0.0238,
"macro_f1": 0.3333333432674408,
"num_tokens": 2781968.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024127380456775427,
+ "routers_loss": 0.002208018908277154,
"skip_count": 0.0,
"step": 1726,
"text_loss": 0.4948323965072632
@@ -16414,13 +16414,13 @@
"f1_execute": 0.9411764740943909,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.054443359375,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009643737070464349,
- "loss": 0.0162,
+ "loss": 0.0158,
"macro_f1": 0.6470588445663452,
"num_tokens": 2784666.0,
"repeat_count": 1.0,
- "routers_loss": 0.0415453165769577,
+ "routers_loss": 0.04391832649707794,
"skip_count": 2.0,
"step": 1728,
"text_loss": 0.39060094952583313
@@ -16433,13 +16433,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.046630859375,
"learning_rate": 0.0009642588762355935,
- "loss": 0.0211,
+ "loss": 0.0212,
"macro_f1": 0.6666666865348816,
"num_tokens": 2787558.0,
"repeat_count": 0.0,
- "routers_loss": 0.0056681083515286446,
+ "routers_loss": 0.004497280344367027,
"skip_count": 1.0,
"step": 1730,
"text_loss": 0.34908708930015564
@@ -16452,13 +16452,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009641438675198748,
- "loss": 0.0189,
+ "loss": 0.0175,
"macro_f1": 0.3333333432674408,
"num_tokens": 2790474.0,
"repeat_count": 0.0,
- "routers_loss": 0.006391602102667093,
+ "routers_loss": 0.00583475548774004,
"skip_count": 0.0,
"step": 1732,
"text_loss": 0.5720033049583435
@@ -16471,13 +16471,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0595703125,
+ "grad_norm": 0.08154296875,
"learning_rate": 0.0009640286809433508,
- "loss": 0.0229,
+ "loss": 0.0235,
"macro_f1": 0.3333333432674408,
"num_tokens": 2793272.0,
"repeat_count": 0.0,
- "routers_loss": 0.007466991897672415,
+ "routers_loss": 0.007826375775039196,
"skip_count": 0.0,
"step": 1734,
"text_loss": 0.32181721925735474
@@ -16490,13 +16490,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0009639133165501606,
- "loss": 0.0197,
+ "loss": 0.0192,
"macro_f1": 0.3333333432674408,
"num_tokens": 2797726.0,
"repeat_count": 0.0,
- "routers_loss": 0.001953453291207552,
+ "routers_loss": 0.0019055595621466637,
"skip_count": 0.0,
"step": 1736,
"text_loss": 0.620936393737793
@@ -16509,13 +16509,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009637977743845124,
- "loss": 0.0223,
+ "loss": 0.0229,
"macro_f1": 0.3333333432674408,
"num_tokens": 2800706.0,
"repeat_count": 0.0,
- "routers_loss": 0.003612719476222992,
+ "routers_loss": 0.0028302327264100313,
"skip_count": 0.0,
"step": 1738,
"text_loss": 0.6473138332366943
@@ -16528,13 +16528,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.049072265625,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009636820544906823,
- "loss": 0.0145,
+ "loss": 0.0146,
"macro_f1": 1.0,
"num_tokens": 2803847.0,
"repeat_count": 1.0,
- "routers_loss": 0.009977150708436966,
+ "routers_loss": 0.01105099730193615,
"skip_count": 2.0,
"step": 1740,
"text_loss": 0.4401201903820038
@@ -16547,13 +16547,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009635661569130141,
"loss": 0.0195,
"macro_f1": 0.5934640765190125,
"num_tokens": 2807235.0,
"repeat_count": 0.0,
- "routers_loss": 0.026468059048056602,
+ "routers_loss": 0.02619045600295067,
"skip_count": 3.0,
"step": 1742,
"text_loss": 0.459264874458313
@@ -16566,13 +16566,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0009634500816959202,
- "loss": 0.0165,
+ "loss": 0.0162,
"macro_f1": 0.6666666865348816,
"num_tokens": 2810396.0,
"repeat_count": 0.0,
- "routers_loss": 0.00849854201078415,
+ "routers_loss": 0.007915694266557693,
"skip_count": 2.0,
"step": 1744,
"text_loss": 0.5084020495414734
@@ -16585,13 +16585,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.177734375,
+ "grad_norm": 0.1748046875,
"learning_rate": 0.0009633338288838805,
- "loss": 0.0275,
+ "loss": 0.0271,
"macro_f1": 0.5492662787437439,
"num_tokens": 2813215.0,
"repeat_count": 2.0,
- "routers_loss": 0.08082596957683563,
+ "routers_loss": 0.08364596217870712,
"skip_count": 0.0,
"step": 1746,
"text_loss": 0.27681824564933777
@@ -16604,13 +16604,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.046142578125,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009632173985214438,
- "loss": 0.015,
+ "loss": 0.0156,
"macro_f1": 0.8817967176437378,
"num_tokens": 2816452.0,
"repeat_count": 3.0,
- "routers_loss": 0.029500717297196388,
+ "routers_loss": 0.028805451467633247,
"skip_count": 2.0,
"step": 1748,
"text_loss": 0.4678419530391693
@@ -16623,13 +16623,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.0625,
"learning_rate": 0.000963100790653226,
- "loss": 0.0183,
+ "loss": 0.0188,
"macro_f1": 0.3272727429866791,
"num_tokens": 2819364.0,
"repeat_count": 0.0,
- "routers_loss": 0.025238536298274994,
+ "routers_loss": 0.03056817688047886,
"skip_count": 1.0,
"step": 1750,
"text_loss": 0.3078109920024872
@@ -16642,13 +16642,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0703125,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009629840053239116,
- "loss": 0.0204,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2823469.0,
"repeat_count": 0.0,
- "routers_loss": 0.002069319598376751,
+ "routers_loss": 0.0019477814203128219,
"skip_count": 0.0,
"step": 1752,
"text_loss": 0.45501336455345154
@@ -16661,13 +16661,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.057373046875,
"learning_rate": 0.000962867042578253,
- "loss": 0.0169,
+ "loss": 0.0173,
"macro_f1": 0.3333333432674408,
"num_tokens": 2826716.0,
"repeat_count": 0.0,
- "routers_loss": 0.002853946527466178,
+ "routers_loss": 0.0032963966950774193,
"skip_count": 0.0,
"step": 1754,
"text_loss": 0.49234694242477417
@@ -16680,13 +16680,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009627499024610707,
- "loss": 0.0236,
+ "loss": 0.0239,
"macro_f1": 0.3272727429866791,
"num_tokens": 2829733.0,
"repeat_count": 0.0,
- "routers_loss": 0.0100983502343297,
+ "routers_loss": 0.010289114899933338,
"skip_count": 1.0,
"step": 1756,
"text_loss": 0.22335539758205414
@@ -16699,13 +16699,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09228515625,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009626325850172527,
- "loss": 0.0173,
+ "loss": 0.0174,
"macro_f1": 0.3272727429866791,
"num_tokens": 2833350.0,
"repeat_count": 0.0,
- "routers_loss": 0.031218983232975006,
+ "routers_loss": 0.03249066323041916,
"skip_count": 1.0,
"step": 1758,
"text_loss": 0.6581931114196777
@@ -16718,13 +16718,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009625150902917555,
- "loss": 0.019,
+ "loss": 0.0185,
"macro_f1": 0.3333333432674408,
"num_tokens": 2836558.0,
"repeat_count": 0.0,
- "routers_loss": 0.010347879491746426,
+ "routers_loss": 0.00870000571012497,
"skip_count": 0.0,
"step": 1760,
"text_loss": 0.22938725352287292
@@ -16737,13 +16737,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1455078125,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009623974183296031,
- "loss": 0.0193,
+ "loss": 0.0192,
"macro_f1": 0.3333333432674408,
"num_tokens": 2840560.0,
"repeat_count": 0.0,
- "routers_loss": 0.007768871728330851,
+ "routers_loss": 0.007767196744680405,
"skip_count": 0.0,
"step": 1762,
"text_loss": 0.24473799765110016
@@ -16756,13 +16756,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009622795691758876,
- "loss": 0.0253,
+ "loss": 0.0244,
"macro_f1": 0.3333333432674408,
"num_tokens": 2843548.0,
"repeat_count": 0.0,
- "routers_loss": 0.002887974726036191,
+ "routers_loss": 0.0021693643648177385,
"skip_count": 0.0,
"step": 1764,
"text_loss": 0.3084608018398285
@@ -16777,11 +16777,11 @@
"f1_skip": 0.0,
"grad_norm": 0.0498046875,
"learning_rate": 0.0009621615428757693,
- "loss": 0.0147,
+ "loss": 0.0149,
"macro_f1": 0.3333333432674408,
"num_tokens": 2847076.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027294005267322063,
+ "routers_loss": 0.0024727333802729845,
"skip_count": 0.0,
"step": 1766,
"text_loss": 0.5251734852790833
@@ -16794,13 +16794,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.000962043339474476,
- "loss": 0.0193,
+ "loss": 0.0194,
"macro_f1": 0.3333333432674408,
"num_tokens": 2849751.0,
"repeat_count": 0.0,
- "routers_loss": 0.00543541694059968,
+ "routers_loss": 0.005174890160560608,
"skip_count": 0.0,
"step": 1768,
"text_loss": 0.4410129189491272
@@ -16813,13 +16813,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.06103515625,
"learning_rate": 0.0009619249590173032,
- "loss": 0.0167,
+ "loss": 0.016,
"macro_f1": 0.6666666865348816,
"num_tokens": 2853916.0,
"repeat_count": 0.0,
- "routers_loss": 0.006514009553939104,
+ "routers_loss": 0.006785830482840538,
"skip_count": 2.0,
"step": 1770,
"text_loss": 0.550076425075531
@@ -16832,13 +16832,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.06591796875,
"learning_rate": 0.0009618064015496149,
- "loss": 0.019,
+ "loss": 0.0192,
"macro_f1": 0.5934640765190125,
"num_tokens": 2857372.0,
"repeat_count": 0.0,
- "routers_loss": 0.02333846502006054,
+ "routers_loss": 0.021370256319642067,
"skip_count": 3.0,
"step": 1772,
"text_loss": 0.1988629847764969
@@ -16851,13 +16851,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.072265625,
"learning_rate": 0.0009616876671168423,
- "loss": 0.0165,
+ "loss": 0.0162,
"macro_f1": 0.6666666865348816,
"num_tokens": 2861028.0,
"repeat_count": 0.0,
- "routers_loss": 0.004471905063837767,
+ "routers_loss": 0.004313841462135315,
"skip_count": 1.0,
"step": 1774,
"text_loss": 0.42581331729888916
@@ -16870,13 +16870,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009615687557644847,
- "loss": 0.0261,
+ "loss": 0.0268,
"macro_f1": 0.3333333432674408,
"num_tokens": 2864847.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024362702388316393,
+ "routers_loss": 0.0025742491707205772,
"skip_count": 0.0,
"step": 1776,
"text_loss": 0.46510905027389526
@@ -16889,13 +16889,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.140625,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009614496675381093,
- "loss": 0.0116,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 2867392.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021166049409657717,
+ "routers_loss": 0.0016813480760902166,
"skip_count": 0.0,
"step": 1778,
"text_loss": 0.5922174453735352
@@ -16908,13 +16908,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0712890625,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.0009613304024833507,
"loss": 0.0166,
"macro_f1": 0.3333333432674408,
"num_tokens": 2871273.0,
"repeat_count": 0.0,
- "routers_loss": 0.004722296260297298,
+ "routers_loss": 0.004948933608829975,
"skip_count": 0.0,
"step": 1780,
"text_loss": 0.6776977777481079
@@ -16929,11 +16929,11 @@
"f1_skip": 1.0,
"grad_norm": 0.07470703125,
"learning_rate": 0.0009612109606459117,
- "loss": 0.0199,
+ "loss": 0.0186,
"macro_f1": 1.0,
"num_tokens": 2874172.0,
"repeat_count": 1.0,
- "routers_loss": 0.014188882894814014,
+ "routers_loss": 0.016950147226452827,
"skip_count": 2.0,
"step": 1782,
"text_loss": 0.48758944869041443
@@ -16946,13 +16946,13 @@
"f1_execute": 0.9599999785423279,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.076171875,
+ "grad_norm": 0.08251953125,
"learning_rate": 0.0009610913420715623,
- "loss": 0.0241,
+ "loss": 0.0237,
"macro_f1": 0.7644444704055786,
"num_tokens": 2877528.0,
"repeat_count": 2.0,
- "routers_loss": 0.04599560424685478,
+ "routers_loss": 0.04880943149328232,
"skip_count": 1.0,
"step": 1784,
"text_loss": 0.4404778480529785
@@ -16965,13 +16965,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.06201171875,
"learning_rate": 0.0009609715468061411,
- "loss": 0.0216,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2880627.0,
"repeat_count": 0.0,
- "routers_loss": 0.004942454397678375,
+ "routers_loss": 0.004678630735725164,
"skip_count": 0.0,
"step": 1786,
"text_loss": 0.7295402884483337
@@ -16984,13 +16984,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08349609375,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0009608515748955535,
- "loss": 0.021,
+ "loss": 0.0205,
"macro_f1": 0.3333333432674408,
"num_tokens": 2883333.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020542226266115904,
+ "routers_loss": 0.0026695074047893286,
"skip_count": 0.0,
"step": 1788,
"text_loss": 0.9697831273078918
@@ -17003,13 +17003,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.1171875,
+ "grad_norm": 0.107421875,
"learning_rate": 0.000960731426385773,
- "loss": 0.0155,
+ "loss": 0.0157,
"macro_f1": 0.4871794879436493,
"num_tokens": 2887444.0,
"repeat_count": 0.0,
- "routers_loss": 0.0397041030228138,
+ "routers_loss": 0.029743613675236702,
"skip_count": 2.0,
"step": 1790,
"text_loss": 0.4737568199634552
@@ -17022,13 +17022,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.10107421875,
"learning_rate": 0.0009606111013228407,
- "loss": 0.0204,
+ "loss": 0.0207,
"macro_f1": 0.3333333432674408,
"num_tokens": 2890221.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017490010941401124,
+ "routers_loss": 0.0016153788892552257,
"skip_count": 0.0,
"step": 1792,
"text_loss": 0.6693558096885681
@@ -17041,13 +17041,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08251953125,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009604905997528655,
- "loss": 0.021,
+ "loss": 0.02,
"macro_f1": 0.3272727429866791,
"num_tokens": 2893262.0,
"repeat_count": 0.0,
- "routers_loss": 0.023590171709656715,
+ "routers_loss": 0.01965433731675148,
"skip_count": 1.0,
"step": 1794,
"text_loss": 0.45227760076522827
@@ -17060,13 +17060,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.08642578125,
"learning_rate": 0.0009603699217220239,
- "loss": 0.0125,
+ "loss": 0.0117,
"macro_f1": 0.6601307392120361,
"num_tokens": 2896823.0,
"repeat_count": 1.0,
- "routers_loss": 0.02458076737821102,
+ "routers_loss": 0.024017298594117165,
"skip_count": 2.0,
"step": 1796,
"text_loss": 0.48865509033203125
@@ -17079,13 +17079,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.08837890625,
"learning_rate": 0.0009602490672765597,
- "loss": 0.019,
+ "loss": 0.0182,
"macro_f1": 0.3333333432674408,
"num_tokens": 2899707.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014341498026624322,
+ "routers_loss": 0.0012420224957168102,
"skip_count": 0.0,
"step": 1798,
"text_loss": 0.43292415142059326
@@ -17098,13 +17098,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08056640625,
+ "grad_norm": 0.07861328125,
"learning_rate": 0.0009601280364627848,
- "loss": 0.02,
+ "loss": 0.0196,
"macro_f1": 0.3333333432674408,
"num_tokens": 2902795.0,
"repeat_count": 0.0,
- "routers_loss": 0.00213223067112267,
+ "routers_loss": 0.0020389219280332327,
"skip_count": 0.0,
"step": 1800,
"text_loss": 0.41021591424942017
@@ -17117,13 +17117,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07275390625,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009600068293270783,
- "loss": 0.0147,
+ "loss": 0.0142,
"macro_f1": 0.3333333432674408,
"num_tokens": 2905769.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027340995147824287,
+ "routers_loss": 0.002006303984671831,
"skip_count": 0.0,
"step": 1802,
"text_loss": 0.46892106533050537
@@ -17136,32 +17136,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.000959885445915887,
- "loss": 0.0172,
+ "loss": 0.017,
"macro_f1": 0.3333333432674408,
"num_tokens": 2909475.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035587961319833994,
+ "routers_loss": 0.003734810510650277,
"skip_count": 0.0,
"step": 1804,
"text_loss": 0.45364710688591003
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.5,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 8.479013795127678,
- "f1_execute": 0.9615384340286255,
- "f1_repeat": 0.0,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009597638862757254,
- "loss": 0.0187,
- "macro_f1": 0.5427350401878357,
+ "loss": 0.0182,
+ "macro_f1": 0.8823530077934265,
"num_tokens": 2914348.0,
"repeat_count": 1.0,
- "routers_loss": 0.04446055367588997,
+ "routers_loss": 0.038971323519945145,
"skip_count": 2.0,
"step": 1806,
"text_loss": 0.42913779616355896
@@ -17174,13 +17174,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08447265625,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009596421504531751,
- "loss": 0.0244,
+ "loss": 0.0249,
"macro_f1": 0.3272727429866791,
"num_tokens": 2917467.0,
"repeat_count": 1.0,
- "routers_loss": 0.05095123499631882,
+ "routers_loss": 0.04800829663872719,
"skip_count": 0.0,
"step": 1808,
"text_loss": 0.17332297563552856
@@ -17193,13 +17193,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.1083984375,
"learning_rate": 0.0009595202384948858,
- "loss": 0.0232,
+ "loss": 0.0227,
"macro_f1": 0.6666666865348816,
"num_tokens": 2920223.0,
"repeat_count": 1.0,
- "routers_loss": 0.008440068922936916,
+ "routers_loss": 0.009164143353700638,
"skip_count": 0.0,
"step": 1810,
"text_loss": 0.33740702271461487
@@ -17212,13 +17212,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0927734375,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009593981504475742,
- "loss": 0.0273,
+ "loss": 0.0275,
"macro_f1": 0.6666666865348816,
"num_tokens": 2923780.0,
"repeat_count": 0.0,
- "routers_loss": 0.012230116873979568,
+ "routers_loss": 0.011236993595957756,
"skip_count": 2.0,
"step": 1812,
"text_loss": 0.1609916388988495
@@ -17231,13 +17231,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1005859375,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009592758863580248,
- "loss": 0.026,
+ "loss": 0.0259,
"macro_f1": 0.5492662787437439,
"num_tokens": 2926259.0,
"repeat_count": 0.0,
- "routers_loss": 0.017307188361883163,
+ "routers_loss": 0.019026532769203186,
"skip_count": 2.0,
"step": 1814,
"text_loss": 0.6460903882980347
@@ -17250,13 +17250,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.099609375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009591534462730894,
- "loss": 0.0215,
+ "loss": 0.0206,
"macro_f1": 0.5492662787437439,
"num_tokens": 2929173.0,
"repeat_count": 2.0,
- "routers_loss": 0.07191162556409836,
+ "routers_loss": 0.0608333982527256,
"skip_count": 0.0,
"step": 1816,
"text_loss": 0.476126492023468
@@ -17269,13 +17269,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.06640625,
"learning_rate": 0.000959030830239687,
- "loss": 0.0182,
+ "loss": 0.0175,
"macro_f1": 0.3333333432674408,
"num_tokens": 2932703.0,
"repeat_count": 0.0,
- "routers_loss": 0.008753604255616665,
+ "routers_loss": 0.0093300249427557,
"skip_count": 0.0,
"step": 1818,
"text_loss": 0.5471875667572021
@@ -17288,13 +17288,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.19921875,
+ "grad_norm": 0.2001953125,
"learning_rate": 0.0009589080383048048,
- "loss": 0.0233,
+ "loss": 0.0235,
"macro_f1": 0.3333333432674408,
"num_tokens": 2936195.0,
"repeat_count": 0.0,
- "routers_loss": 0.008390828967094421,
+ "routers_loss": 0.010434109717607498,
"skip_count": 0.0,
"step": 1820,
"text_loss": 0.5068115592002869
@@ -17307,13 +17307,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009587850705154964,
"loss": 0.0291,
"macro_f1": 0.3333333432674408,
"num_tokens": 2939412.0,
"repeat_count": 0.0,
- "routers_loss": 0.005617359187453985,
+ "routers_loss": 0.004347751382738352,
"skip_count": 0.0,
"step": 1822,
"text_loss": 0.4241984784603119
@@ -17326,13 +17326,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.0859375,
"learning_rate": 0.0009586619269188836,
- "loss": 0.0227,
+ "loss": 0.0224,
"macro_f1": 0.32098767161369324,
"num_tokens": 2942318.0,
"repeat_count": 0.0,
- "routers_loss": 0.0346846878528595,
+ "routers_loss": 0.034238871186971664,
"skip_count": 1.0,
"step": 1824,
"text_loss": 0.2328975349664688
@@ -17345,32 +17345,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009585386075621553,
"loss": 0.027,
"macro_f1": 0.3333333432674408,
"num_tokens": 2945731.0,
"repeat_count": 0.0,
- "routers_loss": 0.006601692643016577,
+ "routers_loss": 0.006097695790231228,
"skip_count": 0.0,
"step": 1826,
"text_loss": 0.22816994786262512
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 8.582330496037569,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.08837890625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009584151124925676,
- "loss": 0.0207,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0208,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 2948944.0,
"repeat_count": 0.0,
- "routers_loss": 0.0065619745291769505,
+ "routers_loss": 0.007790776435285807,
"skip_count": 1.0,
"step": 1828,
"text_loss": 0.5009413361549377
@@ -17383,13 +17383,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009582914417574438,
- "loss": 0.0149,
+ "loss": 0.0145,
"macro_f1": 0.6666666865348816,
"num_tokens": 2951723.0,
"repeat_count": 0.0,
- "routers_loss": 0.011109639890491962,
+ "routers_loss": 0.009144559502601624,
"skip_count": 2.0,
"step": 1830,
"text_loss": 0.1402502954006195
@@ -17404,11 +17404,11 @@
"f1_skip": 0.0,
"grad_norm": 0.06201171875,
"learning_rate": 0.0009581675954041751,
- "loss": 0.0167,
+ "loss": 0.0166,
"macro_f1": 0.6666666865348816,
"num_tokens": 2954726.0,
"repeat_count": 1.0,
- "routers_loss": 0.008432094007730484,
+ "routers_loss": 0.006593191530555487,
"skip_count": 0.0,
"step": 1832,
"text_loss": 0.4871736466884613
@@ -17421,13 +17421,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0859375,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0009580435734802196,
- "loss": 0.0208,
+ "loss": 0.0206,
"macro_f1": 0.3333333432674408,
"num_tokens": 2957853.0,
"repeat_count": 0.0,
- "routers_loss": 0.011518111452460289,
+ "routers_loss": 0.01241068821400404,
"skip_count": 0.0,
"step": 1834,
"text_loss": 0.30100154876708984
@@ -17440,13 +17440,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.146484375,
+ "grad_norm": 0.1298828125,
"learning_rate": 0.0009579193760331027,
- "loss": 0.0211,
+ "loss": 0.022,
"macro_f1": 0.3333333432674408,
"num_tokens": 2960783.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026744187343865633,
+ "routers_loss": 0.002219218760728836,
"skip_count": 0.0,
"step": 1836,
"text_loss": 0.4961516559123993
@@ -17459,13 +17459,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.09619140625,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009577950031104169,
- "loss": 0.0165,
+ "loss": 0.0166,
"macro_f1": 0.6601307392120361,
"num_tokens": 2963328.0,
"repeat_count": 1.0,
- "routers_loss": 0.028107430785894394,
+ "routers_loss": 0.029363535344600677,
"skip_count": 2.0,
"step": 1838,
"text_loss": 0.42814353108406067
@@ -17478,13 +17478,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 1.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.1044921875,
"learning_rate": 0.0009576704547598226,
- "loss": 0.0263,
+ "loss": 0.0257,
"macro_f1": 0.7795917987823486,
"num_tokens": 2966108.0,
"repeat_count": 1.0,
- "routers_loss": 0.060007549822330475,
+ "routers_loss": 0.0579402856528759,
"skip_count": 4.0,
"step": 1840,
"text_loss": 0.20523512363433838
@@ -17497,13 +17497,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009575457310290463,
"loss": 0.0121,
"macro_f1": 0.3272727429866791,
"num_tokens": 2969137.0,
"repeat_count": 0.0,
- "routers_loss": 0.01074182614684105,
+ "routers_loss": 0.008810589089989662,
"skip_count": 0.0,
"step": 1842,
"text_loss": 0.6199528574943542
@@ -17516,13 +17516,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009574208319658831,
- "loss": 0.0213,
+ "loss": 0.0208,
"macro_f1": 0.6666666865348816,
"num_tokens": 2972407.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019638657104223967,
+ "routers_loss": 0.0012295129708945751,
"skip_count": 1.0,
"step": 1844,
"text_loss": 0.66938316822052
@@ -17535,13 +17535,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.1572265625,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.000957295757618194,
- "loss": 0.0156,
+ "loss": 0.0152,
"macro_f1": 0.4871794879436493,
"num_tokens": 2976045.0,
"repeat_count": 0.0,
- "routers_loss": 0.06953249871730804,
+ "routers_loss": 0.06162935495376587,
"skip_count": 2.0,
"step": 1846,
"text_loss": 0.5381782650947571
@@ -17554,13 +17554,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009571705080339079,
- "loss": 0.0154,
+ "loss": 0.0144,
"macro_f1": 0.3333333432674408,
"num_tokens": 2979025.0,
"repeat_count": 0.0,
- "routers_loss": 0.003563052974641323,
+ "routers_loss": 0.003950524143874645,
"skip_count": 0.0,
"step": 1848,
"text_loss": 0.5831671357154846
@@ -17573,13 +17573,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.11376953125,
"learning_rate": 0.0009570450832610208,
- "loss": 0.0216,
+ "loss": 0.0209,
"macro_f1": 0.3333333432674408,
"num_tokens": 2982276.0,
"repeat_count": 0.0,
- "routers_loss": 0.010409255512058735,
+ "routers_loss": 0.010354886762797832,
"skip_count": 0.0,
"step": 1850,
"text_loss": 0.27448201179504395
@@ -17592,13 +17592,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0009569194833475956,
- "loss": 0.0195,
+ "loss": 0.0199,
"macro_f1": 0.3272727429866791,
"num_tokens": 2985691.0,
"repeat_count": 0.0,
- "routers_loss": 0.009769548662006855,
+ "routers_loss": 0.010167439468204975,
"skip_count": 0.0,
"step": 1852,
"text_loss": 0.5264663696289062
@@ -17611,13 +17611,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1181640625,
+ "grad_norm": 0.1328125,
"learning_rate": 0.0009567937083417624,
- "loss": 0.0184,
+ "loss": 0.0194,
"macro_f1": 0.3272727429866791,
"num_tokens": 2989126.0,
"repeat_count": 0.0,
- "routers_loss": 0.036616452038288116,
+ "routers_loss": 0.0371871180832386,
"skip_count": 1.0,
"step": 1854,
"text_loss": 0.2008018046617508
@@ -17630,13 +17630,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.0009566677582917185,
- "loss": 0.0192,
+ "loss": 0.0184,
"macro_f1": 0.3333333432674408,
"num_tokens": 2992814.0,
"repeat_count": 0.0,
- "routers_loss": 0.009581349790096283,
+ "routers_loss": 0.010190588422119617,
"skip_count": 0.0,
"step": 1856,
"text_loss": 0.749717116355896
@@ -17649,13 +17649,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009565416332457282,
- "loss": 0.0138,
+ "loss": 0.0132,
"macro_f1": 0.6538461446762085,
"num_tokens": 2995729.0,
"repeat_count": 1.0,
- "routers_loss": 0.02330300398170948,
+ "routers_loss": 0.022285036742687225,
"skip_count": 1.0,
"step": 1858,
"text_loss": 0.5870219469070435
@@ -17668,13 +17668,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009564153332521228,
- "loss": 0.0226,
+ "loss": 0.0224,
"macro_f1": 0.3272727429866791,
"num_tokens": 2998812.0,
"repeat_count": 0.0,
- "routers_loss": 0.011985735036432743,
+ "routers_loss": 0.011050296947360039,
"skip_count": 1.0,
"step": 1860,
"text_loss": 0.8444408774375916
@@ -17687,13 +17687,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.06005859375,
"learning_rate": 0.0009562888583593005,
- "loss": 0.0162,
+ "loss": 0.0163,
"macro_f1": 0.3333333432674408,
"num_tokens": 3001799.0,
"repeat_count": 0.0,
- "routers_loss": 0.005997250322252512,
+ "routers_loss": 0.007125461008399725,
"skip_count": 0.0,
"step": 1862,
"text_loss": 0.41510361433029175
@@ -17706,13 +17706,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009561622086157272,
- "loss": 0.0243,
+ "loss": 0.0236,
"macro_f1": 0.3333333432674408,
"num_tokens": 3005088.0,
"repeat_count": 0.0,
- "routers_loss": 0.004814761225134134,
+ "routers_loss": 0.0049054501578211784,
"skip_count": 0.0,
"step": 1864,
"text_loss": 0.3801248073577881
@@ -17725,13 +17725,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.056884765625,
+ "grad_norm": 0.054443359375,
"learning_rate": 0.000956035384069935,
- "loss": 0.0242,
+ "loss": 0.0238,
"macro_f1": 1.0,
"num_tokens": 3008178.0,
"repeat_count": 1.0,
- "routers_loss": 0.004750931169837713,
+ "routers_loss": 0.005162427201867104,
"skip_count": 1.0,
"step": 1866,
"text_loss": 0.2687684893608093
@@ -17744,13 +17744,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1123046875,
+ "grad_norm": 0.10400390625,
"learning_rate": 0.0009559083847705233,
- "loss": 0.0216,
+ "loss": 0.0214,
"macro_f1": 0.3272727429866791,
"num_tokens": 3010923.0,
"repeat_count": 0.0,
- "routers_loss": 0.038251202553510666,
+ "routers_loss": 0.028984658420085907,
"skip_count": 1.0,
"step": 1868,
"text_loss": 0.6277349591255188
@@ -17763,13 +17763,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009557812107661584,
- "loss": 0.0204,
+ "loss": 0.0208,
"macro_f1": 1.0,
"num_tokens": 3015030.0,
"repeat_count": 1.0,
- "routers_loss": 0.010951942764222622,
+ "routers_loss": 0.012200530618429184,
"skip_count": 1.0,
"step": 1870,
"text_loss": 0.6293368339538574
@@ -17782,13 +17782,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.11962890625,
"learning_rate": 0.0009556538621055739,
- "loss": 0.0265,
+ "loss": 0.0268,
"macro_f1": 0.3272727429866791,
"num_tokens": 3019067.0,
"repeat_count": 0.0,
- "routers_loss": 0.06582094728946686,
+ "routers_loss": 0.06365182995796204,
"skip_count": 1.0,
"step": 1872,
"text_loss": 0.39046618342399597
@@ -17796,18 +17796,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 8.798356325212797,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.12353515625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.115234375,
"learning_rate": 0.0009555263388375699,
- "loss": 0.0143,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.014,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3022166.0,
"repeat_count": 0.0,
- "routers_loss": 0.008920271880924702,
+ "routers_loss": 0.0041703456081449986,
"skip_count": 1.0,
"step": 1874,
"text_loss": 0.42232340574264526
@@ -17820,13 +17820,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1220703125,
+ "grad_norm": 0.11572265625,
"learning_rate": 0.0009553986410110134,
"loss": 0.016,
"macro_f1": 0.3333333432674408,
"num_tokens": 3025865.0,
"repeat_count": 0.0,
- "routers_loss": 0.006444344762712717,
+ "routers_loss": 0.005841755773872137,
"skip_count": 0.0,
"step": 1876,
"text_loss": 0.37600573897361755
@@ -17839,13 +17839,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.09228515625,
"learning_rate": 0.0009552707686748388,
- "loss": 0.022,
+ "loss": 0.0219,
"macro_f1": 0.3272727429866791,
"num_tokens": 3029950.0,
"repeat_count": 0.0,
- "routers_loss": 0.05197767913341522,
+ "routers_loss": 0.05165952071547508,
"skip_count": 1.0,
"step": 1878,
"text_loss": 0.33717799186706543
@@ -17858,13 +17858,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009551427218780467,
- "loss": 0.0224,
+ "loss": 0.0219,
"macro_f1": 0.6666666865348816,
"num_tokens": 3033649.0,
"repeat_count": 0.0,
- "routers_loss": 0.017570581287145615,
+ "routers_loss": 0.020680008456110954,
"skip_count": 2.0,
"step": 1880,
"text_loss": 0.5011783838272095
@@ -17877,13 +17877,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.173828125,
+ "grad_norm": 0.15625,
"learning_rate": 0.0009550145006697048,
- "loss": 0.0225,
+ "loss": 0.0217,
"macro_f1": 0.32098764181137085,
"num_tokens": 3036847.0,
"repeat_count": 0.0,
- "routers_loss": 0.07106777280569077,
+ "routers_loss": 0.07626450061798096,
"skip_count": 2.0,
"step": 1882,
"text_loss": 0.3066408336162567
@@ -17896,13 +17896,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0009548861050989482,
- "loss": 0.0139,
+ "loss": 0.0136,
"macro_f1": 1.0,
"num_tokens": 3040353.0,
"repeat_count": 1.0,
- "routers_loss": 0.009862381964921951,
+ "routers_loss": 0.010884666815400124,
"skip_count": 1.0,
"step": 1884,
"text_loss": 0.49779415130615234
@@ -17915,13 +17915,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009547575352149778,
- "loss": 0.0209,
+ "loss": 0.0213,
"macro_f1": 0.6666666865348816,
"num_tokens": 3043504.0,
"repeat_count": 0.0,
- "routers_loss": 0.006928981747478247,
+ "routers_loss": 0.006704333238303661,
"skip_count": 2.0,
"step": 1886,
"text_loss": 0.12284614145755768
@@ -17934,13 +17934,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09423828125,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0009546287910670621,
"loss": 0.0211,
"macro_f1": 0.5427350401878357,
"num_tokens": 3046422.0,
"repeat_count": 1.0,
- "routers_loss": 0.04788029566407204,
+ "routers_loss": 0.04799000173807144,
"skip_count": 2.0,
"step": 1888,
"text_loss": 0.1824081838130951
@@ -17953,13 +17953,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.1484375,
"learning_rate": 0.0009544998727045361,
- "loss": 0.0299,
+ "loss": 0.0306,
"macro_f1": 0.3333333432674408,
"num_tokens": 3049819.0,
"repeat_count": 0.0,
- "routers_loss": 0.008282946422696114,
+ "routers_loss": 0.008139612153172493,
"skip_count": 0.0,
"step": 1890,
"text_loss": 0.18929053843021393
@@ -17972,32 +17972,32 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.09375,
"learning_rate": 0.0009543707801768015,
- "loss": 0.0181,
+ "loss": 0.0175,
"macro_f1": 0.5934640765190125,
"num_tokens": 3052766.0,
"repeat_count": 0.0,
- "routers_loss": 0.03251546248793602,
+ "routers_loss": 0.02966771461069584,
"skip_count": 3.0,
"step": 1892,
"text_loss": 0.247748002409935
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 24.0,
+ "acc_skip": 0.5,
+ "avg_layers": 25.0,
"epoch": 8.892280598767243,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.06640625,
+ "f1_skip": 0.4000000059604645,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009542415135333267,
- "loss": 0.0195,
- "macro_f1": 0.542222261428833,
+ "loss": 0.0193,
+ "macro_f1": 0.44705885648727417,
"num_tokens": 3056427.0,
"repeat_count": 0.0,
- "routers_loss": 0.03368280455470085,
+ "routers_loss": 0.03637036308646202,
"skip_count": 2.0,
"step": 1894,
"text_loss": 0.2583999037742615
@@ -18010,13 +18010,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.0009541120728236472,
- "loss": 0.0133,
+ "loss": 0.0136,
"macro_f1": 0.3333333432674408,
"num_tokens": 3059497.0,
"repeat_count": 0.0,
- "routers_loss": 0.0069940583780407906,
+ "routers_loss": 0.007026574574410915,
"skip_count": 0.0,
"step": 1896,
"text_loss": 0.5222375988960266
@@ -18029,13 +18029,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0810546875,
+ "grad_norm": 0.076171875,
"learning_rate": 0.0009539824580973646,
- "loss": 0.0221,
+ "loss": 0.0219,
"macro_f1": 0.3333333432674408,
"num_tokens": 3062187.0,
"repeat_count": 0.0,
- "routers_loss": 0.004268508404493332,
+ "routers_loss": 0.003449335927143693,
"skip_count": 0.0,
"step": 1898,
"text_loss": 0.5736427307128906
@@ -18048,13 +18048,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0009538526694041477,
- "loss": 0.0159,
+ "loss": 0.0163,
"macro_f1": 0.3333333432674408,
"num_tokens": 3066100.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032616283278912306,
+ "routers_loss": 0.0035463871899992228,
"skip_count": 0.0,
"step": 1900,
"text_loss": 0.5471583604812622
@@ -18067,13 +18067,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.08056640625,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009537227067937318,
- "loss": 0.023,
+ "loss": 0.0233,
"macro_f1": 1.0,
"num_tokens": 3068737.0,
"repeat_count": 3.0,
- "routers_loss": 0.005389219615608454,
+ "routers_loss": 0.00597514258697629,
"skip_count": 3.0,
"step": 1902,
"text_loss": 0.36644190549850464
@@ -18086,13 +18086,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.166015625,
"learning_rate": 0.0009535925703159186,
- "loss": 0.0311,
+ "loss": 0.0301,
"macro_f1": 0.32098764181137085,
"num_tokens": 3071686.0,
"repeat_count": 0.0,
- "routers_loss": 0.024814991280436516,
+ "routers_loss": 0.025420479476451874,
"skip_count": 2.0,
"step": 1904,
"text_loss": 0.535789966583252
@@ -18107,11 +18107,11 @@
"f1_skip": 0.0,
"grad_norm": 0.07568359375,
"learning_rate": 0.0009534622600205769,
- "loss": 0.0151,
+ "loss": 0.0145,
"macro_f1": 0.3333333432674408,
"num_tokens": 3074954.0,
"repeat_count": 0.0,
- "routers_loss": 0.013415839523077011,
+ "routers_loss": 0.014377486892044544,
"skip_count": 0.0,
"step": 1906,
"text_loss": 0.19009549915790558
@@ -18124,13 +18124,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.115234375,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0009533317759576416,
- "loss": 0.019,
+ "loss": 0.0197,
"macro_f1": 0.3333333432674408,
"num_tokens": 3077540.0,
"repeat_count": 0.0,
- "routers_loss": 0.005814475007355213,
+ "routers_loss": 0.004848944488912821,
"skip_count": 0.0,
"step": 1908,
"text_loss": 0.5022001266479492
@@ -18143,13 +18143,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009532011181771148,
- "loss": 0.0218,
+ "loss": 0.0217,
"macro_f1": 0.6666666865348816,
"num_tokens": 3080445.0,
"repeat_count": 0.0,
- "routers_loss": 0.007621586322784424,
+ "routers_loss": 0.009480170905590057,
"skip_count": 2.0,
"step": 1910,
"text_loss": 0.35135936737060547
@@ -18162,13 +18162,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09765625,
+ "grad_norm": 0.10400390625,
"learning_rate": 0.0009530702867290644,
- "loss": 0.0178,
+ "loss": 0.0185,
"macro_f1": 0.3333333432674408,
"num_tokens": 3083657.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020917020738124847,
+ "routers_loss": 0.0019353039097040892,
"skip_count": 0.0,
"step": 1912,
"text_loss": 0.5123994946479797
@@ -18181,13 +18181,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.123046875,
+ "grad_norm": 0.1455078125,
"learning_rate": 0.0009529392816636256,
- "loss": 0.025,
+ "loss": 0.0249,
"macro_f1": 0.3333333432674408,
"num_tokens": 3086837.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010824954370036721,
+ "routers_loss": 0.0010921972570940852,
"skip_count": 0.0,
"step": 1914,
"text_loss": 0.44477662444114685
@@ -18200,13 +18200,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1884765625,
+ "grad_norm": 0.19140625,
"learning_rate": 0.0009528081030309995,
- "loss": 0.0353,
+ "loss": 0.0351,
"macro_f1": 0.3333333432674408,
"num_tokens": 3089892.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018075350672006607,
+ "routers_loss": 0.0018027103506028652,
"skip_count": 0.0,
"step": 1916,
"text_loss": 0.7356183528900146
@@ -18219,13 +18219,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07958984375,
+ "grad_norm": 0.07568359375,
"learning_rate": 0.0009526767508814542,
- "loss": 0.0235,
+ "loss": 0.0236,
"macro_f1": 0.3333333432674408,
"num_tokens": 3093058.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032930250745266676,
+ "routers_loss": 0.003243023296818137,
"skip_count": 0.0,
"step": 1918,
"text_loss": 0.48823556303977966
@@ -18238,13 +18238,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009525452252653239,
- "loss": 0.0184,
+ "loss": 0.0175,
"macro_f1": 0.3333333432674408,
"num_tokens": 3096404.0,
"repeat_count": 0.0,
- "routers_loss": 0.009042349644005299,
+ "routers_loss": 0.009360014460980892,
"skip_count": 0.0,
"step": 1920,
"text_loss": 0.21498437225818634
@@ -18257,13 +18257,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.1103515625,
+ "grad_norm": 0.140625,
"learning_rate": 0.0009524135262330098,
- "loss": 0.022,
+ "loss": 0.0224,
"macro_f1": 0.9265305995941162,
"num_tokens": 3099520.0,
"repeat_count": 1.0,
- "routers_loss": 0.016776500269770622,
+ "routers_loss": 0.017444295808672905,
"skip_count": 3.0,
"step": 1922,
"text_loss": 0.27608850598335266
@@ -18276,13 +18276,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.0009522816538349789,
- "loss": 0.016,
+ "loss": 0.0162,
"macro_f1": 0.5492662787437439,
"num_tokens": 3102956.0,
"repeat_count": 0.0,
- "routers_loss": 0.06579705327749252,
+ "routers_loss": 0.06424452364444733,
"skip_count": 2.0,
"step": 1924,
"text_loss": 0.21558666229248047
@@ -18295,13 +18295,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.058349609375,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0009521496081217651,
- "loss": 0.0113,
+ "loss": 0.0112,
"macro_f1": 0.6666666865348816,
"num_tokens": 3106565.0,
"repeat_count": 1.0,
- "routers_loss": 0.0022786022163927555,
+ "routers_loss": 0.002270506462082267,
"skip_count": 0.0,
"step": 1926,
"text_loss": 0.5641813278198242
@@ -18314,13 +18314,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.09033203125,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009520173891439684,
"loss": 0.0216,
"macro_f1": 0.6666666865348816,
"num_tokens": 3109314.0,
"repeat_count": 0.0,
- "routers_loss": 0.01074281521141529,
+ "routers_loss": 0.011512448079884052,
"skip_count": 1.0,
"step": 1928,
"text_loss": 0.6351624727249146
@@ -18333,13 +18333,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009518849969522556,
- "loss": 0.0201,
+ "loss": 0.0198,
"macro_f1": 0.3333333432674408,
"num_tokens": 3112956.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032052614260464907,
+ "routers_loss": 0.003883908037096262,
"skip_count": 0.0,
"step": 1930,
"text_loss": 0.35160085558891296
@@ -18352,32 +18352,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.10888671875,
"learning_rate": 0.0009517524315973595,
- "loss": 0.0186,
+ "loss": 0.019,
"macro_f1": 1.0,
"num_tokens": 3115593.0,
"repeat_count": 1.0,
- "routers_loss": 0.008593574166297913,
+ "routers_loss": 0.009479222819209099,
"skip_count": 3.0,
"step": 1932,
"text_loss": 0.2900560200214386
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 9.079835632521279,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.07373046875,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0771484375,
"learning_rate": 0.0009516196931300794,
- "loss": 0.0152,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3118516.0,
"repeat_count": 0.0,
- "routers_loss": 0.0201246440410614,
+ "routers_loss": 0.017834696918725967,
"skip_count": 2.0,
"step": 1934,
"text_loss": 0.20094378292560577
@@ -18390,13 +18390,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1357421875,
+ "grad_norm": 0.12890625,
"learning_rate": 0.0009514867816012809,
- "loss": 0.0199,
+ "loss": 0.02,
"macro_f1": 0.3333333432674408,
"num_tokens": 3122242.0,
"repeat_count": 0.0,
- "routers_loss": 0.001721356064081192,
+ "routers_loss": 0.0017964740982279181,
"skip_count": 0.0,
"step": 1936,
"text_loss": 0.6498590707778931
@@ -18409,13 +18409,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.049072265625,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0009513536970618961,
- "loss": 0.0135,
+ "loss": 0.013,
"macro_f1": 0.6666666865348816,
"num_tokens": 3125645.0,
"repeat_count": 0.0,
- "routers_loss": 0.010442634113132954,
+ "routers_loss": 0.007437168620526791,
"skip_count": 2.0,
"step": 1938,
"text_loss": 0.25863033533096313
@@ -18428,13 +18428,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.058349609375,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009512204395629232,
- "loss": 0.019,
+ "loss": 0.0184,
"macro_f1": 0.6666666865348816,
"num_tokens": 3128740.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009493798715993762,
+ "routers_loss": 0.0008759932243265212,
"skip_count": 1.0,
"step": 1940,
"text_loss": 0.5638351440429688
@@ -18447,13 +18447,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05517578125,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009510870091554264,
- "loss": 0.0149,
+ "loss": 0.0153,
"macro_f1": 0.3272727429866791,
"num_tokens": 3131742.0,
"repeat_count": 1.0,
- "routers_loss": 0.022104881703853607,
+ "routers_loss": 0.019906625151634216,
"skip_count": 0.0,
"step": 1942,
"text_loss": 0.8410717844963074
@@ -18466,13 +18466,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10400390625,
+ "grad_norm": 0.12255859375,
"learning_rate": 0.0009509534058905369,
- "loss": 0.0164,
+ "loss": 0.016,
"macro_f1": 0.3333333432674408,
"num_tokens": 3134407.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009013625676743686,
+ "routers_loss": 0.0009229081333614886,
"skip_count": 0.0,
"step": 1944,
"text_loss": 0.47506049275398254
@@ -18485,13 +18485,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06103515625,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0009508196298194517,
- "loss": 0.0121,
+ "loss": 0.0123,
"macro_f1": 0.3333333432674408,
"num_tokens": 3137053.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028069843538105488,
+ "routers_loss": 0.003630586201325059,
"skip_count": 0.0,
"step": 1946,
"text_loss": 0.32225799560546875
@@ -18504,13 +18504,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009506856809934338,
- "loss": 0.0116,
+ "loss": 0.0119,
"macro_f1": 0.3333333432674408,
"num_tokens": 3140943.0,
"repeat_count": 0.0,
- "routers_loss": 0.006877045147120953,
+ "routers_loss": 0.007580445148050785,
"skip_count": 0.0,
"step": 1948,
"text_loss": 0.3120577931404114
@@ -18523,13 +18523,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0009505515594638127,
- "loss": 0.0127,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 3144298.0,
"repeat_count": 0.0,
- "routers_loss": 0.004543667659163475,
+ "routers_loss": 0.004471861757338047,
"skip_count": 0.0,
"step": 1950,
"text_loss": 0.22052447497844696
@@ -18542,13 +18542,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.09130859375,
"learning_rate": 0.0009504172652819843,
- "loss": 0.0232,
+ "loss": 0.023,
"macro_f1": 1.0,
"num_tokens": 3147069.0,
"repeat_count": 1.0,
- "routers_loss": 0.007053609937429428,
+ "routers_loss": 0.009606664068996906,
"skip_count": 1.0,
"step": 1952,
"text_loss": 0.34773921966552734
@@ -18561,13 +18561,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0537109375,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009502827984994099,
- "loss": 0.0146,
+ "loss": 0.0148,
"macro_f1": 0.6666666865348816,
"num_tokens": 3149992.0,
"repeat_count": 0.0,
- "routers_loss": 0.006783280987292528,
+ "routers_loss": 0.006443799939006567,
"skip_count": 1.0,
"step": 1954,
"text_loss": 0.6442171335220337
@@ -18580,13 +18580,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.0009501481591676177,
- "loss": 0.0181,
+ "loss": 0.0188,
"macro_f1": 0.3333333432674408,
"num_tokens": 3153167.0,
"repeat_count": 0.0,
- "routers_loss": 0.002531677018851042,
+ "routers_loss": 0.003219039412215352,
"skip_count": 0.0,
"step": 1956,
"text_loss": 0.43369221687316895
@@ -18599,32 +18599,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.000950013347338202,
- "loss": 0.0154,
+ "loss": 0.0152,
"macro_f1": 0.3272727429866791,
"num_tokens": 3156590.0,
"repeat_count": 0.0,
- "routers_loss": 0.027040868997573853,
+ "routers_loss": 0.025551019236445427,
"skip_count": 1.0,
"step": 1958,
"text_loss": 0.294479101896286
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 1.0,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 9.201937188142061,
- "f1_execute": 0.9803921580314636,
- "f1_repeat": 0.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009498783630628225,
- "loss": 0.0154,
- "macro_f1": 0.6601307392120361,
+ "loss": 0.0158,
+ "macro_f1": 1.0,
"num_tokens": 3159451.0,
"repeat_count": 1.0,
- "routers_loss": 0.01573321223258972,
+ "routers_loss": 0.013802438974380493,
"skip_count": 2.0,
"step": 1960,
"text_loss": 0.20888492465019226
@@ -18637,13 +18637,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06689453125,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009497432063932057,
- "loss": 0.0135,
+ "loss": 0.0137,
"macro_f1": 0.6601307392120361,
"num_tokens": 3162889.0,
"repeat_count": 1.0,
- "routers_loss": 0.02442278526723385,
+ "routers_loss": 0.02852988988161087,
"skip_count": 2.0,
"step": 1962,
"text_loss": 0.5027125477790833
@@ -18656,13 +18656,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.046630859375,
+ "grad_norm": 0.045166015625,
"learning_rate": 0.0009496078773811437,
- "loss": 0.0142,
+ "loss": 0.0136,
"macro_f1": 0.6666666865348816,
"num_tokens": 3165979.0,
"repeat_count": 0.0,
- "routers_loss": 0.018267054110765457,
+ "routers_loss": 0.01784522272646427,
"skip_count": 2.0,
"step": 1964,
"text_loss": 0.1696339100599289
@@ -18675,13 +18675,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.060302734375,
"learning_rate": 0.000949472376078495,
- "loss": 0.0162,
+ "loss": 0.016,
"macro_f1": 0.3333333432674408,
"num_tokens": 3168683.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016024474753066897,
+ "routers_loss": 0.0017019887454807758,
"skip_count": 0.0,
"step": 1966,
"text_loss": 0.48905447125434875
@@ -18694,13 +18694,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.052978515625,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.000949336702537184,
- "loss": 0.011,
+ "loss": 0.0108,
"macro_f1": 0.6666666865348816,
"num_tokens": 3171968.0,
"repeat_count": 0.0,
- "routers_loss": 0.004668849054723978,
+ "routers_loss": 0.004817947279661894,
"skip_count": 2.0,
"step": 1968,
"text_loss": 0.20984773337841034
@@ -18713,13 +18713,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0009492008568092007,
- "loss": 0.0098,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 3175947.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011657609138637781,
+ "routers_loss": 0.0012963006738573313,
"skip_count": 0.0,
"step": 1970,
"text_loss": 0.5215106010437012
@@ -18732,13 +18732,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.04248046875,
+ "grad_norm": 0.044921875,
"learning_rate": 0.0009490648389466019,
- "loss": 0.0133,
+ "loss": 0.0135,
"macro_f1": 0.4871794879436493,
"num_tokens": 3179348.0,
"repeat_count": 0.0,
- "routers_loss": 0.03806794434785843,
+ "routers_loss": 0.03950481489300728,
"skip_count": 2.0,
"step": 1972,
"text_loss": 0.24640929698944092
@@ -18751,13 +18751,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08837890625,
+ "grad_norm": 0.09326171875,
"learning_rate": 0.0009489286490015097,
- "loss": 0.0189,
+ "loss": 0.0183,
"macro_f1": 0.6666666865348816,
"num_tokens": 3182640.0,
"repeat_count": 0.0,
- "routers_loss": 0.005107097327709198,
+ "routers_loss": 0.0043345349840819836,
"skip_count": 2.0,
"step": 1974,
"text_loss": 0.6362852454185486
@@ -18770,13 +18770,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0009487922870261122,
- "loss": 0.0156,
+ "loss": 0.0155,
"macro_f1": 0.3333333432674408,
"num_tokens": 3185657.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013696947135031223,
+ "routers_loss": 0.0015687479171901941,
"skip_count": 0.0,
"step": 1976,
"text_loss": 0.8977144360542297
@@ -18789,13 +18789,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0009486557530726638,
- "loss": 0.0136,
+ "loss": 0.0139,
"macro_f1": 0.3333333432674408,
"num_tokens": 3188772.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012224154779687524,
+ "routers_loss": 0.0010977238416671753,
"skip_count": 0.0,
"step": 1978,
"text_loss": 0.38512736558914185
@@ -18808,13 +18808,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09423828125,
+ "grad_norm": 0.11279296875,
"learning_rate": 0.0009485190471934844,
"loss": 0.0196,
"macro_f1": 0.6666666865348816,
"num_tokens": 3193131.0,
"repeat_count": 2.0,
- "routers_loss": 0.0030119111761450768,
+ "routers_loss": 0.002264744369313121,
"skip_count": 0.0,
"step": 1980,
"text_loss": 0.4171289801597595
@@ -18827,13 +18827,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.12451171875,
+ "grad_norm": 0.09033203125,
"learning_rate": 0.00094838216944096,
- "loss": 0.0222,
+ "loss": 0.0219,
"macro_f1": 0.3272727429866791,
"num_tokens": 3196668.0,
"repeat_count": 0.0,
- "routers_loss": 0.04286033287644386,
+ "routers_loss": 0.042320676147937775,
"skip_count": 1.0,
"step": 1982,
"text_loss": 0.19008000195026398
@@ -18846,32 +18846,32 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.053466796875,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0009482451198675424,
- "loss": 0.0158,
+ "loss": 0.0151,
"macro_f1": 0.32098767161369324,
"num_tokens": 3200282.0,
"repeat_count": 0.0,
- "routers_loss": 0.019988590851426125,
+ "routers_loss": 0.01796630397439003,
"skip_count": 1.0,
"step": 1984,
"text_loss": 0.5009249448776245
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 9.324038743762841,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.0634765625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.061767578125,
"learning_rate": 0.0009481078985257494,
- "loss": 0.0154,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0147,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3204439.0,
"repeat_count": 0.0,
- "routers_loss": 0.012215938419103622,
+ "routers_loss": 0.01052347756922245,
"skip_count": 1.0,
"step": 1986,
"text_loss": 0.15319275856018066
@@ -18884,13 +18884,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009479705054681644,
- "loss": 0.0149,
+ "loss": 0.015,
"macro_f1": 0.3076923191547394,
"num_tokens": 3207590.0,
"repeat_count": 1.0,
- "routers_loss": 0.10747655481100082,
+ "routers_loss": 0.09640293568372726,
"skip_count": 3.0,
"step": 1988,
"text_loss": 0.3654652535915375
@@ -18903,13 +18903,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009478329407474366,
- "loss": 0.0186,
+ "loss": 0.0183,
"macro_f1": 0.5492662787437439,
"num_tokens": 3211172.0,
"repeat_count": 0.0,
- "routers_loss": 0.016109853982925415,
+ "routers_loss": 0.012670112773776054,
"skip_count": 1.0,
"step": 1990,
"text_loss": 0.5817596316337585
@@ -18922,13 +18922,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.05859375,
"learning_rate": 0.000947695204416281,
- "loss": 0.0116,
+ "loss": 0.0121,
"macro_f1": 0.6666666865348816,
"num_tokens": 3214050.0,
"repeat_count": 1.0,
- "routers_loss": 0.006929324474185705,
+ "routers_loss": 0.005263707600533962,
"skip_count": 0.0,
"step": 1992,
"text_loss": 0.5985888242721558
@@ -18941,13 +18941,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009475572965274787,
- "loss": 0.0147,
+ "loss": 0.0144,
"macro_f1": 0.3272727429866791,
"num_tokens": 3217318.0,
"repeat_count": 1.0,
- "routers_loss": 0.0715102106332779,
+ "routers_loss": 0.0682850033044815,
"skip_count": 0.0,
"step": 1994,
"text_loss": 0.316506564617157
@@ -18960,13 +18960,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.052490234375,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.000947419217133876,
- "loss": 0.0187,
+ "loss": 0.019,
"macro_f1": 0.6666666865348816,
"num_tokens": 3220012.0,
"repeat_count": 0.0,
- "routers_loss": 0.008499355986714363,
+ "routers_loss": 0.008508823812007904,
"skip_count": 2.0,
"step": 1996,
"text_loss": 0.09665893763303757
@@ -18979,13 +18979,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.048583984375,
+ "grad_norm": 0.053466796875,
"learning_rate": 0.0009472809662883852,
- "loss": 0.0162,
+ "loss": 0.0155,
"macro_f1": 1.0,
"num_tokens": 3223019.0,
"repeat_count": 1.0,
- "routers_loss": 0.012003371492028236,
+ "routers_loss": 0.01100847590714693,
"skip_count": 2.0,
"step": 1998,
"text_loss": 0.4938808083534241
@@ -18998,13 +18998,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0009471425440439844,
- "loss": 0.0137,
+ "loss": 0.0135,
"macro_f1": 0.8817967176437378,
"num_tokens": 3226013.0,
"repeat_count": 2.0,
- "routers_loss": 0.0529167577624321,
+ "routers_loss": 0.04953207075595856,
"skip_count": 3.0,
"step": 2000,
"text_loss": 0.22258254885673523
@@ -19017,13 +19017,13 @@
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.076171875,
+ "grad_norm": 0.07568359375,
"learning_rate": 0.0009470039504537173,
- "loss": 0.0185,
+ "loss": 0.0186,
"macro_f1": 0.31446540355682373,
"num_tokens": 3230031.0,
"repeat_count": 0.0,
- "routers_loss": 0.05719539523124695,
+ "routers_loss": 0.052884332835674286,
"skip_count": 2.0,
"step": 2002,
"text_loss": 0.1741616576910019
@@ -19038,11 +19038,11 @@
"f1_skip": 0.0,
"grad_norm": 0.0869140625,
"learning_rate": 0.0009468651855706931,
- "loss": 0.0205,
+ "loss": 0.0204,
"macro_f1": 0.6666666865348816,
"num_tokens": 3232991.0,
"repeat_count": 1.0,
- "routers_loss": 0.007613501511514187,
+ "routers_loss": 0.008056716993451118,
"skip_count": 0.0,
"step": 2004,
"text_loss": 0.3173636198043823
@@ -19055,13 +19055,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0009467262494480868,
- "loss": 0.014,
+ "loss": 0.0136,
"macro_f1": 0.3333333432674408,
"num_tokens": 3236390.0,
"repeat_count": 0.0,
- "routers_loss": 0.005654903594404459,
+ "routers_loss": 0.0053409393876791,
"skip_count": 0.0,
"step": 2006,
"text_loss": 0.5806330442428589
@@ -19074,13 +19074,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07958984375,
+ "grad_norm": 0.068359375,
"learning_rate": 0.000946587142139139,
- "loss": 0.0152,
+ "loss": 0.0147,
"macro_f1": 0.3333333432674408,
"num_tokens": 3239267.0,
"repeat_count": 0.0,
- "routers_loss": 0.001680699409916997,
+ "routers_loss": 0.0015652200672775507,
"skip_count": 0.0,
"step": 2008,
"text_loss": 0.6214317679405212
@@ -19093,13 +19093,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.11376953125,
"learning_rate": 0.000946447863697156,
- "loss": 0.0171,
+ "loss": 0.0151,
"macro_f1": 0.6601307392120361,
"num_tokens": 3242569.0,
"repeat_count": 1.0,
- "routers_loss": 0.014179535210132599,
+ "routers_loss": 0.011673987843096256,
"skip_count": 2.0,
"step": 2010,
"text_loss": 0.532565712928772
@@ -19112,13 +19112,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.04345703125,
"learning_rate": 0.0009463084141755093,
- "loss": 0.0157,
+ "loss": 0.0159,
"macro_f1": 0.3272727429866791,
"num_tokens": 3245669.0,
"repeat_count": 0.0,
- "routers_loss": 0.026209332048892975,
+ "routers_loss": 0.028480790555477142,
"skip_count": 1.0,
"step": 2012,
"text_loss": 0.25210800766944885
@@ -19131,13 +19131,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08349609375,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0009461687936276364,
- "loss": 0.0134,
+ "loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 3248751.0,
"repeat_count": 0.0,
- "routers_loss": 0.008315940387547016,
+ "routers_loss": 0.007234727032482624,
"skip_count": 0.0,
"step": 2014,
"text_loss": 0.35922971367836
@@ -19150,13 +19150,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.06689453125,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0009460290021070402,
- "loss": 0.0197,
+ "loss": 0.0195,
"macro_f1": 0.6666666865348816,
"num_tokens": 3252614.0,
"repeat_count": 1.0,
- "routers_loss": 0.01872348040342331,
+ "routers_loss": 0.014691276475787163,
"skip_count": 0.0,
"step": 2016,
"text_loss": 0.2747853398323059
@@ -19169,13 +19169,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05126953125,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0009458890396672888,
"loss": 0.0186,
"macro_f1": 0.3333333432674408,
"num_tokens": 3256374.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024314222391694784,
+ "routers_loss": 0.002385235857218504,
"skip_count": 0.0,
"step": 2018,
"text_loss": 0.5268719792366028
@@ -19188,13 +19188,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.052978515625,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0009457489063620164,
- "loss": 0.0137,
+ "loss": 0.0133,
"macro_f1": 0.8823530077934265,
"num_tokens": 3259792.0,
"repeat_count": 1.0,
- "routers_loss": 0.04815426841378212,
+ "routers_loss": 0.047268565744161606,
"skip_count": 2.0,
"step": 2020,
"text_loss": 0.7785539627075195
@@ -19207,13 +19207,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.13671875,
+ "grad_norm": 0.1494140625,
"learning_rate": 0.0009456086022449221,
- "loss": 0.0209,
+ "loss": 0.0218,
"macro_f1": 0.3272727429866791,
"num_tokens": 3262833.0,
"repeat_count": 0.0,
- "routers_loss": 0.015121756121516228,
+ "routers_loss": 0.015878718346357346,
"skip_count": 1.0,
"step": 2022,
"text_loss": 0.42270028591156006
@@ -19226,32 +19226,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.08935546875,
"learning_rate": 0.0009454681273697711,
- "loss": 0.0122,
+ "loss": 0.0117,
"macro_f1": 0.3272727429866791,
"num_tokens": 3265718.0,
"repeat_count": 1.0,
- "routers_loss": 0.030219297856092453,
+ "routers_loss": 0.030749641358852386,
"skip_count": 0.0,
"step": 2024,
"text_loss": 0.18668225407600403
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 9.511887290871735,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.05419921875,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0009453274817903931,
- "loss": 0.0132,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.012,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3268158.0,
"repeat_count": 0.0,
- "routers_loss": 0.013256299309432507,
+ "routers_loss": 0.011538166552782059,
"skip_count": 1.0,
"step": 2026,
"text_loss": 0.34090787172317505
@@ -19264,13 +19264,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11572265625,
+ "grad_norm": 0.099609375,
"learning_rate": 0.000945186665560684,
- "loss": 0.0232,
+ "loss": 0.0218,
"macro_f1": 0.3333333432674408,
"num_tokens": 3271082.0,
"repeat_count": 0.0,
- "routers_loss": 0.009389489889144897,
+ "routers_loss": 0.009527760557830334,
"skip_count": 0.0,
"step": 2028,
"text_loss": 0.2110334187746048
@@ -19283,13 +19283,13 @@
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.119140625,
"learning_rate": 0.000945045678734605,
- "loss": 0.0178,
+ "loss": 0.0175,
"macro_f1": 0.3144654333591461,
"num_tokens": 3273488.0,
"repeat_count": 0.0,
- "routers_loss": 0.03916877508163452,
+ "routers_loss": 0.03317151218652725,
"skip_count": 3.0,
"step": 2030,
"text_loss": 0.2233227640390396
@@ -19302,13 +19302,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.12451171875,
"learning_rate": 0.0009449045213661822,
- "loss": 0.0215,
+ "loss": 0.0201,
"macro_f1": 0.3272727429866791,
"num_tokens": 3276646.0,
"repeat_count": 0.0,
- "routers_loss": 0.019781047478318214,
+ "routers_loss": 0.018510591238737106,
"skip_count": 1.0,
"step": 2032,
"text_loss": 0.16100332140922546
@@ -19321,13 +19321,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.0009447631935095077,
- "loss": 0.0193,
+ "loss": 0.0185,
"macro_f1": 0.9452888369560242,
"num_tokens": 3279441.0,
"repeat_count": 1.0,
- "routers_loss": 0.02645993046462536,
+ "routers_loss": 0.028113311156630516,
"skip_count": 4.0,
"step": 2034,
"text_loss": 0.29208317399024963
@@ -19340,13 +19340,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009446216952187384,
- "loss": 0.0168,
+ "loss": 0.0164,
"macro_f1": 0.3333333432674408,
"num_tokens": 3282697.0,
"repeat_count": 0.0,
- "routers_loss": 0.008575125597417355,
+ "routers_loss": 0.008379172533750534,
"skip_count": 0.0,
"step": 2036,
"text_loss": 0.16026398539543152
@@ -19359,13 +19359,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0009444800265480967,
- "loss": 0.0184,
+ "loss": 0.0178,
"macro_f1": 0.3333333432674408,
"num_tokens": 3285574.0,
"repeat_count": 0.0,
- "routers_loss": 0.01042154710739851,
+ "routers_loss": 0.00941354501992464,
"skip_count": 0.0,
"step": 2038,
"text_loss": 0.29523080587387085
@@ -19378,13 +19378,13 @@
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.8571428656578064,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.076171875,
"learning_rate": 0.0009443381875518703,
- "loss": 0.0206,
+ "loss": 0.0197,
"macro_f1": 0.8600732684135437,
"num_tokens": 3289159.0,
"repeat_count": 4.0,
- "routers_loss": 0.05496715381741524,
+ "routers_loss": 0.04974055662751198,
"skip_count": 6.0,
"step": 2040,
"text_loss": 0.23033179342746735
@@ -19397,13 +19397,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.0537109375,
"learning_rate": 0.0009441961782844123,
- "loss": 0.0149,
+ "loss": 0.0146,
"macro_f1": 0.3272727429866791,
"num_tokens": 3293598.0,
"repeat_count": 0.0,
- "routers_loss": 0.021722445264458656,
+ "routers_loss": 0.022241825237870216,
"skip_count": 1.0,
"step": 2042,
"text_loss": 0.8299165368080139
@@ -19416,13 +19416,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.0009440539988001408,
- "loss": 0.0161,
+ "loss": 0.0159,
"macro_f1": 0.3333333432674408,
"num_tokens": 3296648.0,
"repeat_count": 0.0,
- "routers_loss": 0.011090370826423168,
+ "routers_loss": 0.011019332334399223,
"skip_count": 0.0,
"step": 2044,
"text_loss": 0.18207129836082458
@@ -19435,13 +19435,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.041259765625,
"learning_rate": 0.0009439116491535394,
- "loss": 0.0123,
+ "loss": 0.0118,
"macro_f1": 0.3333333432674408,
"num_tokens": 3300058.0,
"repeat_count": 0.0,
- "routers_loss": 0.00327755743637681,
+ "routers_loss": 0.002889640862122178,
"skip_count": 0.0,
"step": 2046,
"text_loss": 0.7051978707313538
@@ -19454,13 +19454,13 @@
"f1_execute": 0.9333333373069763,
"f1_repeat": 0.5,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.078125,
"learning_rate": 0.0009437691293991563,
- "loss": 0.0198,
+ "loss": 0.0192,
"macro_f1": 0.7634921073913574,
"num_tokens": 3303296.0,
"repeat_count": 3.0,
- "routers_loss": 0.0807223841547966,
+ "routers_loss": 0.07741832733154297,
"skip_count": 4.0,
"step": 2048,
"text_loss": 0.15563532710075378
@@ -19473,13 +19473,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.09521484375,
"learning_rate": 0.0009436264395916061,
- "loss": 0.0218,
+ "loss": 0.0209,
"macro_f1": 0.6666666865348816,
"num_tokens": 3306204.0,
"repeat_count": 0.0,
- "routers_loss": 0.014681774191558361,
+ "routers_loss": 0.014225383289158344,
"skip_count": 2.0,
"step": 2050,
"text_loss": 0.18117287755012512
@@ -19492,13 +19492,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09326171875,
+ "grad_norm": 0.1416015625,
"learning_rate": 0.0009434835797855672,
- "loss": 0.0166,
+ "loss": 0.0165,
"macro_f1": 0.3333333432674408,
"num_tokens": 3309444.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025602662935853004,
+ "routers_loss": 0.0023932650219649076,
"skip_count": 0.0,
"step": 2052,
"text_loss": 0.4645874798297882
@@ -19511,13 +19511,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05810546875,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0009433405500357839,
- "loss": 0.0148,
+ "loss": 0.0153,
"macro_f1": 0.3272727429866791,
"num_tokens": 3312488.0,
"repeat_count": 0.0,
- "routers_loss": 0.03283753618597984,
+ "routers_loss": 0.03193361684679985,
"skip_count": 1.0,
"step": 2054,
"text_loss": 0.5291082859039307
@@ -19530,13 +19530,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.062255859375,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0009431973503970655,
- "loss": 0.0138,
+ "loss": 0.0134,
"macro_f1": 0.3333333432674408,
"num_tokens": 3315765.0,
"repeat_count": 0.0,
- "routers_loss": 0.002137230010703206,
+ "routers_loss": 0.0020529816392809153,
"skip_count": 0.0,
"step": 2056,
"text_loss": 0.5877931118011475
@@ -19549,13 +19549,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08251953125,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0009430539809242864,
- "loss": 0.0199,
+ "loss": 0.0185,
"macro_f1": 0.32098764181137085,
"num_tokens": 3318877.0,
"repeat_count": 2.0,
- "routers_loss": 0.07938452064990997,
+ "routers_loss": 0.07907948642969131,
"skip_count": 0.0,
"step": 2058,
"text_loss": 0.3836737871170044
@@ -19568,13 +19568,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009429104416723862,
- "loss": 0.0164,
+ "loss": 0.0163,
"macro_f1": 0.6666666865348816,
"num_tokens": 3322576.0,
"repeat_count": 2.0,
- "routers_loss": 0.003832251997664571,
+ "routers_loss": 0.003006070153787732,
"skip_count": 0.0,
"step": 2060,
"text_loss": 0.3480920195579529
@@ -19587,13 +19587,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.045166015625,
"learning_rate": 0.0009427667326963689,
- "loss": 0.0131,
+ "loss": 0.0127,
"macro_f1": 0.3333333432674408,
"num_tokens": 3325974.0,
"repeat_count": 0.0,
- "routers_loss": 0.006192604545503855,
+ "routers_loss": 0.005013179033994675,
"skip_count": 0.0,
"step": 2062,
"text_loss": 0.931358814239502
@@ -19606,13 +19606,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09375,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0009426228540513047,
"loss": 0.0206,
"macro_f1": 0.3333333432674408,
"num_tokens": 3329398.0,
"repeat_count": 0.0,
- "routers_loss": 0.008115313947200775,
+ "routers_loss": 0.0059848143719136715,
"skip_count": 0.0,
"step": 2064,
"text_loss": 0.47568953037261963
@@ -19625,13 +19625,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009424788057923277,
- "loss": 0.0127,
+ "loss": 0.0131,
"macro_f1": 0.3333333432674408,
"num_tokens": 3332029.0,
"repeat_count": 0.0,
- "routers_loss": 0.007599714212119579,
+ "routers_loss": 0.00783882662653923,
"skip_count": 0.0,
"step": 2066,
"text_loss": 0.22887596487998962
@@ -19644,13 +19644,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0009423345879746376,
- "loss": 0.0126,
+ "loss": 0.0128,
"macro_f1": 0.5492662787437439,
"num_tokens": 3334858.0,
"repeat_count": 0.0,
- "routers_loss": 0.016804348677396774,
+ "routers_loss": 0.01866884157061577,
"skip_count": 2.0,
"step": 2068,
"text_loss": 0.17724967002868652
@@ -19663,13 +19663,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.06591796875,
"learning_rate": 0.000942190200653499,
- "loss": 0.0164,
+ "loss": 0.0162,
"macro_f1": 0.32098764181137085,
"num_tokens": 3338094.0,
"repeat_count": 0.0,
- "routers_loss": 0.02686731517314911,
+ "routers_loss": 0.028636593371629715,
"skip_count": 2.0,
"step": 2070,
"text_loss": 0.34344956278800964
@@ -19682,13 +19682,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0771484375,
+ "grad_norm": 0.07568359375,
"learning_rate": 0.0009420456438842413,
- "loss": 0.0172,
+ "loss": 0.0165,
"macro_f1": 0.5492662787437439,
"num_tokens": 3340526.0,
"repeat_count": 0.0,
- "routers_loss": 0.025320913642644882,
+ "routers_loss": 0.023245645686984062,
"skip_count": 2.0,
"step": 2072,
"text_loss": 0.7276164293289185
@@ -19701,13 +19701,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11376953125,
+ "grad_norm": 0.11328125,
"learning_rate": 0.000941900917722259,
- "loss": 0.0145,
+ "loss": 0.0143,
"macro_f1": 0.3272727429866791,
"num_tokens": 3343303.0,
"repeat_count": 1.0,
- "routers_loss": 0.014900023117661476,
+ "routers_loss": 0.01565689593553543,
"skip_count": 0.0,
"step": 2074,
"text_loss": 0.5665070414543152
@@ -19720,13 +19720,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.1201171875,
"learning_rate": 0.0009417560222230115,
- "loss": 0.0244,
+ "loss": 0.0245,
"macro_f1": 0.3333333432674408,
"num_tokens": 3346409.0,
"repeat_count": 0.0,
- "routers_loss": 0.003426895011216402,
+ "routers_loss": 0.0035056080669164658,
"skip_count": 0.0,
"step": 2076,
"text_loss": 0.5112795233726501
@@ -19739,13 +19739,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0712890625,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0009416109574420229,
- "loss": 0.0136,
+ "loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 3349220.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031935563310980797,
+ "routers_loss": 0.0027565446216613054,
"skip_count": 0.0,
"step": 2078,
"text_loss": 0.5240910053253174
@@ -19758,13 +19758,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.08203125,
"learning_rate": 0.0009414657234348823,
- "loss": 0.0183,
+ "loss": 0.0186,
"macro_f1": 1.0,
"num_tokens": 3352627.0,
"repeat_count": 3.0,
- "routers_loss": 0.016454946249723434,
+ "routers_loss": 0.01652451977133751,
"skip_count": 2.0,
"step": 2080,
"text_loss": 1.0217112302780151
@@ -19777,13 +19777,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1650390625,
+ "grad_norm": 0.1630859375,
"learning_rate": 0.0009413203202572438,
- "loss": 0.0174,
+ "loss": 0.0179,
"macro_f1": 0.32098764181137085,
"num_tokens": 3355392.0,
"repeat_count": 0.0,
- "routers_loss": 0.1056143268942833,
+ "routers_loss": 0.1012420505285263,
"skip_count": 2.0,
"step": 2082,
"text_loss": 0.4085482358932495
@@ -19796,13 +19796,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07373046875,
+ "grad_norm": 0.08251953125,
"learning_rate": 0.000941174747964826,
- "loss": 0.016,
+ "loss": 0.0154,
"macro_f1": 0.3333333432674408,
"num_tokens": 3358425.0,
"repeat_count": 0.0,
- "routers_loss": 0.003626141929998994,
+ "routers_loss": 0.004962718114256859,
"skip_count": 0.0,
"step": 2084,
"text_loss": 0.5833504796028137
@@ -19810,18 +19810,18 @@
{
"acc_repeat": 0.5,
"acc_skip": 0.6666666865348816,
- "avg_layers": 26.0,
+ "avg_layers": 27.0,
"epoch": 9.793660111535075,
- "f1_execute": 0.936170220375061,
+ "f1_execute": 0.9583333134651184,
"f1_repeat": 0.6666666865348816,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.107421875,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.11376953125,
"learning_rate": 0.0009410290066134124,
- "loss": 0.0216,
- "macro_f1": 0.7565011978149414,
+ "loss": 0.0211,
+ "macro_f1": 0.8083333373069763,
"num_tokens": 3361925.0,
"repeat_count": 2.0,
- "routers_loss": 0.08091846853494644,
+ "routers_loss": 0.07889176905155182,
"skip_count": 3.0,
"step": 2086,
"text_loss": 0.38126569986343384
@@ -19834,13 +19834,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.056884765625,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0009408830962588517,
- "loss": 0.0197,
+ "loss": 0.0195,
"macro_f1": 0.6601307392120361,
"num_tokens": 3365963.0,
"repeat_count": 1.0,
- "routers_loss": 0.035208042711019516,
+ "routers_loss": 0.033715736120939255,
"skip_count": 2.0,
"step": 2088,
"text_loss": 0.23213914036750793
@@ -19853,13 +19853,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07958984375,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0009407370169570567,
- "loss": 0.0167,
+ "loss": 0.0169,
"macro_f1": 0.3333333432674408,
"num_tokens": 3369422.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018934847321361303,
+ "routers_loss": 0.0014188943896442652,
"skip_count": 0.0,
"step": 2090,
"text_loss": 0.4648318886756897
@@ -19872,13 +19872,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0009405907687640054,
- "loss": 0.0132,
+ "loss": 0.013,
"macro_f1": 0.3272727429866791,
"num_tokens": 3372506.0,
"repeat_count": 0.0,
- "routers_loss": 0.016075141727924347,
+ "routers_loss": 0.015339684672653675,
"skip_count": 1.0,
"step": 2092,
"text_loss": 0.2563800811767578
@@ -19891,13 +19891,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.054443359375,
"learning_rate": 0.0009404443517357404,
"loss": 0.0146,
"macro_f1": 0.542222261428833,
"num_tokens": 3375653.0,
"repeat_count": 4.0,
- "routers_loss": 0.06333976984024048,
+ "routers_loss": 0.06562861055135727,
"skip_count": 0.0,
"step": 2094,
"text_loss": 0.797835111618042
@@ -19910,13 +19910,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.060546875,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.000940297765928369,
- "loss": 0.0133,
+ "loss": 0.0136,
"macro_f1": 0.3333333432674408,
"num_tokens": 3379018.0,
"repeat_count": 0.0,
- "routers_loss": 0.005521406419575214,
+ "routers_loss": 0.005745889153331518,
"skip_count": 0.0,
"step": 2096,
"text_loss": 0.4238114655017853
@@ -19929,13 +19929,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06103515625,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0009401510113980631,
- "loss": 0.0205,
+ "loss": 0.0207,
"macro_f1": 0.3333333432674408,
"num_tokens": 3382855.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025159218348562717,
+ "routers_loss": 0.0026634482201188803,
"skip_count": 0.0,
"step": 2098,
"text_loss": 0.4967166483402252
@@ -19948,13 +19948,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009400040882010592,
- "loss": 0.0172,
+ "loss": 0.0166,
"macro_f1": 0.3333333432674408,
"num_tokens": 3386386.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025535966269671917,
+ "routers_loss": 0.0020642587915062904,
"skip_count": 0.0,
"step": 2100,
"text_loss": 0.44390562176704407
@@ -19967,13 +19967,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.056640625,
"learning_rate": 0.0009398569963936589,
- "loss": 0.0178,
+ "loss": 0.017,
"macro_f1": 0.3272727429866791,
"num_tokens": 3389958.0,
"repeat_count": 0.0,
- "routers_loss": 0.013569516129791737,
+ "routers_loss": 0.013722737319767475,
"skip_count": 1.0,
"step": 2102,
"text_loss": 0.7207565903663635
@@ -19986,13 +19986,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.08837890625,
"learning_rate": 0.0009397097360322276,
- "loss": 0.0175,
+ "loss": 0.017,
"macro_f1": 0.3333333432674408,
"num_tokens": 3392892.0,
"repeat_count": 0.0,
- "routers_loss": 0.0044935219921171665,
+ "routers_loss": 0.002051608171314001,
"skip_count": 0.0,
"step": 2104,
"text_loss": 0.3196398913860321
@@ -20005,13 +20005,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.000939562307173196,
- "loss": 0.0223,
+ "loss": 0.022,
"macro_f1": 0.3333333432674408,
"num_tokens": 3396636.0,
"repeat_count": 0.0,
- "routers_loss": 0.007407462690025568,
+ "routers_loss": 0.007085663266479969,
"skip_count": 0.0,
"step": 2106,
"text_loss": 0.5663776397705078
@@ -20024,13 +20024,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.13671875,
+ "grad_norm": 0.11328125,
"learning_rate": 0.0009394147098730592,
- "loss": 0.0205,
+ "loss": 0.02,
"macro_f1": 0.5492662787437439,
"num_tokens": 3399475.0,
"repeat_count": 0.0,
- "routers_loss": 0.024386432021856308,
+ "routers_loss": 0.019473131746053696,
"skip_count": 2.0,
"step": 2108,
"text_loss": 0.7708223462104797
@@ -20043,32 +20043,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.0009392669441883767,
- "loss": 0.0135,
+ "loss": 0.0134,
"macro_f1": 0.3333333432674408,
"num_tokens": 3402350.0,
"repeat_count": 0.0,
- "routers_loss": 0.002929724520072341,
+ "routers_loss": 0.0028328890912234783,
"skip_count": 0.0,
"step": 2110,
"text_loss": 0.5888006091117859
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 9.915761667155856,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.1201171875,
+ "f1_skip": 1.0,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.0009391190101757724,
- "loss": 0.0168,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0166,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3405561.0,
"repeat_count": 0.0,
- "routers_loss": 0.026861928403377533,
+ "routers_loss": 0.023098422214388847,
"skip_count": 2.0,
"step": 2112,
"text_loss": 0.09865197539329529
@@ -20081,13 +20081,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0966796875,
+ "grad_norm": 0.10107421875,
"learning_rate": 0.000938970907891935,
- "loss": 0.0251,
+ "loss": 0.0247,
"macro_f1": 0.3333333432674408,
"num_tokens": 3408513.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025369988288730383,
+ "routers_loss": 0.002896632067859173,
"skip_count": 0.0,
"step": 2114,
"text_loss": 0.6613234281539917
@@ -20100,51 +20100,51 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09423828125,
+ "grad_norm": 0.0947265625,
"learning_rate": 0.0009388226373936179,
- "loss": 0.0209,
+ "loss": 0.0211,
"macro_f1": 0.3333333432674408,
"num_tokens": 3411195.0,
"repeat_count": 0.0,
- "routers_loss": 0.014292459934949875,
+ "routers_loss": 0.015814457088708878,
"skip_count": 0.0,
"step": 2116,
"text_loss": 0.17363053560256958
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 27.0,
+ "avg_layers": 28.0,
"epoch": 9.94393894922219,
- "f1_execute": 0.9629629850387573,
- "f1_repeat": 0.0,
+ "f1_execute": 0.9811320900917053,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1181640625,
+ "grad_norm": 0.12451171875,
"learning_rate": 0.0009386741987376381,
- "loss": 0.0151,
- "macro_f1": 0.32098767161369324,
+ "loss": 0.015,
+ "macro_f1": 0.6603773832321167,
"num_tokens": 3414875.0,
"repeat_count": 1.0,
- "routers_loss": 0.027571436017751694,
+ "routers_loss": 0.02676783688366413,
"skip_count": 0.0,
"step": 2118,
"text_loss": 0.674056887626648
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.0,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 27.0,
"epoch": 9.953331376577633,
- "f1_execute": 0.9818181991577148,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.0,
- "grad_norm": 0.08349609375,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0009385255919808778,
- "loss": 0.0205,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0203,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3418410.0,
"repeat_count": 0.0,
- "routers_loss": 0.011719600297510624,
+ "routers_loss": 0.01022857241332531,
"skip_count": 1.0,
"step": 2120,
"text_loss": 0.235092431306839
@@ -20157,13 +20157,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09375,
+ "grad_norm": 0.0888671875,
"learning_rate": 0.0009383768171802836,
- "loss": 0.0249,
+ "loss": 0.0244,
"macro_f1": 0.5492662787437439,
"num_tokens": 3421289.0,
"repeat_count": 0.0,
- "routers_loss": 0.01207603607326746,
+ "routers_loss": 0.013572212308645248,
"skip_count": 2.0,
"step": 2122,
"text_loss": 0.5992844104766846
@@ -20176,13 +20176,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.060791015625,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0009382278743928659,
- "loss": 0.0206,
+ "loss": 0.0201,
"macro_f1": 0.6666666865348816,
"num_tokens": 3424781.0,
"repeat_count": 0.0,
- "routers_loss": 0.008004254661500454,
+ "routers_loss": 0.0051873656921088696,
"skip_count": 2.0,
"step": 2124,
"text_loss": 0.29915499687194824
@@ -20195,13 +20195,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.07421875,
"learning_rate": 0.0009380787636757001,
- "loss": 0.0156,
+ "loss": 0.0155,
"macro_f1": 0.6122449040412903,
"num_tokens": 3427942.0,
"repeat_count": 0.0,
- "routers_loss": 0.030767880380153656,
+ "routers_loss": 0.030079292133450508,
"skip_count": 4.0,
"step": 2126,
"text_loss": 0.24181491136550903
@@ -20214,13 +20214,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06201171875,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0009379294850859256,
"loss": 0.0141,
"macro_f1": 0.3333333432674408,
"num_tokens": 3431314.0,
"repeat_count": 0.0,
- "routers_loss": 0.002620625076815486,
+ "routers_loss": 0.002675612922757864,
"skip_count": 0.0,
"step": 2128,
"text_loss": 0.4669873118400574
@@ -20233,13 +20233,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09033203125,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0009377800386807465,
- "loss": 0.0175,
+ "loss": 0.0177,
"macro_f1": 0.3333333432674408,
"num_tokens": 3435020.0,
"repeat_count": 0.0,
- "routers_loss": 0.009095560759305954,
+ "routers_loss": 0.009334275498986244,
"skip_count": 0.0,
"step": 2130,
"text_loss": 0.6478219628334045
@@ -20252,13 +20252,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.11865234375,
+ "grad_norm": 0.134765625,
"learning_rate": 0.0009376304245174306,
- "loss": 0.0143,
+ "loss": 0.0137,
"macro_f1": 0.6000000238418579,
"num_tokens": 3438276.0,
"repeat_count": 1.0,
- "routers_loss": 0.058448426425457,
+ "routers_loss": 0.038227908313274384,
"skip_count": 2.0,
"step": 2132,
"text_loss": 0.4401201903820038
@@ -20271,13 +20271,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.046875,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0009374806426533104,
- "loss": 0.0116,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 3440938.0,
"repeat_count": 0.0,
- "routers_loss": 0.007323687430471182,
+ "routers_loss": 0.006901399698108435,
"skip_count": 0.0,
"step": 2134,
"text_loss": 0.5948942303657532
@@ -20290,13 +20290,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051513671875,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009373306931457827,
- "loss": 0.0122,
+ "loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 3444028.0,
"repeat_count": 0.0,
- "routers_loss": 0.003302243771031499,
+ "routers_loss": 0.0037061909679323435,
"skip_count": 0.0,
"step": 2136,
"text_loss": 0.5349751114845276
@@ -20309,13 +20309,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047607421875,
+ "grad_norm": 0.056884765625,
"learning_rate": 0.0009371805760523086,
- "loss": 0.0113,
+ "loss": 0.0111,
"macro_f1": 0.3333333432674408,
"num_tokens": 3448331.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027974818367511034,
+ "routers_loss": 0.0025877030566334724,
"skip_count": 0.0,
"step": 2138,
"text_loss": 0.4591051936149597
@@ -20328,13 +20328,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009370302914304129,
- "loss": 0.0145,
+ "loss": 0.0144,
"macro_f1": 0.5934640765190125,
"num_tokens": 3451434.0,
"repeat_count": 0.0,
- "routers_loss": 0.01572767272591591,
+ "routers_loss": 0.018742674961686134,
"skip_count": 3.0,
"step": 2140,
"text_loss": 0.23470863699913025
@@ -20347,13 +20347,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06201171875,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009368798393376851,
- "loss": 0.0119,
+ "loss": 0.0122,
"macro_f1": 0.3272727429866791,
"num_tokens": 3454375.0,
"repeat_count": 0.0,
- "routers_loss": 0.020721890032291412,
+ "routers_loss": 0.02382594160735607,
"skip_count": 1.0,
"step": 2142,
"text_loss": 0.6077954769134521
@@ -20366,13 +20366,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.05517578125,
"learning_rate": 0.0009367292198317787,
- "loss": 0.0161,
+ "loss": 0.0164,
"macro_f1": 0.5492662787437439,
"num_tokens": 3457591.0,
"repeat_count": 0.0,
- "routers_loss": 0.03272393345832825,
+ "routers_loss": 0.03331060707569122,
"skip_count": 2.0,
"step": 2144,
"text_loss": 0.3691073954105377
@@ -20385,13 +20385,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052490234375,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0009365784329704115,
- "loss": 0.0191,
+ "loss": 0.0186,
"macro_f1": 0.3333333432674408,
"num_tokens": 3460895.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017473002662882209,
+ "routers_loss": 0.0016955457394942641,
"skip_count": 0.0,
"step": 2146,
"text_loss": 0.3947436511516571
@@ -20404,13 +20404,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.0009364274788113651,
- "loss": 0.0094,
+ "loss": 0.0096,
"macro_f1": 0.6666666865348816,
"num_tokens": 3464101.0,
"repeat_count": 1.0,
- "routers_loss": 0.008070237934589386,
+ "routers_loss": 0.006169239990413189,
"skip_count": 0.0,
"step": 2148,
"text_loss": 0.3348555266857147
@@ -20423,13 +20423,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0009362763574124858,
- "loss": 0.0191,
+ "loss": 0.019,
"macro_f1": 0.9265305995941162,
"num_tokens": 3467417.0,
"repeat_count": 3.0,
- "routers_loss": 0.021709222346544266,
+ "routers_loss": 0.024033790454268456,
"skip_count": 1.0,
"step": 2150,
"text_loss": 0.496633380651474
@@ -20442,13 +20442,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.046630859375,
+ "grad_norm": 0.042724609375,
"learning_rate": 0.0009361250688316829,
- "loss": 0.014,
+ "loss": 0.0142,
"macro_f1": 0.3333333432674408,
"num_tokens": 3470917.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022237664088606834,
+ "routers_loss": 0.0024986129719763994,
"skip_count": 0.0,
"step": 2152,
"text_loss": 0.6857671737670898
@@ -20461,13 +20461,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.0546875,
"learning_rate": 0.0009359736131269312,
"loss": 0.0153,
"macro_f1": 0.6666666865348816,
"num_tokens": 3473624.0,
"repeat_count": 0.0,
- "routers_loss": 0.00838750321418047,
+ "routers_loss": 0.008183322846889496,
"skip_count": 1.0,
"step": 2154,
"text_loss": 0.13883116841316223
@@ -20480,13 +20480,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0576171875,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009358219903562684,
- "loss": 0.01,
+ "loss": 0.0106,
"macro_f1": 0.6666666865348816,
"num_tokens": 3476472.0,
"repeat_count": 0.0,
- "routers_loss": 0.010190514847636223,
+ "routers_loss": 0.011198793537914753,
"skip_count": 3.0,
"step": 2156,
"text_loss": 0.24243666231632233
@@ -20499,13 +20499,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.04296875,
"learning_rate": 0.0009356702005777969,
- "loss": 0.0124,
+ "loss": 0.0125,
"macro_f1": 0.3333333432674408,
"num_tokens": 3479688.0,
"repeat_count": 0.0,
- "routers_loss": 0.002411153633147478,
+ "routers_loss": 0.002520184963941574,
"skip_count": 0.0,
"step": 2158,
"text_loss": 0.6407818794250488
@@ -20518,13 +20518,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009355182438496825,
- "loss": 0.0141,
+ "loss": 0.0142,
"macro_f1": 0.3333333432674408,
"num_tokens": 3482598.0,
"repeat_count": 0.0,
- "routers_loss": 0.001032356172800064,
+ "routers_loss": 0.0011065017897635698,
"skip_count": 0.0,
"step": 2160,
"text_loss": 0.7214245796203613
@@ -20537,13 +20537,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05908203125,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0009353661202301557,
- "loss": 0.0147,
+ "loss": 0.0144,
"macro_f1": 0.3333333432674408,
"num_tokens": 3486271.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022046815138310194,
+ "routers_loss": 0.0017824085662141442,
"skip_count": 0.0,
"step": 2162,
"text_loss": 0.5140969157218933
@@ -20556,32 +20556,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051513671875,
+ "grad_norm": 0.053466796875,
"learning_rate": 0.0009352138297775101,
"loss": 0.0145,
"macro_f1": 0.3333333432674408,
"num_tokens": 3489206.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014977266546338797,
+ "routers_loss": 0.001542879967018962,
"skip_count": 0.0,
"step": 2164,
"text_loss": 0.7956416606903076
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6666666865348816,
- "avg_layers": 26.0,
+ "acc_skip": 1.0,
+ "avg_layers": 25.0,
"epoch": 10.169063692398003,
- "f1_execute": 0.9803921580314636,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.800000011920929,
+ "f1_skip": 1.0,
"grad_norm": 0.0771484375,
"learning_rate": 0.000935061372550104,
- "loss": 0.0132,
- "macro_f1": 0.5934640765190125,
+ "loss": 0.0134,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3492003.0,
"repeat_count": 0.0,
- "routers_loss": 0.016847684979438782,
+ "routers_loss": 0.01420794241130352,
"skip_count": 3.0,
"step": 2166,
"text_loss": 0.27489882707595825
@@ -20594,13 +20594,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0009349087486063594,
- "loss": 0.0168,
+ "loss": 0.0166,
"macro_f1": 0.6666666865348816,
"num_tokens": 3494784.0,
"repeat_count": 0.0,
- "routers_loss": 0.0036806222051382065,
+ "routers_loss": 0.003614309709519148,
"skip_count": 1.0,
"step": 2168,
"text_loss": 0.2962227761745453
@@ -20613,13 +20613,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0009347559580047618,
- "loss": 0.0174,
+ "loss": 0.0175,
"macro_f1": 0.8814815282821655,
"num_tokens": 3497886.0,
"repeat_count": 2.0,
- "routers_loss": 0.021412594243884087,
+ "routers_loss": 0.02122853323817253,
"skip_count": 4.0,
"step": 2170,
"text_loss": 0.5919580459594727
@@ -20627,18 +20627,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 10.197240974464338,
- "f1_execute": 1.0,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.06591796875,
+ "f1_skip": 0.6666666865348816,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.000934603000803861,
- "loss": 0.0134,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0135,
+ "macro_f1": 0.5492662787437439,
"num_tokens": 3500939.0,
"repeat_count": 0.0,
- "routers_loss": 0.0201424453407526,
+ "routers_loss": 0.02042219042778015,
"skip_count": 1.0,
"step": 2172,
"text_loss": 0.28722381591796875
@@ -20651,13 +20651,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05419921875,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009344498770622704,
- "loss": 0.0131,
+ "loss": 0.013,
"macro_f1": 0.3333333432674408,
"num_tokens": 3504852.0,
"repeat_count": 0.0,
- "routers_loss": 0.005059401970356703,
+ "routers_loss": 0.004345106892287731,
"skip_count": 0.0,
"step": 2174,
"text_loss": 0.603236734867096
@@ -20670,13 +20670,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.1064453125,
"learning_rate": 0.0009342965868386673,
"loss": 0.0101,
"macro_f1": 0.3333333432674408,
"num_tokens": 3508320.0,
"repeat_count": 0.0,
- "routers_loss": 0.004006600938737392,
+ "routers_loss": 0.00368050136603415,
"skip_count": 0.0,
"step": 2176,
"text_loss": 0.6020491719245911
@@ -20691,11 +20691,11 @@
"f1_skip": 0.0,
"grad_norm": 0.060302734375,
"learning_rate": 0.000934143130191793,
- "loss": 0.0109,
+ "loss": 0.0108,
"macro_f1": 0.3333333432674408,
"num_tokens": 3511278.0,
"repeat_count": 0.0,
- "routers_loss": 0.013246738351881504,
+ "routers_loss": 0.013425769284367561,
"skip_count": 0.0,
"step": 2178,
"text_loss": 0.5954724550247192
@@ -20708,13 +20708,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06005859375,
+ "grad_norm": 0.060546875,
"learning_rate": 0.000933989507180452,
- "loss": 0.0151,
+ "loss": 0.0149,
"macro_f1": 0.3333333432674408,
"num_tokens": 3514361.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031937146559357643,
+ "routers_loss": 0.002896249992772937,
"skip_count": 0.0,
"step": 2180,
"text_loss": 0.39175131916999817
@@ -20727,13 +20727,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0556640625,
+ "grad_norm": 0.052978515625,
"learning_rate": 0.0009338357178635135,
- "loss": 0.0151,
+ "loss": 0.0147,
"macro_f1": 0.6603773832321167,
"num_tokens": 3517962.0,
"repeat_count": 1.0,
- "routers_loss": 0.014782631769776344,
+ "routers_loss": 0.011538350023329258,
"skip_count": 1.0,
"step": 2182,
"text_loss": 0.4482830762863159
@@ -20746,13 +20746,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0009336817622999093,
- "loss": 0.0112,
+ "loss": 0.011,
"macro_f1": 0.3272727429866791,
"num_tokens": 3521299.0,
"repeat_count": 1.0,
- "routers_loss": 0.02318345196545124,
+ "routers_loss": 0.022787930443882942,
"skip_count": 0.0,
"step": 2184,
"text_loss": 0.35177817940711975
@@ -20765,13 +20765,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.055419921875,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009335276405486357,
- "loss": 0.0134,
+ "loss": 0.0139,
"macro_f1": 0.3272727429866791,
"num_tokens": 3524611.0,
"repeat_count": 0.0,
- "routers_loss": 0.011735675856471062,
+ "routers_loss": 0.011597735807299614,
"skip_count": 1.0,
"step": 2186,
"text_loss": 0.24868851900100708
@@ -20784,13 +20784,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0009333733526687524,
- "loss": 0.0198,
+ "loss": 0.0196,
"macro_f1": 0.3333333432674408,
"num_tokens": 3528012.0,
"repeat_count": 0.0,
- "routers_loss": 0.01558679062873125,
+ "routers_loss": 0.014253967441618443,
"skip_count": 0.0,
"step": 2188,
"text_loss": 0.3970910310745239
@@ -20803,13 +20803,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056396484375,
+ "grad_norm": 0.054931640625,
"learning_rate": 0.000933218898719383,
- "loss": 0.0163,
+ "loss": 0.0162,
"macro_f1": 0.3333333432674408,
"num_tokens": 3530908.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019149131840094924,
+ "routers_loss": 0.001659149187617004,
"skip_count": 0.0,
"step": 2190,
"text_loss": 0.7618573307991028
@@ -20822,13 +20822,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07958984375,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009330642787597141,
- "loss": 0.0161,
+ "loss": 0.0159,
"macro_f1": 0.3333333432674408,
"num_tokens": 3533993.0,
"repeat_count": 0.0,
- "routers_loss": 0.0056966920383274555,
+ "routers_loss": 0.005574346985667944,
"skip_count": 0.0,
"step": 2192,
"text_loss": 0.16470147669315338
@@ -20841,13 +20841,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07080078125,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0009329094928489969,
"loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 3537310.0,
"repeat_count": 0.0,
- "routers_loss": 0.002511024009436369,
+ "routers_loss": 0.0026400673668831587,
"skip_count": 0.0,
"step": 2194,
"text_loss": 0.3400416374206543
@@ -20860,13 +20860,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08935546875,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0009327545410465452,
- "loss": 0.0126,
+ "loss": 0.0124,
"macro_f1": 0.6666666865348816,
"num_tokens": 3540045.0,
"repeat_count": 0.0,
- "routers_loss": 0.008584192954003811,
+ "routers_loss": 0.008448398672044277,
"skip_count": 3.0,
"step": 2196,
"text_loss": 0.3110542297363281
@@ -20879,13 +20879,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.0009325994234117372,
- "loss": 0.0129,
+ "loss": 0.0122,
"macro_f1": 0.32098764181137085,
"num_tokens": 3544097.0,
"repeat_count": 0.0,
- "routers_loss": 0.03748156875371933,
+ "routers_loss": 0.037553198635578156,
"skip_count": 2.0,
"step": 2198,
"text_loss": 0.36126700043678284
@@ -20898,13 +20898,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.09814453125,
+ "grad_norm": 0.09716796875,
"learning_rate": 0.000932444140004014,
- "loss": 0.0129,
+ "loss": 0.0124,
"macro_f1": 0.6666666865348816,
"num_tokens": 3547054.0,
"repeat_count": 1.0,
- "routers_loss": 0.006402099970728159,
+ "routers_loss": 0.006464479025453329,
"skip_count": 0.0,
"step": 2200,
"text_loss": 0.4947047233581543
@@ -20917,13 +20917,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.158203125,
+ "grad_norm": 0.1015625,
"learning_rate": 0.0009322886908828805,
- "loss": 0.015,
+ "loss": 0.0138,
"macro_f1": 0.6666666865348816,
"num_tokens": 3549903.0,
"repeat_count": 1.0,
- "routers_loss": 0.0055928584188222885,
+ "routers_loss": 0.005384812597185373,
"skip_count": 0.0,
"step": 2202,
"text_loss": 0.5923738479614258
@@ -20936,13 +20936,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0625,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0009321330761079052,
"loss": 0.0149,
"macro_f1": 0.6666666865348816,
"num_tokens": 3553745.0,
"repeat_count": 0.0,
- "routers_loss": 0.013155708089470863,
+ "routers_loss": 0.015346619300544262,
"skip_count": 2.0,
"step": 2204,
"text_loss": 0.1904175877571106
@@ -20955,13 +20955,13 @@
"f1_execute": 0.9268292784690857,
"f1_repeat": 0.800000011920929,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.06494140625,
"learning_rate": 0.00093197729573872,
- "loss": 0.0206,
+ "loss": 0.0203,
"macro_f1": 0.8422764539718628,
"num_tokens": 3557235.0,
"repeat_count": 3.0,
- "routers_loss": 0.12029488384723663,
+ "routers_loss": 0.1207597479224205,
"skip_count": 6.0,
"step": 2206,
"text_loss": 0.3904837667942047
@@ -20974,13 +20974,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0771484375,
"learning_rate": 0.0009318213498350202,
- "loss": 0.011,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3560795.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037007431965321302,
+ "routers_loss": 0.003334777895361185,
"skip_count": 0.0,
"step": 2208,
"text_loss": 0.4268290102481842
@@ -20993,13 +20993,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.0537109375,
"learning_rate": 0.0009316652384565645,
- "loss": 0.0124,
+ "loss": 0.0123,
"macro_f1": 0.3333333432674408,
"num_tokens": 3563754.0,
"repeat_count": 0.0,
- "routers_loss": 0.004071404226124287,
+ "routers_loss": 0.004230072256177664,
"skip_count": 0.0,
"step": 2210,
"text_loss": 0.40049710869789124
@@ -21012,13 +21012,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.046875,
"learning_rate": 0.0009315089616631751,
- "loss": 0.0103,
+ "loss": 0.0106,
"macro_f1": 0.3333333432674408,
"num_tokens": 3567173.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006955390563234687,
+ "routers_loss": 0.0006645230459980667,
"skip_count": 0.0,
"step": 2212,
"text_loss": 0.42568323016166687
@@ -21031,32 +21031,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0849609375,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0009313525195147376,
- "loss": 0.0128,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 3570831.0,
"repeat_count": 0.0,
- "routers_loss": 0.010293997824192047,
+ "routers_loss": 0.0097877848893404,
"skip_count": 0.0,
"step": 2214,
"text_loss": 0.45808279514312744
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.5,
"acc_skip": 0.3333333432674408,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 10.40387437628412,
- "f1_execute": 0.9583333134651184,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9387754797935486,
+ "f1_repeat": 0.6666666865348816,
"f1_skip": 0.5,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.076171875,
"learning_rate": 0.000931195912071201,
- "loss": 0.0185,
- "macro_f1": 0.8194444179534912,
+ "loss": 0.0187,
+ "macro_f1": 0.7018141150474548,
"num_tokens": 3573745.0,
"repeat_count": 2.0,
- "routers_loss": 0.06593773514032364,
+ "routers_loss": 0.07351134717464447,
"skip_count": 3.0,
"step": 2216,
"text_loss": 0.285696804523468
@@ -21069,13 +21069,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009310391393925775,
- "loss": 0.013,
+ "loss": 0.0125,
"macro_f1": 0.3333333432674408,
"num_tokens": 3576785.0,
"repeat_count": 0.0,
- "routers_loss": 0.00347105972468853,
+ "routers_loss": 0.0033160944003611803,
"skip_count": 0.0,
"step": 2218,
"text_loss": 0.17516443133354187
@@ -21088,32 +21088,32 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.04736328125,
+ "grad_norm": 0.047119140625,
"learning_rate": 0.0009308822015389424,
- "loss": 0.0244,
+ "loss": 0.0241,
"macro_f1": 0.5427350401878357,
"num_tokens": 3580695.0,
"repeat_count": 1.0,
- "routers_loss": 0.04871147498488426,
+ "routers_loss": 0.052930232137441635,
"skip_count": 1.0,
"step": 2220,
"text_loss": 0.5918155908584595
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 26.0,
+ "acc_skip": 0.75,
+ "avg_layers": 25.0,
"epoch": 10.432051658350455,
- "f1_execute": 0.9600000381469727,
+ "f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.05517578125,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.072265625,
"learning_rate": 0.0009307250985704352,
- "loss": 0.012,
- "macro_f1": 0.542222261428833,
+ "loss": 0.0128,
+ "macro_f1": 0.6122449040412903,
"num_tokens": 3583729.0,
"repeat_count": 0.0,
- "routers_loss": 0.024859672412276268,
+ "routers_loss": 0.025454653427004814,
"skip_count": 4.0,
"step": 2222,
"text_loss": 0.2652169466018677
@@ -21126,13 +21126,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.052001953125,
"learning_rate": 0.0009305678305472575,
- "loss": 0.016,
+ "loss": 0.0158,
"macro_f1": 0.3333333432674408,
"num_tokens": 3586775.0,
"repeat_count": 0.0,
- "routers_loss": 0.010990055277943611,
+ "routers_loss": 0.011279845610260963,
"skip_count": 0.0,
"step": 2224,
"text_loss": 0.3511691987514496
@@ -21145,13 +21145,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10986328125,
+ "grad_norm": 0.10791015625,
"learning_rate": 0.000930410397529675,
- "loss": 0.0171,
+ "loss": 0.017,
"macro_f1": 0.3333333432674408,
"num_tokens": 3589676.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025031559634953737,
+ "routers_loss": 0.002700264798477292,
"skip_count": 0.0,
"step": 2226,
"text_loss": 0.24045433104038239
@@ -21164,13 +21164,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.048095703125,
"learning_rate": 0.000930252799578016,
- "loss": 0.0147,
+ "loss": 0.0146,
"macro_f1": 1.0,
"num_tokens": 3593242.0,
"repeat_count": 1.0,
- "routers_loss": 0.008100497536361217,
+ "routers_loss": 0.00826631672680378,
"skip_count": 2.0,
"step": 2228,
"text_loss": 0.3777645528316498
@@ -21183,13 +21183,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0009300950367526728,
- "loss": 0.0128,
+ "loss": 0.0131,
"macro_f1": 0.8820862174034119,
"num_tokens": 3596807.0,
"repeat_count": 2.0,
- "routers_loss": 0.03150207921862602,
+ "routers_loss": 0.036221496760845184,
"skip_count": 2.0,
"step": 2230,
"text_loss": 0.502962589263916
@@ -21202,13 +21202,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009299371091141001,
- "loss": 0.0132,
+ "loss": 0.0131,
"macro_f1": 0.3333333432674408,
"num_tokens": 3600150.0,
"repeat_count": 0.0,
- "routers_loss": 0.006253884173929691,
+ "routers_loss": 0.006449893582612276,
"skip_count": 0.0,
"step": 2232,
"text_loss": 0.20256924629211426
@@ -21221,13 +21221,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.046142578125,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.0009297790167228161,
- "loss": 0.0119,
+ "loss": 0.012,
"macro_f1": 0.6666666865348816,
"num_tokens": 3602988.0,
"repeat_count": 0.0,
- "routers_loss": 0.007228068076074123,
+ "routers_loss": 0.007872486487030983,
"skip_count": 2.0,
"step": 2234,
"text_loss": 0.42476826906204224
@@ -21240,13 +21240,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0009296207596394022,
- "loss": 0.0103,
+ "loss": 0.0101,
"macro_f1": 0.32098764181137085,
"num_tokens": 3606071.0,
"repeat_count": 0.0,
- "routers_loss": 0.02524643763899803,
+ "routers_loss": 0.027397040277719498,
"skip_count": 2.0,
"step": 2236,
"text_loss": 0.23432791233062744
@@ -21259,13 +21259,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.0009294623379245028,
- "loss": 0.0119,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 3609389.0,
"repeat_count": 0.0,
- "routers_loss": 0.009672109968960285,
+ "routers_loss": 0.01042645052075386,
"skip_count": 0.0,
"step": 2238,
"text_loss": 0.16665785014629364
@@ -21278,13 +21278,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0009293037516388252,
- "loss": 0.0155,
+ "loss": 0.0161,
"macro_f1": 0.3333333432674408,
"num_tokens": 3612105.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010066524846479297,
+ "routers_loss": 0.0012458425480872393,
"skip_count": 0.0,
"step": 2240,
"text_loss": 0.59421306848526
@@ -21297,13 +21297,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0732421875,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0009291450008431404,
- "loss": 0.0184,
+ "loss": 0.0185,
"macro_f1": 1.0,
"num_tokens": 3615439.0,
"repeat_count": 1.0,
- "routers_loss": 0.005509128328412771,
+ "routers_loss": 0.005781981628388166,
"skip_count": 1.0,
"step": 2242,
"text_loss": 0.510798454284668
@@ -21316,13 +21316,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
- "grad_norm": 0.09423828125,
+ "grad_norm": 0.0966796875,
"learning_rate": 0.0009289860855982814,
- "loss": 0.0172,
+ "loss": 0.0166,
"macro_f1": 0.4871794879436493,
"num_tokens": 3618842.0,
"repeat_count": 0.0,
- "routers_loss": 0.030802007764577866,
+ "routers_loss": 0.031195320188999176,
"skip_count": 3.0,
"step": 2244,
"text_loss": 0.7574363350868225
@@ -21335,13 +21335,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.0009288270059651454,
"loss": 0.0133,
"macro_f1": 0.3333333432674408,
"num_tokens": 3621823.0,
"repeat_count": 0.0,
- "routers_loss": 0.001686889911070466,
+ "routers_loss": 0.001746491645462811,
"skip_count": 0.0,
"step": 2246,
"text_loss": 0.5125683546066284
@@ -21354,13 +21354,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.1943359375,
+ "grad_norm": 0.220703125,
"learning_rate": 0.0009286677620046918,
- "loss": 0.0163,
+ "loss": 0.0159,
"macro_f1": 0.5492662787437439,
"num_tokens": 3624502.0,
"repeat_count": 0.0,
- "routers_loss": 0.03299177065491676,
+ "routers_loss": 0.03792348504066467,
"skip_count": 2.0,
"step": 2248,
"text_loss": 0.7533677220344543
@@ -21375,11 +21375,11 @@
"f1_skip": 0.0,
"grad_norm": 0.07763671875,
"learning_rate": 0.0009285083537779429,
- "loss": 0.0119,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 3627057.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010354233672842383,
+ "routers_loss": 0.0009684451506473124,
"skip_count": 0.0,
"step": 2250,
"text_loss": 0.2219279706478119
@@ -21392,13 +21392,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.10205078125,
+ "grad_norm": 0.11767578125,
"learning_rate": 0.0009283487813459845,
- "loss": 0.0145,
+ "loss": 0.0148,
"macro_f1": 0.5492662787437439,
"num_tokens": 3629720.0,
"repeat_count": 0.0,
- "routers_loss": 0.02196674607694149,
+ "routers_loss": 0.022757573053240776,
"skip_count": 2.0,
"step": 2252,
"text_loss": 0.6903313994407654
@@ -21411,13 +21411,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.1201171875,
+ "grad_norm": 0.1376953125,
"learning_rate": 0.0009281890447699652,
"loss": 0.015,
"macro_f1": 0.6666666865348816,
"num_tokens": 3633234.0,
"repeat_count": 1.0,
- "routers_loss": 0.002239946974441409,
+ "routers_loss": 0.003613058477640152,
"skip_count": 0.0,
"step": 2254,
"text_loss": 0.6278893351554871
@@ -21430,13 +21430,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.046142578125,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0009280291441110961,
- "loss": 0.0117,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 3636289.0,
"repeat_count": 0.0,
- "routers_loss": 0.0063575254753232,
+ "routers_loss": 0.006214062683284283,
"skip_count": 0.0,
"step": 2256,
"text_loss": 0.3011114001274109
@@ -21449,13 +21449,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.040283203125,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0009278690794306517,
- "loss": 0.0143,
+ "loss": 0.014,
"macro_f1": 0.5492662787437439,
"num_tokens": 3640251.0,
"repeat_count": 0.0,
- "routers_loss": 0.0524379126727581,
+ "routers_loss": 0.052556321024894714,
"skip_count": 2.0,
"step": 2258,
"text_loss": 0.19894185662269592
@@ -21468,13 +21468,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.8571428656578064,
"f1_skip": 1.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.08251953125,
"learning_rate": 0.0009277088507899689,
- "loss": 0.0156,
+ "loss": 0.0163,
"macro_f1": 0.9452888369560242,
"num_tokens": 3643527.0,
"repeat_count": 4.0,
- "routers_loss": 0.052486274391412735,
+ "routers_loss": 0.0572301521897316,
"skip_count": 1.0,
"step": 2260,
"text_loss": 0.5593410134315491
@@ -21487,13 +21487,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041748046875,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.0009275484582504475,
"loss": 0.0104,
"macro_f1": 0.3333333432674408,
"num_tokens": 3646959.0,
"repeat_count": 0.0,
- "routers_loss": 0.006877690553665161,
+ "routers_loss": 0.008010074496269226,
"skip_count": 0.0,
"step": 2262,
"text_loss": 0.2128177285194397
@@ -21506,13 +21506,13 @@
"f1_execute": 0.95652174949646,
"f1_repeat": 0.800000011920929,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.05322265625,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0009273879018735505,
- "loss": 0.0136,
+ "loss": 0.0138,
"macro_f1": 0.8521739840507507,
"num_tokens": 3651298.0,
"repeat_count": 3.0,
- "routers_loss": 0.03128742054104805,
+ "routers_loss": 0.035729870200157166,
"skip_count": 3.0,
"step": 2264,
"text_loss": 0.2987811267375946
@@ -21525,13 +21525,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1689453125,
+ "grad_norm": 0.1474609375,
"learning_rate": 0.0009272271817208031,
- "loss": 0.0188,
+ "loss": 0.0182,
"macro_f1": 0.3333333432674408,
"num_tokens": 3655609.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028425443451851606,
+ "routers_loss": 0.002379779238253832,
"skip_count": 0.0,
"step": 2266,
"text_loss": 0.6024088263511658
@@ -21544,13 +21544,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06689453125,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009270662978537939,
- "loss": 0.0101,
+ "loss": 0.0098,
"macro_f1": 0.3333333432674408,
"num_tokens": 3658444.0,
"repeat_count": 0.0,
- "routers_loss": 0.009712206199765205,
+ "routers_loss": 0.008943650871515274,
"skip_count": 0.0,
"step": 2268,
"text_loss": 0.1741207242012024
@@ -21563,13 +21563,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0009269052503341736,
- "loss": 0.0162,
+ "loss": 0.0161,
"macro_f1": 0.6595745086669922,
"num_tokens": 3662282.0,
"repeat_count": 1.0,
- "routers_loss": 0.03980376198887825,
+ "routers_loss": 0.030201267451047897,
"skip_count": 4.0,
"step": 2270,
"text_loss": 0.7300035953521729
@@ -21582,13 +21582,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0009267440392236562,
- "loss": 0.0098,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 3665531.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030603872146457434,
+ "routers_loss": 0.0026635683607310057,
"skip_count": 0.0,
"step": 2272,
"text_loss": 0.31535038352012634
@@ -21601,13 +21601,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0009265826645840178,
"loss": 0.0151,
"macro_f1": 0.3333333432674408,
"num_tokens": 3668407.0,
"repeat_count": 0.0,
- "routers_loss": 0.004795679822564125,
+ "routers_loss": 0.004258926957845688,
"skip_count": 0.0,
"step": 2274,
"text_loss": 0.7272579073905945
@@ -21620,13 +21620,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.1435546875,
+ "grad_norm": 0.125,
"learning_rate": 0.0009264211264770976,
- "loss": 0.0155,
+ "loss": 0.0154,
"macro_f1": 0.6122449040412903,
"num_tokens": 3671503.0,
"repeat_count": 0.0,
- "routers_loss": 0.0340447798371315,
+ "routers_loss": 0.038987524807453156,
"skip_count": 4.0,
"step": 2276,
"text_loss": 0.7488982677459717
@@ -21639,13 +21639,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0009262594249647975,
- "loss": 0.016,
+ "loss": 0.0164,
"macro_f1": 0.6666666865348816,
"num_tokens": 3674107.0,
"repeat_count": 0.0,
- "routers_loss": 0.007436402142047882,
+ "routers_loss": 0.007211760152131319,
"skip_count": 1.0,
"step": 2278,
"text_loss": 0.1992369294166565
@@ -21658,13 +21658,13 @@
"f1_execute": 0.9767441749572754,
"f1_repeat": 0.8571428656578064,
"f1_skip": 1.0,
- "grad_norm": 0.056396484375,
+ "grad_norm": 0.0546875,
"learning_rate": 0.0009260975601090815,
- "loss": 0.0113,
+ "loss": 0.0112,
"macro_f1": 0.9446290731430054,
"num_tokens": 3677184.0,
"repeat_count": 4.0,
- "routers_loss": 0.02465176396071911,
+ "routers_loss": 0.02538592554628849,
"skip_count": 3.0,
"step": 2280,
"text_loss": 0.46402135491371155
@@ -21677,13 +21677,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0009259355319719768,
- "loss": 0.0167,
+ "loss": 0.0162,
"macro_f1": 0.3333333432674408,
"num_tokens": 3680683.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037910486571490765,
+ "routers_loss": 0.0038464947137981653,
"skip_count": 0.0,
"step": 2282,
"text_loss": 0.5804527401924133
@@ -21696,13 +21696,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.130859375,
+ "grad_norm": 0.1611328125,
"learning_rate": 0.0009257733406155726,
- "loss": 0.0161,
+ "loss": 0.0169,
"macro_f1": 0.3333333432674408,
"num_tokens": 3683928.0,
"repeat_count": 0.0,
- "routers_loss": 0.003716849023476243,
+ "routers_loss": 0.004841136280447245,
"skip_count": 0.0,
"step": 2284,
"text_loss": 0.4834538400173187
@@ -21715,13 +21715,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0009256109861020212,
- "loss": 0.0118,
+ "loss": 0.0115,
"macro_f1": 0.3333333432674408,
"num_tokens": 3687101.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021690395660698414,
+ "routers_loss": 0.002191900508478284,
"skip_count": 0.0,
"step": 2286,
"text_loss": 0.8199604749679565
@@ -21734,13 +21734,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.000925448468493537,
"loss": 0.0162,
"macro_f1": 0.5427350401878357,
"num_tokens": 3690490.0,
"repeat_count": 1.0,
- "routers_loss": 0.034040264785289764,
+ "routers_loss": 0.03488675877451897,
"skip_count": 2.0,
"step": 2288,
"text_loss": 0.33263635635375977
@@ -21753,32 +21753,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009252857878523971,
- "loss": 0.0133,
+ "loss": 0.0134,
"macro_f1": 0.6666666865348816,
"num_tokens": 3694109.0,
"repeat_count": 1.0,
- "routers_loss": 0.0027822356205433607,
+ "routers_loss": 0.002897309372201562,
"skip_count": 0.0,
"step": 2290,
"text_loss": 0.47494807839393616
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 10.760786615791018,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.0634765625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.05810546875,
"learning_rate": 0.000925122944240941,
- "loss": 0.0156,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0153,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3697233.0,
"repeat_count": 0.0,
- "routers_loss": 0.020813947543501854,
+ "routers_loss": 0.01842675730586052,
"skip_count": 2.0,
"step": 2292,
"text_loss": 0.14693495631217957
@@ -21791,13 +21791,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.042236328125,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0009249599377215707,
- "loss": 0.0145,
+ "loss": 0.0146,
"macro_f1": 0.5866667032241821,
"num_tokens": 3700376.0,
"repeat_count": 1.0,
- "routers_loss": 0.038725610822439194,
+ "routers_loss": 0.04169808700680733,
"skip_count": 3.0,
"step": 2294,
"text_loss": 0.38051268458366394
@@ -21810,13 +21810,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059326171875,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0009247967683567507,
- "loss": 0.0117,
+ "loss": 0.0112,
"macro_f1": 0.3272727429866791,
"num_tokens": 3703212.0,
"repeat_count": 0.0,
- "routers_loss": 0.01360203418880701,
+ "routers_loss": 0.012183113023638725,
"skip_count": 1.0,
"step": 2296,
"text_loss": 0.23789077997207642
@@ -21829,13 +21829,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.05712890625,
"learning_rate": 0.0009246334362090077,
- "loss": 0.0135,
+ "loss": 0.0137,
"macro_f1": 0.8823530077934265,
"num_tokens": 3706490.0,
"repeat_count": 1.0,
- "routers_loss": 0.021909991279244423,
+ "routers_loss": 0.01880069635808468,
"skip_count": 2.0,
"step": 2298,
"text_loss": 0.29067978262901306
@@ -21848,13 +21848,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0830078125,
+ "grad_norm": 0.08203125,
"learning_rate": 0.000924469941340931,
- "loss": 0.0175,
+ "loss": 0.0173,
"macro_f1": 0.3272727429866791,
"num_tokens": 3709804.0,
"repeat_count": 1.0,
- "routers_loss": 0.03153124824166298,
+ "routers_loss": 0.027359159663319588,
"skip_count": 0.0,
"step": 2300,
"text_loss": 0.67828369140625
@@ -21867,13 +21867,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.000924306283815172,
- "loss": 0.0154,
+ "loss": 0.0153,
"macro_f1": 0.3333333432674408,
"num_tokens": 3712824.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034419491421431303,
+ "routers_loss": 0.003152279881760478,
"skip_count": 0.0,
"step": 2302,
"text_loss": 0.8333184719085693
@@ -21886,13 +21886,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0009241424636944445,
- "loss": 0.0163,
+ "loss": 0.0159,
"macro_f1": 0.5492662787437439,
"num_tokens": 3715385.0,
"repeat_count": 0.0,
- "routers_loss": 0.03655214607715607,
+ "routers_loss": 0.0442950464785099,
"skip_count": 2.0,
"step": 2304,
"text_loss": 0.41893699765205383
@@ -21905,13 +21905,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0576171875,
+ "grad_norm": 0.058837890625,
"learning_rate": 0.0009239784810415249,
- "loss": 0.014,
+ "loss": 0.0137,
"macro_f1": 0.8823530077934265,
"num_tokens": 3719080.0,
"repeat_count": 1.0,
- "routers_loss": 0.015360959805548191,
+ "routers_loss": 0.015729321166872978,
"skip_count": 2.0,
"step": 2306,
"text_loss": 0.13360483944416046
@@ -21924,13 +21924,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.0537109375,
+ "grad_norm": 0.06787109375,
"learning_rate": 0.0009238143359192514,
"loss": 0.0136,
"macro_f1": 0.5934640765190125,
"num_tokens": 3722439.0,
"repeat_count": 0.0,
- "routers_loss": 0.027275927364826202,
+ "routers_loss": 0.028816604986786842,
"skip_count": 3.0,
"step": 2308,
"text_loss": 0.39594101905822754
@@ -21943,13 +21943,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0546875,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.000923650028390525,
- "loss": 0.0163,
+ "loss": 0.0166,
"macro_f1": 0.6666666865348816,
"num_tokens": 3725092.0,
"repeat_count": 0.0,
- "routers_loss": 0.003742894157767296,
+ "routers_loss": 0.0036455015651881695,
"skip_count": 2.0,
"step": 2310,
"text_loss": 0.6169708371162415
@@ -21962,13 +21962,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0927734375,
+ "grad_norm": 0.09814453125,
"learning_rate": 0.0009234855585183086,
- "loss": 0.0135,
+ "loss": 0.014,
"macro_f1": 0.6666666865348816,
"num_tokens": 3728412.0,
"repeat_count": 0.0,
- "routers_loss": 0.009356650523841381,
+ "routers_loss": 0.007565604057163,
"skip_count": 1.0,
"step": 2312,
"text_loss": 0.21257059276103973
@@ -21983,11 +21983,11 @@
"f1_skip": 0.800000011920929,
"grad_norm": 0.0517578125,
"learning_rate": 0.0009233209263656273,
- "loss": 0.0189,
+ "loss": 0.0184,
"macro_f1": 0.9262410998344421,
"num_tokens": 3731467.0,
"repeat_count": 2.0,
- "routers_loss": 0.02852487564086914,
+ "routers_loss": 0.02510629966855049,
"skip_count": 3.0,
"step": 2314,
"text_loss": 0.21639840304851532
@@ -22000,13 +22000,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.057861328125,
"learning_rate": 0.0009231561319955684,
- "loss": 0.0151,
+ "loss": 0.0154,
"macro_f1": 0.3333333432674408,
"num_tokens": 3734906.0,
"repeat_count": 0.0,
- "routers_loss": 0.007533316500484943,
+ "routers_loss": 0.00872227642685175,
"skip_count": 0.0,
"step": 2316,
"text_loss": 0.35639774799346924
@@ -22019,13 +22019,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.09130859375,
+ "grad_norm": 0.08349609375,
"learning_rate": 0.0009229911754712815,
"loss": 0.0176,
"macro_f1": 0.3333333432674408,
"num_tokens": 3737943.0,
"repeat_count": 0.0,
- "routers_loss": 0.004666361026465893,
+ "routers_loss": 0.004695790819823742,
"skip_count": 0.0,
"step": 2318,
"text_loss": 0.5269573330879211
@@ -22038,32 +22038,32 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.0009228260568559781,
- "loss": 0.0117,
+ "loss": 0.0115,
"macro_f1": 0.3272727429866791,
"num_tokens": 3741833.0,
"repeat_count": 1.0,
- "routers_loss": 0.020992714911699295,
+ "routers_loss": 0.0217357836663723,
"skip_count": 0.0,
"step": 2320,
"text_loss": 0.5110208988189697
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 10.901673026122689,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.1416015625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.1953125,
"learning_rate": 0.0009226607762129322,
- "loss": 0.0204,
- "macro_f1": 0.6603773832321167,
+ "loss": 0.0201,
+ "macro_f1": 0.32098764181137085,
"num_tokens": 3744642.0,
"repeat_count": 1.0,
- "routers_loss": 0.047016773372888565,
+ "routers_loss": 0.05595960095524788,
"skip_count": 1.0,
"step": 2322,
"text_loss": 0.6291998624801636
@@ -22078,11 +22078,11 @@
"f1_skip": 0.0,
"grad_norm": 0.056884765625,
"learning_rate": 0.0009224953336054796,
- "loss": 0.0156,
+ "loss": 0.0161,
"macro_f1": 0.3333333432674408,
"num_tokens": 3748127.0,
"repeat_count": 0.0,
- "routers_loss": 0.006612313445657492,
+ "routers_loss": 0.0071634589694440365,
"skip_count": 0.0,
"step": 2324,
"text_loss": 0.7404762506484985
@@ -22095,13 +22095,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.000922329729097018,
- "loss": 0.0164,
+ "loss": 0.0169,
"macro_f1": 0.3333333432674408,
"num_tokens": 3751373.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012452995870262384,
+ "routers_loss": 0.0011676300782710314,
"skip_count": 0.0,
"step": 2326,
"text_loss": 0.2915459871292114
@@ -22114,13 +22114,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.055908203125,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0009221639627510075,
- "loss": 0.0128,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 3754518.0,
"repeat_count": 0.0,
- "routers_loss": 0.011379311792552471,
+ "routers_loss": 0.01039792038500309,
"skip_count": 0.0,
"step": 2328,
"text_loss": 0.22066321969032288
@@ -22133,13 +22133,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0009219980346309702,
- "loss": 0.0127,
+ "loss": 0.0128,
"macro_f1": 0.3333333432674408,
"num_tokens": 3757621.0,
"repeat_count": 0.0,
- "routers_loss": 0.002973968628793955,
+ "routers_loss": 0.0032070958986878395,
"skip_count": 0.0,
"step": 2330,
"text_loss": 0.5558560490608215
@@ -22152,13 +22152,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.076171875,
"learning_rate": 0.0009218319448004899,
- "loss": 0.012,
+ "loss": 0.0118,
"macro_f1": 0.3333333432674408,
"num_tokens": 3760885.0,
"repeat_count": 0.0,
- "routers_loss": 0.00768645154312253,
+ "routers_loss": 0.007085457909852266,
"skip_count": 0.0,
"step": 2332,
"text_loss": 0.4348253607749939
@@ -22171,13 +22171,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1103515625,
"learning_rate": 0.0009216656933232129,
- "loss": 0.0167,
+ "loss": 0.016,
"macro_f1": 0.6666666865348816,
"num_tokens": 3764462.0,
"repeat_count": 0.0,
- "routers_loss": 0.006761785596609116,
+ "routers_loss": 0.005504854489117861,
"skip_count": 1.0,
"step": 2334,
"text_loss": 0.35828644037246704
@@ -22190,13 +22190,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0576171875,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0009214992802628463,
- "loss": 0.0129,
+ "loss": 0.0131,
"macro_f1": 0.3333333432674408,
"num_tokens": 3767159.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013711688807234168,
+ "routers_loss": 0.0013970810687169433,
"skip_count": 0.0,
"step": 2336,
"text_loss": 0.2956557869911194
@@ -22209,13 +22209,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08154296875,
+ "grad_norm": 0.08203125,
"learning_rate": 0.0009213327056831607,
- "loss": 0.0174,
+ "loss": 0.0181,
"macro_f1": 0.3272727429866791,
"num_tokens": 3770408.0,
"repeat_count": 0.0,
- "routers_loss": 0.04009406641125679,
+ "routers_loss": 0.0427570566534996,
"skip_count": 1.0,
"step": 2338,
"text_loss": 0.14883014559745789
@@ -22228,13 +22228,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0009211659696479875,
- "loss": 0.0095,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 3773474.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013272224459797144,
+ "routers_loss": 0.0011273405980318785,
"skip_count": 0.0,
"step": 2340,
"text_loss": 0.26011669635772705
@@ -22249,11 +22249,11 @@
"f1_skip": 0.0,
"grad_norm": 0.059814453125,
"learning_rate": 0.00092099907222122,
- "loss": 0.0145,
+ "loss": 0.0148,
"macro_f1": 0.3333333432674408,
"num_tokens": 3776909.0,
"repeat_count": 0.0,
- "routers_loss": 0.001724833040498197,
+ "routers_loss": 0.0016178421210497618,
"skip_count": 0.0,
"step": 2342,
"text_loss": 0.49078530073165894
@@ -22266,13 +22266,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05908203125,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.000920832013466814,
- "loss": 0.0132,
+ "loss": 0.0129,
"macro_f1": 0.3333333432674408,
"num_tokens": 3780741.0,
"repeat_count": 0.0,
- "routers_loss": 0.005641496740281582,
+ "routers_loss": 0.005510095041245222,
"skip_count": 0.0,
"step": 2344,
"text_loss": 0.4870249927043915
@@ -22285,13 +22285,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.037109375,
"learning_rate": 0.0009206647934487866,
- "loss": 0.011,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 3784673.0,
"repeat_count": 1.0,
- "routers_loss": 0.003907595761120319,
+ "routers_loss": 0.0047357892617583275,
"skip_count": 0.0,
"step": 2346,
"text_loss": 0.3251725733280182
@@ -22304,13 +22304,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.057861328125,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0009204974122312167,
- "loss": 0.0141,
+ "loss": 0.0142,
"macro_f1": 0.6666666865348816,
"num_tokens": 3787503.0,
"repeat_count": 0.0,
- "routers_loss": 0.007570050656795502,
+ "routers_loss": 0.00795028731226921,
"skip_count": 1.0,
"step": 2348,
"text_loss": 0.18282145261764526
@@ -22323,13 +22323,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.055908203125,
+ "grad_norm": 0.060546875,
"learning_rate": 0.0009203298698782452,
- "loss": 0.0079,
+ "loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 3790528.0,
"repeat_count": 1.0,
- "routers_loss": 0.0009280897793360054,
+ "routers_loss": 0.0009506374481134117,
"skip_count": 0.0,
"step": 2350,
"text_loss": 0.4093080461025238
@@ -22342,13 +22342,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.045166015625,
+ "grad_norm": 0.047607421875,
"learning_rate": 0.0009201621664540747,
"loss": 0.0155,
"macro_f1": 0.6666666865348816,
"num_tokens": 3794134.0,
"repeat_count": 1.0,
- "routers_loss": 0.005288597662001848,
+ "routers_loss": 0.005159572698175907,
"skip_count": 0.0,
"step": 2352,
"text_loss": 0.5451981425285339
@@ -22361,13 +22361,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009199943020229694,
- "loss": 0.0146,
+ "loss": 0.0148,
"macro_f1": 0.3333333432674408,
"num_tokens": 3797414.0,
"repeat_count": 0.0,
- "routers_loss": 0.002237799344584346,
+ "routers_loss": 0.002356168581172824,
"skip_count": 0.0,
"step": 2354,
"text_loss": 0.3070453405380249
@@ -22380,13 +22380,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.0810546875,
"learning_rate": 0.0009198262766492554,
- "loss": 0.0144,
+ "loss": 0.0141,
"macro_f1": 0.6666666865348816,
"num_tokens": 3800094.0,
"repeat_count": 0.0,
- "routers_loss": 0.006226782687008381,
+ "routers_loss": 0.0051761893555521965,
"skip_count": 1.0,
"step": 2356,
"text_loss": 0.5880904197692871
@@ -22399,13 +22399,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.049072265625,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.00091965809039732,
- "loss": 0.0136,
+ "loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 3803280.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027645498048514128,
+ "routers_loss": 0.0025952060241252184,
"skip_count": 0.0,
"step": 2358,
"text_loss": 0.5210731625556946
@@ -22418,13 +22418,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.06787109375,
"learning_rate": 0.0009194897433316127,
- "loss": 0.0122,
+ "loss": 0.0125,
"macro_f1": 0.6666666865348816,
"num_tokens": 3805866.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034913592971861362,
+ "routers_loss": 0.0042560105212032795,
"skip_count": 2.0,
"step": 2360,
"text_loss": 0.6472984552383423
@@ -22437,13 +22437,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08056640625,
+ "grad_norm": 0.07568359375,
"learning_rate": 0.0009193212355166446,
- "loss": 0.0112,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3808952.0,
"repeat_count": 0.0,
- "routers_loss": 0.002706601284444332,
+ "routers_loss": 0.0026232977397739887,
"skip_count": 0.0,
"step": 2362,
"text_loss": 0.450063556432724
@@ -22456,13 +22456,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0009191525670169881,
- "loss": 0.0108,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3812080.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032696903217583895,
+ "routers_loss": 0.0034355956595391035,
"skip_count": 0.0,
"step": 2364,
"text_loss": 0.49727216362953186
@@ -22475,13 +22475,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.000918983737897277,
- "loss": 0.0115,
+ "loss": 0.0112,
"macro_f1": 0.6666666865348816,
"num_tokens": 3815282.0,
"repeat_count": 0.0,
- "routers_loss": 0.006245410069823265,
+ "routers_loss": 0.0055653867311775684,
"skip_count": 1.0,
"step": 2366,
"text_loss": 0.6336377859115601
@@ -22496,11 +22496,11 @@
"f1_skip": 1.0,
"grad_norm": 0.033447265625,
"learning_rate": 0.0009188147482222071,
- "loss": 0.0079,
+ "loss": 0.008,
"macro_f1": 1.0,
"num_tokens": 3818106.0,
"repeat_count": 2.0,
- "routers_loss": 0.011230813339352608,
+ "routers_loss": 0.011016021482646465,
"skip_count": 2.0,
"step": 2368,
"text_loss": 0.22513329982757568
@@ -22515,11 +22515,11 @@
"f1_skip": 0.0,
"grad_norm": 0.04296875,
"learning_rate": 0.0009186455980565358,
- "loss": 0.0109,
+ "loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 3821228.0,
"repeat_count": 1.0,
- "routers_loss": 0.014897257089614868,
+ "routers_loss": 0.014039464294910431,
"skip_count": 0.0,
"step": 2370,
"text_loss": 0.21331638097763062
@@ -22532,13 +22532,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009184762874650816,
- "loss": 0.0131,
+ "loss": 0.0128,
"macro_f1": 0.3333333432674408,
"num_tokens": 3825048.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015503648901358247,
+ "routers_loss": 0.001088051125407219,
"skip_count": 0.0,
"step": 2372,
"text_loss": 0.6031543612480164
@@ -22551,13 +22551,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.095703125,
"learning_rate": 0.0009183068165127245,
- "loss": 0.0127,
+ "loss": 0.013,
"macro_f1": 0.6666666865348816,
"num_tokens": 3828781.0,
"repeat_count": 0.0,
- "routers_loss": 0.00723480898886919,
+ "routers_loss": 0.006263940595090389,
"skip_count": 1.0,
"step": 2374,
"text_loss": 0.6249601244926453
@@ -22570,13 +22570,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.076171875,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0009181371852644062,
- "loss": 0.0139,
+ "loss": 0.0133,
"macro_f1": 0.6666666865348816,
"num_tokens": 3832507.0,
"repeat_count": 1.0,
- "routers_loss": 0.002053398173302412,
+ "routers_loss": 0.001987969037145376,
"skip_count": 0.0,
"step": 2376,
"text_loss": 0.37972065806388855
@@ -22589,32 +22589,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06689453125,
+ "grad_norm": 0.0908203125,
"learning_rate": 0.0009179673937851299,
"loss": 0.0158,
"macro_f1": 0.6666666865348816,
"num_tokens": 3835644.0,
"repeat_count": 0.0,
- "routers_loss": 0.007927518337965012,
+ "routers_loss": 0.007635094691067934,
"skip_count": 1.0,
"step": 2378,
"text_loss": 0.46319663524627686
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 28.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 11.173759906075727,
"f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.06298828125,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.0009177974421399598,
- "loss": 0.0144,
- "macro_f1": 0.5555555820465088,
+ "loss": 0.0137,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 3838700.0,
"repeat_count": 0.0,
- "routers_loss": 0.01924682781100273,
+ "routers_loss": 0.01617279462516308,
"skip_count": 2.0,
"step": 2380,
"text_loss": 0.32141056656837463
@@ -22627,13 +22627,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.046875,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0009176273303940217,
- "loss": 0.0106,
+ "loss": 0.011,
"macro_f1": 0.6666666865348816,
"num_tokens": 3841953.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021689811255782843,
+ "routers_loss": 0.0022273799404501915,
"skip_count": 2.0,
"step": 2382,
"text_loss": 0.5908139944076538
@@ -22646,13 +22646,13 @@
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.062255859375,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0009174570586125026,
- "loss": 0.0119,
+ "loss": 0.0122,
"macro_f1": 0.32098767161369324,
"num_tokens": 3845763.0,
"repeat_count": 1.0,
- "routers_loss": 0.03431013971567154,
+ "routers_loss": 0.030915161594748497,
"skip_count": 0.0,
"step": 2384,
"text_loss": 0.41400137543678284
@@ -22665,13 +22665,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.0009172866268606513,
- "loss": 0.0123,
+ "loss": 0.0122,
"macro_f1": 0.6666666865348816,
"num_tokens": 3848984.0,
"repeat_count": 0.0,
- "routers_loss": 0.008275258354842663,
+ "routers_loss": 0.010480951517820358,
"skip_count": 2.0,
"step": 2386,
"text_loss": 0.2560874819755554
@@ -22684,13 +22684,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04736328125,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0009171160352037775,
- "loss": 0.0121,
+ "loss": 0.0124,
"macro_f1": 0.6666666865348816,
"num_tokens": 3852118.0,
"repeat_count": 0.0,
- "routers_loss": 0.007780806161463261,
+ "routers_loss": 0.00809961836785078,
"skip_count": 1.0,
"step": 2388,
"text_loss": 0.28236693143844604
@@ -22709,7 +22709,7 @@
"macro_f1": 1.0,
"num_tokens": 3855314.0,
"repeat_count": 1.0,
- "routers_loss": 0.00553786288946867,
+ "routers_loss": 0.005569872446358204,
"skip_count": 1.0,
"step": 2390,
"text_loss": 0.4578137695789337
@@ -22722,13 +22722,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08447265625,
+ "grad_norm": 0.1123046875,
"learning_rate": 0.0009167743724365073,
- "loss": 0.01,
+ "loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 3858301.0,
"repeat_count": 0.0,
- "routers_loss": 0.004066115710884333,
+ "routers_loss": 0.0038610948249697685,
"skip_count": 1.0,
"step": 2392,
"text_loss": 0.14082716405391693
@@ -22741,13 +22741,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0810546875,
+ "grad_norm": 0.1376953125,
"learning_rate": 0.0009166033014570368,
- "loss": 0.0104,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3861296.0,
"repeat_count": 0.0,
- "routers_loss": 0.002403446938842535,
+ "routers_loss": 0.0017607157351449132,
"skip_count": 0.0,
"step": 2394,
"text_loss": 0.384442001581192
@@ -22760,13 +22760,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.054443359375,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009164320708343954,
- "loss": 0.0137,
+ "loss": 0.0131,
"macro_f1": 0.6666666865348816,
"num_tokens": 3863985.0,
"repeat_count": 2.0,
- "routers_loss": 0.010212135501205921,
+ "routers_loss": 0.009627950377762318,
"skip_count": 0.0,
"step": 2396,
"text_loss": 0.6969521045684814
@@ -22779,13 +22779,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07275390625,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0009162606806341989,
"loss": 0.0107,
"macro_f1": 0.3333333432674408,
"num_tokens": 3866636.0,
"repeat_count": 0.0,
- "routers_loss": 0.007781816180795431,
+ "routers_loss": 0.006915586534887552,
"skip_count": 0.0,
"step": 2398,
"text_loss": 0.48069697618484497
@@ -22798,32 +22798,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.0009160891309221242,
- "loss": 0.0151,
+ "loss": 0.0149,
"macro_f1": 0.6666666865348816,
"num_tokens": 3870867.0,
"repeat_count": 1.0,
- "routers_loss": 0.0016227158484980464,
+ "routers_loss": 0.0013031222624704242,
"skip_count": 0.0,
"step": 2400,
"text_loss": 0.3882075846195221
},
{
"acc_repeat": 0.5,
- "acc_skip": 1.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
"epoch": 11.277076606985618,
- "f1_execute": 0.9803921580314636,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.6666666865348816,
- "f1_skip": 1.0,
- "grad_norm": 0.06298828125,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009159174217639096,
- "loss": 0.0114,
- "macro_f1": 0.8823530077934265,
+ "loss": 0.0112,
+ "macro_f1": 0.5427350401878357,
"num_tokens": 3873663.0,
"repeat_count": 2.0,
- "routers_loss": 0.06490851938724518,
+ "routers_loss": 0.06621067970991135,
"skip_count": 1.0,
"step": 2402,
"text_loss": 0.5740041136741638
@@ -22836,13 +22836,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0009157455532253547,
- "loss": 0.0075,
+ "loss": 0.0071,
"macro_f1": 0.6666666865348816,
"num_tokens": 3876788.0,
"repeat_count": 1.0,
- "routers_loss": 0.007105287164449692,
+ "routers_loss": 0.005957918707281351,
"skip_count": 0.0,
"step": 2404,
"text_loss": 0.26025933027267456
@@ -22855,13 +22855,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.06787109375,
+ "grad_norm": 0.08642578125,
"learning_rate": 0.0009155735253723191,
- "loss": 0.0125,
+ "loss": 0.0126,
"macro_f1": 0.9452888369560242,
"num_tokens": 3879942.0,
"repeat_count": 1.0,
- "routers_loss": 0.03736003860831261,
+ "routers_loss": 0.039429809898138046,
"skip_count": 4.0,
"step": 2406,
"text_loss": 1.1349908113479614
@@ -22874,13 +22874,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.047607421875,
"learning_rate": 0.0009154013382707251,
- "loss": 0.011,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 3882682.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012925176415592432,
+ "routers_loss": 0.0012570557883009315,
"skip_count": 0.0,
"step": 2408,
"text_loss": 0.5611135363578796
@@ -22895,11 +22895,11 @@
"f1_skip": 0.0,
"grad_norm": 0.034423828125,
"learning_rate": 0.0009152289919865543,
- "loss": 0.0124,
+ "loss": 0.0123,
"macro_f1": 0.3333333432674408,
"num_tokens": 3886425.0,
"repeat_count": 0.0,
- "routers_loss": 0.001746711554005742,
+ "routers_loss": 0.0017455556662753224,
"skip_count": 0.0,
"step": 2410,
"text_loss": 0.7523751854896545
@@ -22912,13 +22912,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0009150564865858506,
- "loss": 0.0112,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 3889273.0,
"repeat_count": 0.0,
- "routers_loss": 0.011005193926393986,
+ "routers_loss": 0.011178011074662209,
"skip_count": 1.0,
"step": 2412,
"text_loss": 0.26942551136016846
@@ -22931,13 +22931,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.800000011920929,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0009148838221347182,
- "loss": 0.0102,
+ "loss": 0.0107,
"macro_f1": 0.5934640765190125,
"num_tokens": 3892199.0,
"repeat_count": 3.0,
- "routers_loss": 0.017795369029045105,
+ "routers_loss": 0.019628092646598816,
"skip_count": 0.0,
"step": 2414,
"text_loss": 0.5492315888404846
@@ -22950,13 +22950,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.0009147109986993225,
"loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 3895362.0,
"repeat_count": 1.0,
- "routers_loss": 0.011693861335515976,
+ "routers_loss": 0.012255983427166939,
"skip_count": 0.0,
"step": 2416,
"text_loss": 0.23798216879367828
@@ -22969,13 +22969,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1259765625,
+ "grad_norm": 0.11669921875,
"learning_rate": 0.0009145380163458899,
- "loss": 0.0177,
+ "loss": 0.0178,
"macro_f1": 0.3333333432674408,
"num_tokens": 3898476.0,
"repeat_count": 0.0,
- "routers_loss": 0.007135285064578056,
+ "routers_loss": 0.007018954027444124,
"skip_count": 0.0,
"step": 2418,
"text_loss": 0.1923145055770874
@@ -22988,13 +22988,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03515625,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0009143648751407074,
- "loss": 0.0082,
+ "loss": 0.0081,
"macro_f1": 0.3333333432674408,
"num_tokens": 3901817.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008607010240666568,
+ "routers_loss": 0.0008574824314564466,
"skip_count": 0.0,
"step": 2420,
"text_loss": 0.4001806974411011
@@ -23007,13 +23007,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.11328125,
"learning_rate": 0.0009141915751501231,
- "loss": 0.0101,
+ "loss": 0.0102,
"macro_f1": 0.5492662787437439,
"num_tokens": 3905461.0,
"repeat_count": 0.0,
- "routers_loss": 0.015359465964138508,
+ "routers_loss": 0.01572350226342678,
"skip_count": 2.0,
"step": 2422,
"text_loss": 0.19519129395484924
@@ -23026,13 +23026,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0380859375,
+ "grad_norm": 0.037353515625,
"learning_rate": 0.0009140181164405458,
- "loss": 0.011,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 3908878.0,
"repeat_count": 0.0,
- "routers_loss": 0.00047823251225054264,
+ "routers_loss": 0.0005503420252352953,
"skip_count": 0.0,
"step": 2424,
"text_loss": 0.6937088370323181
@@ -23047,11 +23047,11 @@
"f1_skip": 0.0,
"grad_norm": 0.068359375,
"learning_rate": 0.0009138444990784454,
- "loss": 0.0129,
+ "loss": 0.013,
"macro_f1": 0.3333333432674408,
"num_tokens": 3912053.0,
"repeat_count": 0.0,
- "routers_loss": 0.0070601715706288815,
+ "routers_loss": 0.007556677330285311,
"skip_count": 0.0,
"step": 2426,
"text_loss": 0.35431069135665894
@@ -23064,13 +23064,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0947265625,
+ "grad_norm": 0.06201171875,
"learning_rate": 0.000913670723130352,
- "loss": 0.0123,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 3915192.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010537977796047926,
+ "routers_loss": 0.0013609991874545813,
"skip_count": 0.0,
"step": 2428,
"text_loss": 0.5171207189559937
@@ -23083,13 +23083,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0517578125,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0009134967886628573,
- "loss": 0.0117,
+ "loss": 0.0115,
"macro_f1": 1.0,
"num_tokens": 3917927.0,
"repeat_count": 2.0,
- "routers_loss": 0.012852456420660019,
+ "routers_loss": 0.010895746760070324,
"skip_count": 2.0,
"step": 2430,
"text_loss": 0.2852934002876282
@@ -23102,13 +23102,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009133226957426133,
- "loss": 0.0134,
+ "loss": 0.0132,
"macro_f1": 0.5492662787437439,
"num_tokens": 3921460.0,
"repeat_count": 2.0,
- "routers_loss": 0.05307198315858841,
+ "routers_loss": 0.04196908697485924,
"skip_count": 0.0,
"step": 2432,
"text_loss": 0.4864770770072937
@@ -23121,13 +23121,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1015625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.0009131484444363324,
- "loss": 0.0154,
+ "loss": 0.0155,
"macro_f1": 0.3333333432674408,
"num_tokens": 3924662.0,
"repeat_count": 0.0,
- "routers_loss": 0.004656757228076458,
+ "routers_loss": 0.004484197124838829,
"skip_count": 0.0,
"step": 2434,
"text_loss": 0.7568684220314026
@@ -23140,13 +23140,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.05078125,
"learning_rate": 0.0009129740348107882,
- "loss": 0.0113,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 3927337.0,
"repeat_count": 0.0,
- "routers_loss": 0.0042406003922224045,
+ "routers_loss": 0.004351360257714987,
"skip_count": 2.0,
"step": 2436,
"text_loss": 0.5953161716461182
@@ -23159,13 +23159,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.0517578125,
+ "grad_norm": 0.04736328125,
"learning_rate": 0.0009127994669328151,
- "loss": 0.0089,
+ "loss": 0.0085,
"macro_f1": 0.6122449040412903,
"num_tokens": 3930407.0,
"repeat_count": 0.0,
- "routers_loss": 0.018079286441206932,
+ "routers_loss": 0.01664198748767376,
"skip_count": 4.0,
"step": 2438,
"text_loss": 0.5320524573326111
@@ -23178,13 +23178,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.0009126247408693071,
- "loss": 0.0072,
+ "loss": 0.0071,
"macro_f1": 0.6666666865348816,
"num_tokens": 3933184.0,
"repeat_count": 0.0,
- "routers_loss": 0.002266801195219159,
+ "routers_loss": 0.0017819046042859554,
"skip_count": 1.0,
"step": 2440,
"text_loss": 0.6051273345947266
@@ -23197,13 +23197,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0009124498566872204,
- "loss": 0.01,
+ "loss": 0.0105,
"macro_f1": 0.3333333432674408,
"num_tokens": 3936620.0,
"repeat_count": 0.0,
- "routers_loss": 0.005790423136204481,
+ "routers_loss": 0.005519696045666933,
"skip_count": 0.0,
"step": 2442,
"text_loss": 0.12987950444221497
@@ -23216,13 +23216,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052734375,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0009122748144535704,
- "loss": 0.011,
+ "loss": 0.0111,
"macro_f1": 0.32098764181137085,
"num_tokens": 3940010.0,
"repeat_count": 0.0,
- "routers_loss": 0.04591076448559761,
+ "routers_loss": 0.04543351009488106,
"skip_count": 2.0,
"step": 2444,
"text_loss": 0.4642033576965332
@@ -23235,13 +23235,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.04296875,
"learning_rate": 0.0009120996142354338,
- "loss": 0.0122,
+ "loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 3943135.0,
"repeat_count": 0.0,
- "routers_loss": 0.004969341680407524,
+ "routers_loss": 0.00550565542653203,
"skip_count": 0.0,
"step": 2446,
"text_loss": 0.5697627067565918
@@ -23254,13 +23254,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05615234375,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.0009119242560999477,
"loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 3946650.0,
"repeat_count": 0.0,
- "routers_loss": 0.00830315612256527,
+ "routers_loss": 0.008842485956847668,
"skip_count": 0.0,
"step": 2448,
"text_loss": 0.17046524584293365
@@ -23273,13 +23273,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.08154296875,
"learning_rate": 0.0009117487401143095,
"loss": 0.0154,
"macro_f1": 0.6666666865348816,
"num_tokens": 3949470.0,
"repeat_count": 1.0,
- "routers_loss": 0.0059144929982721806,
+ "routers_loss": 0.005900127813220024,
"skip_count": 0.0,
"step": 2450,
"text_loss": 0.37260866165161133
@@ -23292,13 +23292,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0009115730663457773,
- "loss": 0.0132,
+ "loss": 0.0137,
"macro_f1": 1.0,
"num_tokens": 3952546.0,
"repeat_count": 1.0,
- "routers_loss": 0.0029762545600533485,
+ "routers_loss": 0.003409258322790265,
"skip_count": 1.0,
"step": 2452,
"text_loss": 0.5308008193969727
@@ -23311,13 +23311,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.052001953125,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0009113972348616698,
- "loss": 0.0091,
+ "loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 3955817.0,
"repeat_count": 0.0,
- "routers_loss": 0.011962058953940868,
+ "routers_loss": 0.010098597034811974,
"skip_count": 1.0,
"step": 2454,
"text_loss": 0.39226648211479187
@@ -23330,13 +23330,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1474609375,
+ "grad_norm": 0.1640625,
"learning_rate": 0.0009112212457293658,
- "loss": 0.0101,
+ "loss": 0.0102,
"macro_f1": 0.3272727429866791,
"num_tokens": 3958911.0,
"repeat_count": 0.0,
- "routers_loss": 0.07289884239435196,
+ "routers_loss": 0.08184818178415298,
"skip_count": 0.0,
"step": 2456,
"text_loss": 0.45411455631256104
@@ -23349,13 +23349,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.041259765625,
"learning_rate": 0.0009110450990163047,
- "loss": 0.0124,
+ "loss": 0.0127,
"macro_f1": 0.3333333432674408,
"num_tokens": 3962584.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009638209594413638,
+ "routers_loss": 0.0009352223132736981,
"skip_count": 0.0,
"step": 2458,
"text_loss": 0.47292324900627136
@@ -23368,13 +23368,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0400390625,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0009108687947899863,
- "loss": 0.0078,
+ "loss": 0.0077,
"macro_f1": 1.0,
"num_tokens": 3965597.0,
"repeat_count": 1.0,
- "routers_loss": 0.008587516844272614,
+ "routers_loss": 0.008150188252329826,
"skip_count": 2.0,
"step": 2460,
"text_loss": 0.33208340406417847
@@ -23387,13 +23387,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0009106923331179707,
- "loss": 0.0126,
+ "loss": 0.0125,
"macro_f1": 0.5492662787437439,
"num_tokens": 3968664.0,
"repeat_count": 0.0,
- "routers_loss": 0.05080332234501839,
+ "routers_loss": 0.050999004393815994,
"skip_count": 2.0,
"step": 2462,
"text_loss": 0.2459995150566101
@@ -23406,13 +23406,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07080078125,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009105157140678782,
- "loss": 0.0124,
+ "loss": 0.0126,
"macro_f1": 0.6666666865348816,
"num_tokens": 3971772.0,
"repeat_count": 0.0,
- "routers_loss": 0.007348654326051474,
+ "routers_loss": 0.006196586415171623,
"skip_count": 1.0,
"step": 2464,
"text_loss": 0.23956991732120514
@@ -23425,13 +23425,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06787109375,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009103389377073896,
- "loss": 0.0099,
+ "loss": 0.01,
"macro_f1": 0.3333333432674408,
"num_tokens": 3976224.0,
"repeat_count": 0.0,
- "routers_loss": 0.007161752786487341,
+ "routers_loss": 0.008181816898286343,
"skip_count": 0.0,
"step": 2466,
"text_loss": 0.3235875070095062
@@ -23444,13 +23444,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.057373046875,
"learning_rate": 0.0009101620041042462,
- "loss": 0.0119,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 3978876.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015090530505403876,
+ "routers_loss": 0.0015451472718268633,
"skip_count": 0.0,
"step": 2468,
"text_loss": 0.4038759469985962
@@ -23463,13 +23463,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07275390625,
+ "grad_norm": 0.09130859375,
"learning_rate": 0.000909984913326249,
- "loss": 0.0129,
+ "loss": 0.0131,
"macro_f1": 0.3272727429866791,
"num_tokens": 3981992.0,
"repeat_count": 0.0,
- "routers_loss": 0.021420184522867203,
+ "routers_loss": 0.021785033866763115,
"skip_count": 1.0,
"step": 2470,
"text_loss": 0.6346460580825806
@@ -23482,13 +23482,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0009098076654412595,
- "loss": 0.0092,
+ "loss": 0.0094,
"macro_f1": 0.3333333432674408,
"num_tokens": 3984560.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010742908343672752,
+ "routers_loss": 0.0011462471447885036,
"skip_count": 0.0,
"step": 2472,
"text_loss": 0.3449646532535553
@@ -23501,13 +23501,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05078125,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.0009096302605171996,
- "loss": 0.011,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 3987548.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015209210105240345,
+ "routers_loss": 0.0014367027906700969,
"skip_count": 0.0,
"step": 2474,
"text_loss": 0.5918350219726562
@@ -23520,13 +23520,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.0478515625,
"learning_rate": 0.0009094526986220513,
"loss": 0.0124,
"macro_f1": 0.3333333432674408,
"num_tokens": 3990727.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008761848439462483,
+ "routers_loss": 0.0008977655088528991,
"skip_count": 0.0,
"step": 2476,
"text_loss": 0.463350385427475
@@ -23539,13 +23539,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0009092749798238563,
- "loss": 0.0146,
+ "loss": 0.015,
"macro_f1": 0.3272727429866791,
"num_tokens": 3993757.0,
"repeat_count": 1.0,
- "routers_loss": 0.01623794063925743,
+ "routers_loss": 0.016712551936507225,
"skip_count": 0.0,
"step": 2478,
"text_loss": 0.5621229410171509
@@ -23558,13 +23558,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07080078125,
+ "grad_norm": 0.06640625,
"learning_rate": 0.000909097104190717,
- "loss": 0.0174,
+ "loss": 0.0172,
"macro_f1": 0.32098764181137085,
"num_tokens": 3997259.0,
"repeat_count": 0.0,
- "routers_loss": 0.04170118644833565,
+ "routers_loss": 0.04134179651737213,
"skip_count": 2.0,
"step": 2480,
"text_loss": 0.375476598739624
@@ -23577,32 +23577,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.046875,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0009089190717907956,
- "loss": 0.0116,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 4000563.0,
"repeat_count": 0.0,
- "routers_loss": 0.003591755870729685,
+ "routers_loss": 0.003462378401309252,
"skip_count": 0.0,
"step": 2482,
"text_loss": 0.5553798675537109
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 11.66216612855885,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.0693359375,
+ "f1_skip": 1.0,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0009087408826923146,
- "loss": 0.0185,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0182,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 4004065.0,
"repeat_count": 0.0,
- "routers_loss": 0.009214848279953003,
+ "routers_loss": 0.008057428523898125,
"skip_count": 2.0,
"step": 2484,
"text_loss": 0.4329465329647064
@@ -23615,13 +23615,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0009085625369635564,
- "loss": 0.0111,
+ "loss": 0.0114,
"macro_f1": 0.3333333432674408,
"num_tokens": 4007119.0,
"repeat_count": 0.0,
- "routers_loss": 0.0059350160881876945,
+ "routers_loss": 0.005759050603955984,
"skip_count": 0.0,
"step": 2486,
"text_loss": 0.501268744468689
@@ -23634,13 +23634,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10693359375,
+ "grad_norm": 0.1240234375,
"learning_rate": 0.0009083840346728631,
- "loss": 0.0118,
+ "loss": 0.0122,
"macro_f1": 0.3272727429866791,
"num_tokens": 4010547.0,
"repeat_count": 1.0,
- "routers_loss": 0.019803427159786224,
+ "routers_loss": 0.020763102918863297,
"skip_count": 0.0,
"step": 2488,
"text_loss": 0.480196475982666
@@ -23653,13 +23653,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.058349609375,
+ "grad_norm": 0.05078125,
"learning_rate": 0.0009082053758886374,
- "loss": 0.0118,
+ "loss": 0.0117,
"macro_f1": 0.6666666865348816,
"num_tokens": 4014600.0,
"repeat_count": 0.0,
- "routers_loss": 0.006243673153221607,
+ "routers_loss": 0.005801836494356394,
"skip_count": 1.0,
"step": 2490,
"text_loss": 0.18249782919883728
@@ -23672,13 +23672,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0009080265606793416,
- "loss": 0.0132,
+ "loss": 0.0128,
"macro_f1": 1.0,
"num_tokens": 4017964.0,
"repeat_count": 1.0,
- "routers_loss": 0.003960726782679558,
+ "routers_loss": 0.004226063843816519,
"skip_count": 1.0,
"step": 2492,
"text_loss": 0.6573076248168945
@@ -23691,13 +23691,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0537109375,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.000907847589113498,
- "loss": 0.0127,
+ "loss": 0.0125,
"macro_f1": 0.6666666865348816,
"num_tokens": 4020694.0,
"repeat_count": 0.0,
- "routers_loss": 0.004959117621183395,
+ "routers_loss": 0.004281101748347282,
"skip_count": 2.0,
"step": 2494,
"text_loss": 0.3944586217403412
@@ -23710,13 +23710,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.000907668461259689,
- "loss": 0.0157,
+ "loss": 0.0152,
"macro_f1": 0.6666666865348816,
"num_tokens": 4023757.0,
"repeat_count": 0.0,
- "routers_loss": 0.009721433743834496,
+ "routers_loss": 0.008786370046436787,
"skip_count": 1.0,
"step": 2496,
"text_loss": 0.6452898979187012
@@ -23729,13 +23729,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0009074891771865566,
- "loss": 0.0124,
+ "loss": 0.0125,
"macro_f1": 0.3333333432674408,
"num_tokens": 4026601.0,
"repeat_count": 0.0,
- "routers_loss": 0.00491701066493988,
+ "routers_loss": 0.005209595896303654,
"skip_count": 0.0,
"step": 2498,
"text_loss": 0.9633619785308838
@@ -23748,13 +23748,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.03759765625,
"learning_rate": 0.0009073097369628028,
- "loss": 0.0131,
+ "loss": 0.013,
"macro_f1": 1.0,
"num_tokens": 4030321.0,
"repeat_count": 3.0,
- "routers_loss": 0.009832080453634262,
+ "routers_loss": 0.00860709697008133,
"skip_count": 1.0,
"step": 2500,
"text_loss": 0.48566827178001404
@@ -23767,13 +23767,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047607421875,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0009071301406571893,
- "loss": 0.0137,
+ "loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 4033234.0,
"repeat_count": 0.0,
- "routers_loss": 0.003301833290606737,
+ "routers_loss": 0.0035277456045150757,
"skip_count": 0.0,
"step": 2502,
"text_loss": 0.3771554231643677
@@ -23786,13 +23786,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.000906950388338538,
- "loss": 0.0134,
+ "loss": 0.0136,
"macro_f1": 0.3333333432674408,
"num_tokens": 4036417.0,
"repeat_count": 0.0,
- "routers_loss": 0.001580960932187736,
+ "routers_loss": 0.0013424850767478347,
"skip_count": 0.0,
"step": 2504,
"text_loss": 0.8962806463241577
@@ -23805,13 +23805,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0908203125,
+ "grad_norm": 0.09912109375,
"learning_rate": 0.0009067704800757301,
- "loss": 0.0091,
+ "loss": 0.0095,
"macro_f1": 0.3333333432674408,
"num_tokens": 4039564.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011505817528814077,
+ "routers_loss": 0.0010423909407109022,
"skip_count": 0.0,
"step": 2506,
"text_loss": 0.43170279264450073
@@ -23824,13 +23824,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.000906590415937707,
- "loss": 0.0095,
+ "loss": 0.0094,
"macro_f1": 0.3272727429866791,
"num_tokens": 4043212.0,
"repeat_count": 0.0,
- "routers_loss": 0.023224346339702606,
+ "routers_loss": 0.021780289709568024,
"skip_count": 1.0,
"step": 2508,
"text_loss": 0.41495826840400696
@@ -23843,13 +23843,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0009064101959934696,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 4046687.0,
"repeat_count": 0.0,
- "routers_loss": 0.007955167442560196,
+ "routers_loss": 0.007261929102241993,
"skip_count": 1.0,
"step": 2510,
"text_loss": 0.21821187436580658
@@ -23862,13 +23862,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.057861328125,
"learning_rate": 0.0009062298203120783,
- "loss": 0.0101,
+ "loss": 0.0102,
"macro_f1": 0.6666666865348816,
"num_tokens": 4050735.0,
"repeat_count": 0.0,
- "routers_loss": 0.006164440419524908,
+ "routers_loss": 0.007447180338203907,
"skip_count": 2.0,
"step": 2512,
"text_loss": 0.1818767935037613
@@ -23881,13 +23881,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.058837890625,
+ "grad_norm": 0.06494140625,
"learning_rate": 0.0009060492889626535,
- "loss": 0.014,
+ "loss": 0.0142,
"macro_f1": 0.3272727429866791,
"num_tokens": 4054426.0,
"repeat_count": 1.0,
- "routers_loss": 0.0713663101196289,
+ "routers_loss": 0.0718490406870842,
"skip_count": 0.0,
"step": 2514,
"text_loss": 0.22798970341682434
@@ -23900,13 +23900,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08544921875,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0009058686020143753,
- "loss": 0.0182,
+ "loss": 0.0183,
"macro_f1": 0.3333333432674408,
"num_tokens": 4057615.0,
"repeat_count": 0.0,
- "routers_loss": 0.0052308146841824055,
+ "routers_loss": 0.0052676633931696415,
"skip_count": 0.0,
"step": 2516,
"text_loss": 0.1712338626384735
@@ -23919,13 +23919,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04052734375,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0009056877595364832,
- "loss": 0.0143,
+ "loss": 0.0137,
"macro_f1": 0.3333333432674408,
"num_tokens": 4060338.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020465939305722713,
+ "routers_loss": 0.0018052728846669197,
"skip_count": 0.0,
"step": 2518,
"text_loss": 0.6811438798904419
@@ -23938,13 +23938,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.083984375,
"learning_rate": 0.0009055067615982761,
- "loss": 0.0114,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 4062887.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008663221378810704,
+ "routers_loss": 0.0009029926732182503,
"skip_count": 0.0,
"step": 2520,
"text_loss": 0.5480356812477112
@@ -23957,13 +23957,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.051025390625,
"learning_rate": 0.0009053256082691133,
- "loss": 0.0104,
+ "loss": 0.0106,
"macro_f1": 0.3333333432674408,
"num_tokens": 4065357.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026889131404459476,
+ "routers_loss": 0.0027515271212905645,
"skip_count": 0.0,
"step": 2522,
"text_loss": 0.5234101414680481
@@ -23978,11 +23978,11 @@
"f1_skip": 0.0,
"grad_norm": 0.08203125,
"learning_rate": 0.0009051442996184127,
- "loss": 0.0181,
+ "loss": 0.0174,
"macro_f1": 0.3333333432674408,
"num_tokens": 4068111.0,
"repeat_count": 0.0,
- "routers_loss": 0.002255887258797884,
+ "routers_loss": 0.002199822571128607,
"skip_count": 0.0,
"step": 2524,
"text_loss": 0.2418575882911682
@@ -23995,13 +23995,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.060546875,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009049628357156521,
- "loss": 0.0144,
+ "loss": 0.0143,
"macro_f1": 0.6666666865348816,
"num_tokens": 4071284.0,
"repeat_count": 0.0,
- "routers_loss": 0.005672316066920757,
+ "routers_loss": 0.006303096655756235,
"skip_count": 2.0,
"step": 2526,
"text_loss": 0.7948065996170044
@@ -24014,13 +24014,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0380859375,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.000904781216630369,
- "loss": 0.007,
+ "loss": 0.0068,
"macro_f1": 0.6601307392120361,
"num_tokens": 4074750.0,
"repeat_count": 1.0,
- "routers_loss": 0.017167411744594574,
+ "routers_loss": 0.01791904680430889,
"skip_count": 2.0,
"step": 2528,
"text_loss": 0.809726357460022
@@ -24033,13 +24033,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.053955078125,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0009045994424321602,
- "loss": 0.0101,
+ "loss": 0.0102,
"macro_f1": 1.0,
"num_tokens": 4078617.0,
"repeat_count": 2.0,
- "routers_loss": 0.019105618819594383,
+ "routers_loss": 0.016553178429603577,
"skip_count": 2.0,
"step": 2530,
"text_loss": 0.8755000829696655
@@ -24052,13 +24052,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.060791015625,
+ "grad_norm": 0.061767578125,
"learning_rate": 0.0009044175131906817,
"loss": 0.0145,
"macro_f1": 0.3333333432674408,
"num_tokens": 4080936.0,
"repeat_count": 0.0,
- "routers_loss": 0.007993129082024097,
+ "routers_loss": 0.00884837657213211,
"skip_count": 0.0,
"step": 2532,
"text_loss": 0.795871913433075
@@ -24071,13 +24071,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.0009042354289756491,
- "loss": 0.0124,
+ "loss": 0.0122,
"macro_f1": 0.3333333432674408,
"num_tokens": 4084459.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024954001419246197,
+ "routers_loss": 0.0024387789890170097,
"skip_count": 0.0,
"step": 2534,
"text_loss": 0.18875400722026825
@@ -24090,13 +24090,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.0625,
"learning_rate": 0.0009040531898568379,
- "loss": 0.0169,
+ "loss": 0.0171,
"macro_f1": 0.3333333432674408,
"num_tokens": 4088464.0,
"repeat_count": 0.0,
- "routers_loss": 0.004360117018222809,
+ "routers_loss": 0.00491489190608263,
"skip_count": 0.0,
"step": 2536,
"text_loss": 0.334369033575058
@@ -24109,13 +24109,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0927734375,
+ "grad_norm": 0.091796875,
"learning_rate": 0.000903870795904082,
- "loss": 0.0142,
+ "loss": 0.0145,
"macro_f1": 0.6666666865348816,
"num_tokens": 4091659.0,
"repeat_count": 0.0,
- "routers_loss": 0.00429064966738224,
+ "routers_loss": 0.004592662677168846,
"skip_count": 2.0,
"step": 2538,
"text_loss": 0.21298295259475708
@@ -24130,11 +24130,11 @@
"f1_skip": 0.6666666865348816,
"grad_norm": 0.0458984375,
"learning_rate": 0.000903688247187275,
- "loss": 0.0136,
+ "loss": 0.0137,
"macro_f1": 0.5492662787437439,
"num_tokens": 4095496.0,
"repeat_count": 0.0,
- "routers_loss": 0.0132954316213727,
+ "routers_loss": 0.011647242121398449,
"skip_count": 2.0,
"step": 2540,
"text_loss": 0.2985081672668457
@@ -24147,13 +24147,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0009035055437763704,
- "loss": 0.0129,
+ "loss": 0.0124,
"macro_f1": 0.3333333432674408,
"num_tokens": 4098663.0,
"repeat_count": 0.0,
- "routers_loss": 0.002104961546137929,
+ "routers_loss": 0.0021238960325717926,
"skip_count": 0.0,
"step": 2542,
"text_loss": 0.35359489917755127
@@ -24166,13 +24166,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.060791015625,
+ "grad_norm": 0.05859375,
"learning_rate": 0.0009033226857413803,
- "loss": 0.0167,
+ "loss": 0.0163,
"macro_f1": 0.6666666865348816,
"num_tokens": 4101588.0,
"repeat_count": 1.0,
- "routers_loss": 0.002973714144900441,
+ "routers_loss": 0.0024701557122170925,
"skip_count": 0.0,
"step": 2544,
"text_loss": 1.1577601432800293
@@ -24185,13 +24185,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.080078125,
"learning_rate": 0.000903139673152376,
- "loss": 0.0119,
+ "loss": 0.012,
"macro_f1": 0.3333333432674408,
"num_tokens": 4104643.0,
"repeat_count": 0.0,
- "routers_loss": 0.002359170001000166,
+ "routers_loss": 0.002499542199075222,
"skip_count": 0.0,
"step": 2546,
"text_loss": 1.0173401832580566
@@ -24204,13 +24204,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0615234375,
+ "grad_norm": 0.059814453125,
"learning_rate": 0.0009029565060794885,
- "loss": 0.0168,
+ "loss": 0.0165,
"macro_f1": 0.3333333432674408,
"num_tokens": 4109247.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033595687709748745,
+ "routers_loss": 0.0034200598020106554,
"skip_count": 0.0,
"step": 2548,
"text_loss": 0.5690504312515259
@@ -24223,13 +24223,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.07421875,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0009027731845929079,
"loss": 0.0155,
"macro_f1": 0.8823530077934265,
"num_tokens": 4112597.0,
"repeat_count": 1.0,
- "routers_loss": 0.015323673374950886,
+ "routers_loss": 0.015981333330273628,
"skip_count": 1.0,
"step": 2550,
"text_loss": 0.294549822807312
@@ -24242,13 +24242,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.06103515625,
"learning_rate": 0.0009025897087628829,
- "loss": 0.0063,
+ "loss": 0.0064,
"macro_f1": 0.5492662787437439,
"num_tokens": 4115844.0,
"repeat_count": 0.0,
- "routers_loss": 0.02122018299996853,
+ "routers_loss": 0.02606951631605625,
"skip_count": 2.0,
"step": 2552,
"text_loss": 0.22692419588565826
@@ -24261,13 +24261,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07763671875,
+ "grad_norm": 0.080078125,
"learning_rate": 0.0009024060786597222,
"loss": 0.0202,
"macro_f1": 0.3333333432674408,
"num_tokens": 4118634.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010765352053567767,
+ "routers_loss": 0.001026194542646408,
"skip_count": 0.0,
"step": 2554,
"text_loss": 0.6807059645652771
@@ -24280,13 +24280,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.000902222294353793,
- "loss": 0.0128,
+ "loss": 0.0124,
"macro_f1": 0.3333333432674408,
"num_tokens": 4122024.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017301233019679785,
+ "routers_loss": 0.001974924933165312,
"skip_count": 0.0,
"step": 2556,
"text_loss": 0.7373668551445007
@@ -24299,13 +24299,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.055908203125,
+ "grad_norm": 0.04833984375,
"learning_rate": 0.0009020383559155219,
- "loss": 0.0056,
+ "loss": 0.0054,
"macro_f1": 1.0,
"num_tokens": 4124803.0,
"repeat_count": 1.0,
- "routers_loss": 0.004307204391807318,
+ "routers_loss": 0.004662613850086927,
"skip_count": 2.0,
"step": 2558,
"text_loss": 0.21808166801929474
@@ -24318,13 +24318,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.029541015625,
+ "grad_norm": 0.0263671875,
"learning_rate": 0.0009018542634153943,
- "loss": 0.0064,
+ "loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 4127680.0,
"repeat_count": 0.0,
- "routers_loss": 0.0073805381543934345,
+ "routers_loss": 0.006881687790155411,
"skip_count": 0.0,
"step": 2560,
"text_loss": 0.25192978978157043
@@ -24339,11 +24339,11 @@
"f1_skip": 1.0,
"grad_norm": 0.049560546875,
"learning_rate": 0.0009016700169239551,
- "loss": 0.0108,
+ "loss": 0.0105,
"macro_f1": 1.0,
"num_tokens": 4130431.0,
"repeat_count": 1.0,
- "routers_loss": 0.005493874195963144,
+ "routers_loss": 0.005977808032184839,
"skip_count": 1.0,
"step": 2562,
"text_loss": 0.4700816869735718
@@ -24356,13 +24356,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0009014856165118075,
- "loss": 0.0154,
+ "loss": 0.0153,
"macro_f1": 0.6666666865348816,
"num_tokens": 4133535.0,
"repeat_count": 0.0,
- "routers_loss": 0.006889877840876579,
+ "routers_loss": 0.007005698047578335,
"skip_count": 1.0,
"step": 2564,
"text_loss": 0.6558199524879456
@@ -24375,13 +24375,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03125,
+ "grad_norm": 0.030517578125,
"learning_rate": 0.0009013010622496144,
- "loss": 0.009,
+ "loss": 0.0088,
"macro_f1": 0.3333333432674408,
"num_tokens": 4136534.0,
"repeat_count": 0.0,
- "routers_loss": 0.008495541289448738,
+ "routers_loss": 0.007262171246111393,
"skip_count": 0.0,
"step": 2566,
"text_loss": 0.2565421462059021
@@ -24394,13 +24394,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0009011163542080971,
- "loss": 0.0089,
+ "loss": 0.0088,
"macro_f1": 0.5934640765190125,
"num_tokens": 4139762.0,
"repeat_count": 0.0,
- "routers_loss": 0.05929862707853317,
+ "routers_loss": 0.05431923270225525,
"skip_count": 3.0,
"step": 2568,
"text_loss": 0.19896510243415833
@@ -24413,13 +24413,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.026611328125,
"learning_rate": 0.0009009314924580363,
- "loss": 0.0086,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 4143398.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033934004604816437,
+ "routers_loss": 0.003667369019240141,
"skip_count": 0.0,
"step": 2570,
"text_loss": 0.6581419110298157
@@ -24432,13 +24432,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.054931640625,
+ "grad_norm": 0.052978515625,
"learning_rate": 0.0009007464770702712,
"loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 4146248.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012826769379898906,
+ "routers_loss": 0.00132099783513695,
"skip_count": 0.0,
"step": 2572,
"text_loss": 0.5316711068153381
@@ -24451,13 +24451,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.0009005613081157002,
"loss": 0.0132,
"macro_f1": 0.3333333432674408,
"num_tokens": 4149455.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019460092298686504,
+ "routers_loss": 0.0020061524119228125,
"skip_count": 0.0,
"step": 2574,
"text_loss": 0.5400773882865906
@@ -24470,13 +24470,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.064453125,
+ "grad_norm": 0.05517578125,
"learning_rate": 0.0009003759856652802,
- "loss": 0.0112,
+ "loss": 0.0111,
"macro_f1": 0.6666666865348816,
"num_tokens": 4152774.0,
"repeat_count": 0.0,
- "routers_loss": 0.004493138287216425,
+ "routers_loss": 0.002621434163302183,
"skip_count": 1.0,
"step": 2576,
"text_loss": 0.3672606945037842
@@ -24489,13 +24489,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.055908203125,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0009001905097900273,
"loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 4155835.0,
"repeat_count": 0.0,
- "routers_loss": 0.005607665050774813,
+ "routers_loss": 0.005290219560265541,
"skip_count": 0.0,
"step": 2578,
"text_loss": 0.8159038424491882
@@ -24508,13 +24508,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.040771484375,
"learning_rate": 0.0009000048805610161,
- "loss": 0.0123,
+ "loss": 0.0119,
"macro_f1": 0.3333333432674408,
"num_tokens": 4158874.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015080278972163796,
+ "routers_loss": 0.0013576085912063718,
"skip_count": 0.0,
"step": 2580,
"text_loss": 0.5518951416015625
@@ -24529,11 +24529,11 @@
"f1_skip": 0.0,
"grad_norm": 0.138671875,
"learning_rate": 0.00089981909804938,
- "loss": 0.0142,
+ "loss": 0.0143,
"macro_f1": 0.3333333432674408,
"num_tokens": 4162076.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022276053205132484,
+ "routers_loss": 0.0021483441814780235,
"skip_count": 0.0,
"step": 2582,
"text_loss": 0.43552228808403015
@@ -24546,13 +24546,13 @@
"f1_execute": 0.9387754797935486,
"f1_repeat": 1.0,
"f1_skip": 0.4000000059604645,
- "grad_norm": 0.07421875,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0008996331623263114,
- "loss": 0.0116,
+ "loss": 0.0117,
"macro_f1": 0.7795917987823486,
"num_tokens": 4165041.0,
"repeat_count": 1.0,
- "routers_loss": 0.0499282106757164,
+ "routers_loss": 0.0544300302863121,
"skip_count": 4.0,
"step": 2584,
"text_loss": 0.24812501668930054
@@ -24565,13 +24565,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.047607421875,
"learning_rate": 0.0008994470734630611,
- "loss": 0.01,
+ "loss": 0.0101,
"macro_f1": 0.3333333432674408,
"num_tokens": 4168290.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016360745066776872,
+ "routers_loss": 0.0017150711501017213,
"skip_count": 0.0,
"step": 2586,
"text_loss": 0.6392097473144531
@@ -24584,32 +24584,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0008992608315309388,
- "loss": 0.0149,
+ "loss": 0.015,
"macro_f1": 0.6666666865348816,
"num_tokens": 4171310.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037772543728351593,
+ "routers_loss": 0.0046473173424601555,
"skip_count": 2.0,
"step": 2588,
"text_loss": 0.6534156799316406
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 12.15967126504256,
- "f1_execute": 0.9615384340286255,
+ "f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.060791015625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.06591796875,
"learning_rate": 0.0008990744366013125,
- "loss": 0.0104,
- "macro_f1": 0.6538461446762085,
+ "loss": 0.0105,
+ "macro_f1": 0.3144654333591461,
"num_tokens": 4174042.0,
"repeat_count": 2.0,
- "routers_loss": 0.05992122367024422,
+ "routers_loss": 0.060913100838661194,
"skip_count": 1.0,
"step": 2590,
"text_loss": 0.5365690588951111
@@ -24622,13 +24622,13 @@
"f1_execute": 0.9583333134651184,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.055419921875,
"learning_rate": 0.0008988878887456093,
"loss": 0.0118,
"macro_f1": 0.6051587462425232,
"num_tokens": 4177666.0,
"repeat_count": 1.0,
- "routers_loss": 0.0679154023528099,
+ "routers_loss": 0.06268956512212753,
"skip_count": 4.0,
"step": 2592,
"text_loss": 0.226226806640625
@@ -24643,11 +24643,11 @@
"f1_skip": 0.0,
"grad_norm": 0.03662109375,
"learning_rate": 0.0008987011880353149,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.32098764181137085,
"num_tokens": 4180490.0,
"repeat_count": 0.0,
- "routers_loss": 0.03284052759408951,
+ "routers_loss": 0.030141465365886688,
"skip_count": 2.0,
"step": 2594,
"text_loss": 0.2581401765346527
@@ -24660,13 +24660,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.051513671875,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0008985143345419729,
- "loss": 0.0087,
+ "loss": 0.0082,
"macro_f1": 0.5492662787437439,
"num_tokens": 4183300.0,
"repeat_count": 0.0,
- "routers_loss": 0.01971421390771866,
+ "routers_loss": 0.018745863810181618,
"skip_count": 2.0,
"step": 2596,
"text_loss": 0.7778542637825012
@@ -24679,13 +24679,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0703125,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0008983273283371862,
- "loss": 0.0099,
+ "loss": 0.0096,
"macro_f1": 0.5492662787437439,
"num_tokens": 4186535.0,
"repeat_count": 0.0,
- "routers_loss": 0.028065117076039314,
+ "routers_loss": 0.026792079210281372,
"skip_count": 2.0,
"step": 2598,
"text_loss": 0.34700271487236023
@@ -24698,13 +24698,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0008981401694926159,
- "loss": 0.0077,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 4189082.0,
"repeat_count": 0.0,
- "routers_loss": 0.00166845612693578,
+ "routers_loss": 0.001914160675369203,
"skip_count": 0.0,
"step": 2600,
"text_loss": 0.6879339218139648
@@ -24717,13 +24717,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0008979528580799815,
- "loss": 0.0138,
+ "loss": 0.0136,
"macro_f1": 0.6666666865348816,
"num_tokens": 4192330.0,
"repeat_count": 0.0,
- "routers_loss": 0.007527270819991827,
+ "routers_loss": 0.007978348061442375,
"skip_count": 2.0,
"step": 2602,
"text_loss": 0.3524550497531891
@@ -24736,13 +24736,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.0008977653941710613,
- "loss": 0.0137,
+ "loss": 0.0134,
"macro_f1": 0.6666666865348816,
"num_tokens": 4196117.0,
"repeat_count": 2.0,
- "routers_loss": 0.00412185862660408,
+ "routers_loss": 0.0035376469604671,
"skip_count": 0.0,
"step": 2604,
"text_loss": 0.42356348037719727
@@ -24755,13 +24755,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06005859375,
+ "grad_norm": 0.05810546875,
"learning_rate": 0.0008975777778376916,
- "loss": 0.0157,
+ "loss": 0.0156,
"macro_f1": 0.6666666865348816,
"num_tokens": 4200423.0,
"repeat_count": 0.0,
- "routers_loss": 0.007787751499563456,
+ "routers_loss": 0.008262477815151215,
"skip_count": 1.0,
"step": 2606,
"text_loss": 0.5272893905639648
@@ -24774,13 +24774,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0008973900091517675,
"loss": 0.0114,
"macro_f1": 0.3272727429866791,
"num_tokens": 4203257.0,
"repeat_count": 0.0,
- "routers_loss": 0.024111779406666756,
+ "routers_loss": 0.022957922890782356,
"skip_count": 1.0,
"step": 2608,
"text_loss": 0.2713734805583954
@@ -24793,13 +24793,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.045166015625,
+ "grad_norm": 0.043701171875,
"learning_rate": 0.000897202088185242,
- "loss": 0.0091,
+ "loss": 0.0085,
"macro_f1": 0.6666666865348816,
"num_tokens": 4206243.0,
"repeat_count": 0.0,
- "routers_loss": 0.0057326615788042545,
+ "routers_loss": 0.006623407825827599,
"skip_count": 2.0,
"step": 2610,
"text_loss": 0.5920525789260864
@@ -24812,13 +24812,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.0517578125,
"learning_rate": 0.0008970140150101274,
- "loss": 0.0118,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 4209264.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008877563523128629,
+ "routers_loss": 0.0008602747693657875,
"skip_count": 0.0,
"step": 2612,
"text_loss": 0.33421996235847473
@@ -24831,13 +24831,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.030517578125,
"learning_rate": 0.0008968257896984932,
- "loss": 0.0067,
+ "loss": 0.0062,
"macro_f1": 0.6666666865348816,
"num_tokens": 4212058.0,
"repeat_count": 0.0,
- "routers_loss": 0.0039034869987517595,
+ "routers_loss": 0.0024653903674334288,
"skip_count": 1.0,
"step": 2614,
"text_loss": 0.37923356890678406
@@ -24850,13 +24850,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0008966374123224677,
- "loss": 0.0085,
+ "loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4214929.0,
"repeat_count": 0.0,
- "routers_loss": 0.01140254084020853,
+ "routers_loss": 0.010878405533730984,
"skip_count": 0.0,
"step": 2616,
"text_loss": 0.4350503981113434
@@ -24869,13 +24869,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03125,
+ "grad_norm": 0.0303955078125,
"learning_rate": 0.0008964488829542376,
"loss": 0.0083,
"macro_f1": 0.3272727429866791,
"num_tokens": 4219170.0,
"repeat_count": 0.0,
- "routers_loss": 0.028559349477291107,
+ "routers_loss": 0.02864212542772293,
"skip_count": 1.0,
"step": 2618,
"text_loss": 0.26250728964805603
@@ -24888,13 +24888,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.061279296875,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0008962602016660478,
- "loss": 0.0097,
+ "loss": 0.0096,
"macro_f1": 0.6666666865348816,
"num_tokens": 4222077.0,
"repeat_count": 0.0,
- "routers_loss": 0.010525460354983807,
+ "routers_loss": 0.010444172658026218,
"skip_count": 2.0,
"step": 2620,
"text_loss": 0.4718937575817108
@@ -24907,13 +24907,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048583984375,
+ "grad_norm": 0.0478515625,
"learning_rate": 0.0008960713685302011,
- "loss": 0.0104,
+ "loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 4225383.0,
"repeat_count": 0.0,
- "routers_loss": 0.005284689832478762,
+ "routers_loss": 0.006409442983567715,
"skip_count": 1.0,
"step": 2622,
"text_loss": 0.30420538783073425
@@ -24926,13 +24926,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0284423828125,
+ "grad_norm": 0.02978515625,
"learning_rate": 0.0008958823836190588,
- "loss": 0.0051,
+ "loss": 0.005,
"macro_f1": 0.3272727429866791,
"num_tokens": 4228349.0,
"repeat_count": 0.0,
- "routers_loss": 0.011040215380489826,
+ "routers_loss": 0.009996986016631126,
"skip_count": 1.0,
"step": 2624,
"text_loss": 0.5392362475395203
@@ -24945,13 +24945,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.031494140625,
"learning_rate": 0.0008956932470050404,
"loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 4232007.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014406041009351611,
+ "routers_loss": 0.0014383369125425816,
"skip_count": 0.0,
"step": 2626,
"text_loss": 0.7112401127815247
@@ -24964,13 +24964,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0008955039587606233,
- "loss": 0.0111,
+ "loss": 0.0109,
"macro_f1": 0.6666666865348816,
"num_tokens": 4235122.0,
"repeat_count": 0.0,
- "routers_loss": 0.007106760982424021,
+ "routers_loss": 0.00781513936817646,
"skip_count": 3.0,
"step": 2628,
"text_loss": 0.17802883684635162
@@ -24983,13 +24983,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0400390625,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0008953145189583429,
- "loss": 0.0125,
+ "loss": 0.0126,
"macro_f1": 0.542222261428833,
"num_tokens": 4238248.0,
"repeat_count": 0.0,
- "routers_loss": 0.06423533707857132,
+ "routers_loss": 0.062252625823020935,
"skip_count": 4.0,
"step": 2630,
"text_loss": 0.5551572442054749
@@ -25002,13 +25002,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0008951249276707933,
- "loss": 0.012,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 4241042.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010294591775164008,
+ "routers_loss": 0.0011421777307987213,
"skip_count": 0.0,
"step": 2632,
"text_loss": 0.7092233896255493
@@ -25021,13 +25021,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0008949351849706261,
- "loss": 0.0122,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 4243939.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032732547260820866,
+ "routers_loss": 0.0032689040526747704,
"skip_count": 0.0,
"step": 2634,
"text_loss": 0.19925718009471893
@@ -25040,13 +25040,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0390625,
+ "grad_norm": 0.033935546875,
"learning_rate": 0.0008947452909305509,
- "loss": 0.0112,
+ "loss": 0.0109,
"macro_f1": 0.6666666865348816,
"num_tokens": 4247535.0,
"repeat_count": 1.0,
- "routers_loss": 0.0021109411027282476,
+ "routers_loss": 0.002066014800220728,
"skip_count": 0.0,
"step": 2636,
"text_loss": 0.5249715447425842
@@ -25059,13 +25059,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.11279296875,
+ "grad_norm": 0.09326171875,
"learning_rate": 0.0008945552456233356,
"loss": 0.0169,
"macro_f1": 0.8820862174034119,
"num_tokens": 4251441.0,
"repeat_count": 2.0,
- "routers_loss": 0.029545020312070847,
+ "routers_loss": 0.029332537204027176,
"skip_count": 2.0,
"step": 2638,
"text_loss": 0.19229578971862793
@@ -25078,13 +25078,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.078125,
"learning_rate": 0.0008943650491218058,
- "loss": 0.0083,
+ "loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4254314.0,
"repeat_count": 0.0,
- "routers_loss": 0.0075805820524692535,
+ "routers_loss": 0.0075911120511591434,
"skip_count": 0.0,
"step": 2640,
"text_loss": 0.27059751749038696
@@ -25097,13 +25097,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.0008941747014988453,
- "loss": 0.0155,
+ "loss": 0.0156,
"macro_f1": 0.3333333432674408,
"num_tokens": 4257442.0,
"repeat_count": 0.0,
- "routers_loss": 0.008832095190882683,
+ "routers_loss": 0.009030844084918499,
"skip_count": 0.0,
"step": 2642,
"text_loss": 0.36747801303863525
@@ -25116,13 +25116,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.123046875,
"learning_rate": 0.0008939842028273956,
- "loss": 0.011,
+ "loss": 0.0112,
"macro_f1": 0.6666666865348816,
"num_tokens": 4260386.0,
"repeat_count": 0.0,
- "routers_loss": 0.008952614851295948,
+ "routers_loss": 0.007844001986086369,
"skip_count": 1.0,
"step": 2644,
"text_loss": 0.6397647857666016
@@ -25135,13 +25135,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0250244140625,
+ "grad_norm": 0.0283203125,
"learning_rate": 0.0008937935531804562,
- "loss": 0.0075,
+ "loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 4263516.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017659157747402787,
+ "routers_loss": 0.0018789108144119382,
"skip_count": 0.0,
"step": 2646,
"text_loss": 0.4795534908771515
@@ -25154,13 +25154,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05419921875,
+ "grad_norm": 0.06494140625,
"learning_rate": 0.0008936027526310844,
- "loss": 0.0101,
+ "loss": 0.0098,
"macro_f1": 0.3272727429866791,
"num_tokens": 4266744.0,
"repeat_count": 0.0,
- "routers_loss": 0.03944230079650879,
+ "routers_loss": 0.0348590686917305,
"skip_count": 1.0,
"step": 2648,
"text_loss": 0.27691999077796936
@@ -25173,13 +25173,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.000893411801252395,
"loss": 0.015,
"macro_f1": 0.6666666865348816,
"num_tokens": 4269766.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037144431844353676,
+ "routers_loss": 0.004543309565633535,
"skip_count": 1.0,
"step": 2650,
"text_loss": 0.18867231905460358
@@ -25192,13 +25192,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0008932206991175615,
- "loss": 0.0143,
+ "loss": 0.0141,
"macro_f1": 0.6666666865348816,
"num_tokens": 4273513.0,
"repeat_count": 0.0,
- "routers_loss": 0.003659905167296529,
+ "routers_loss": 0.0035277456045150757,
"skip_count": 1.0,
"step": 2652,
"text_loss": 0.45613357424736023
@@ -25211,13 +25211,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.0008930294462998143,
"loss": 0.015,
"macro_f1": 0.6666666865348816,
"num_tokens": 4276878.0,
"repeat_count": 1.0,
- "routers_loss": 0.011676746420562267,
+ "routers_loss": 0.011337592266499996,
"skip_count": 0.0,
"step": 2654,
"text_loss": 0.24733254313468933
@@ -25230,13 +25230,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0008928380428724419,
- "loss": 0.0061,
+ "loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 4279915.0,
"repeat_count": 0.0,
- "routers_loss": 0.000998969655483961,
+ "routers_loss": 0.0010295971296727657,
"skip_count": 1.0,
"step": 2656,
"text_loss": 0.41722849011421204
@@ -25249,13 +25249,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0008926464889087903,
- "loss": 0.0109,
+ "loss": 0.0116,
"macro_f1": 0.6666666865348816,
"num_tokens": 4282888.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016260759439319372,
+ "routers_loss": 0.0017198545392602682,
"skip_count": 2.0,
"step": 2658,
"text_loss": 0.738322377204895
@@ -25268,13 +25268,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0008924547844822634,
- "loss": 0.0101,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 4285805.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010900370543822646,
+ "routers_loss": 0.001339946174994111,
"skip_count": 0.0,
"step": 2660,
"text_loss": 0.4802379906177521
@@ -25287,13 +25287,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.05322265625,
"learning_rate": 0.000892262929666323,
- "loss": 0.0101,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 4290282.0,
"repeat_count": 0.0,
- "routers_loss": 0.002275131642818451,
+ "routers_loss": 0.0022340165451169014,
"skip_count": 0.0,
"step": 2662,
"text_loss": 0.6503544449806213
@@ -25306,13 +25306,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0008920709245344878,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 4294106.0,
"repeat_count": 0.0,
- "routers_loss": 0.00575100164860487,
+ "routers_loss": 0.005288850050419569,
"skip_count": 1.0,
"step": 2664,
"text_loss": 0.12312037497758865
@@ -25325,13 +25325,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.038330078125,
+ "grad_norm": 0.041259765625,
"learning_rate": 0.0008918787691603347,
- "loss": 0.0122,
+ "loss": 0.0121,
"macro_f1": 0.6666666865348816,
"num_tokens": 4298013.0,
"repeat_count": 0.0,
- "routers_loss": 0.004139711149036884,
+ "routers_loss": 0.004259659443050623,
"skip_count": 1.0,
"step": 2666,
"text_loss": 0.3070000112056732
@@ -25344,13 +25344,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.000891686463617498,
- "loss": 0.0072,
+ "loss": 0.0069,
"macro_f1": 0.6666666865348816,
"num_tokens": 4300799.0,
"repeat_count": 0.0,
- "routers_loss": 0.008856390602886677,
+ "routers_loss": 0.009489355608820915,
"skip_count": 1.0,
"step": 2668,
"text_loss": 0.18535588681697845
@@ -25363,13 +25363,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0576171875,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.0008914940079796696,
- "loss": 0.0116,
+ "loss": 0.0114,
"macro_f1": 0.3333333432674408,
"num_tokens": 4304641.0,
"repeat_count": 0.0,
- "routers_loss": 0.002438562922179699,
+ "routers_loss": 0.0025417013093829155,
"skip_count": 0.0,
"step": 2670,
"text_loss": 0.482585072517395
@@ -25382,13 +25382,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0008913014023205988,
"loss": 0.0108,
"macro_f1": 0.3333333432674408,
"num_tokens": 4307462.0,
"repeat_count": 0.0,
- "routers_loss": 0.006435772404074669,
+ "routers_loss": 0.006371749565005302,
"skip_count": 0.0,
"step": 2672,
"text_loss": 0.7064456939697266
@@ -25401,13 +25401,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0008911086467140925,
- "loss": 0.0069,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 4310396.0,
"repeat_count": 0.0,
- "routers_loss": 0.002773779444396496,
+ "routers_loss": 0.0027512952219694853,
"skip_count": 0.0,
"step": 2674,
"text_loss": 0.23532851040363312
@@ -25420,13 +25420,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.05712890625,
"learning_rate": 0.000890915741234015,
- "loss": 0.0135,
+ "loss": 0.0133,
"macro_f1": 0.6666666865348816,
"num_tokens": 4314781.0,
"repeat_count": 0.0,
- "routers_loss": 0.00862761028110981,
+ "routers_loss": 0.008253013715147972,
"skip_count": 1.0,
"step": 2676,
"text_loss": 0.30950358510017395
@@ -25439,13 +25439,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033203125,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0008907226859542879,
- "loss": 0.0104,
+ "loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 4317988.0,
"repeat_count": 0.0,
- "routers_loss": 0.005587176885455847,
+ "routers_loss": 0.005409995559602976,
"skip_count": 2.0,
"step": 2678,
"text_loss": 0.4930732846260071
@@ -25458,13 +25458,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.042236328125,
+ "grad_norm": 0.060546875,
"learning_rate": 0.0008905294809488907,
- "loss": 0.0082,
+ "loss": 0.0084,
"macro_f1": 1.0,
"num_tokens": 4321014.0,
"repeat_count": 1.0,
- "routers_loss": 0.0033104203175753355,
+ "routers_loss": 0.0029942214023321867,
"skip_count": 1.0,
"step": 2680,
"text_loss": 0.6224040389060974
@@ -25477,13 +25477,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0008903361262918595,
- "loss": 0.0117,
+ "loss": 0.0115,
"macro_f1": 0.6666666865348816,
"num_tokens": 4324268.0,
"repeat_count": 0.0,
- "routers_loss": 0.008205405436456203,
+ "routers_loss": 0.008411120623350143,
"skip_count": 1.0,
"step": 2682,
"text_loss": 0.16296671330928802
@@ -25496,13 +25496,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.052734375,
+ "grad_norm": 0.05126953125,
"learning_rate": 0.0008901426220572884,
- "loss": 0.0142,
+ "loss": 0.0138,
"macro_f1": 1.0,
"num_tokens": 4327494.0,
"repeat_count": 2.0,
- "routers_loss": 0.007884894497692585,
+ "routers_loss": 0.01039006095379591,
"skip_count": 4.0,
"step": 2684,
"text_loss": 0.43866512179374695
@@ -25515,13 +25515,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.060791015625,
"learning_rate": 0.0008899489683193286,
- "loss": 0.011,
+ "loss": 0.0107,
"macro_f1": 0.3333333432674408,
"num_tokens": 4330936.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009336905204690993,
+ "routers_loss": 0.0009329111780971289,
"skip_count": 0.0,
"step": 2686,
"text_loss": 0.44250962138175964
@@ -25534,13 +25534,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0810546875,
+ "grad_norm": 0.07421875,
"learning_rate": 0.0008897551651521885,
"loss": 0.0111,
"macro_f1": 0.3333333432674408,
"num_tokens": 4334123.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033622782211750746,
+ "routers_loss": 0.003197216661646962,
"skip_count": 0.0,
"step": 2688,
"text_loss": 0.48313501477241516
@@ -25553,13 +25553,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07763671875,
+ "grad_norm": 0.09716796875,
"learning_rate": 0.0008895612126301339,
"loss": 0.0157,
"macro_f1": 0.3333333432674408,
"num_tokens": 4337610.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034563415683805943,
+ "routers_loss": 0.0033548236824572086,
"skip_count": 0.0,
"step": 2690,
"text_loss": 0.4715327322483063
@@ -25572,13 +25572,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0008893671108274877,
- "loss": 0.0115,
+ "loss": 0.0118,
"macro_f1": 0.3333333432674408,
"num_tokens": 4341026.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022277699317783117,
+ "routers_loss": 0.0024757643695920706,
"skip_count": 0.0,
"step": 2692,
"text_loss": 0.43402785062789917
@@ -25591,13 +25591,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0008891728598186302,
- "loss": 0.011,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 4344422.0,
"repeat_count": 0.0,
- "routers_loss": 0.003892304375767708,
+ "routers_loss": 0.003317243419587612,
"skip_count": 0.0,
"step": 2694,
"text_loss": 0.8498559594154358
@@ -25610,13 +25610,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.0380859375,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0008889784596779986,
- "loss": 0.0092,
+ "loss": 0.009,
"macro_f1": 0.5934640765190125,
"num_tokens": 4347507.0,
"repeat_count": 0.0,
- "routers_loss": 0.015058296732604504,
+ "routers_loss": 0.01577926240861416,
"skip_count": 3.0,
"step": 2696,
"text_loss": 0.5646669864654541
@@ -25629,13 +25629,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.10546875,
+ "grad_norm": 0.11328125,
"learning_rate": 0.0008887839104800876,
- "loss": 0.0118,
+ "loss": 0.0124,
"macro_f1": 0.3333333432674408,
"num_tokens": 4350414.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033561652526259422,
+ "routers_loss": 0.002953822258859873,
"skip_count": 0.0,
"step": 2698,
"text_loss": 0.5145012140274048
@@ -25648,13 +25648,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.0008885892122994486,
- "loss": 0.0116,
+ "loss": 0.0112,
"macro_f1": 0.3333333432674408,
"num_tokens": 4354110.0,
"repeat_count": 0.0,
- "routers_loss": 0.0062471418641507626,
+ "routers_loss": 0.005849295295774937,
"skip_count": 0.0,
"step": 2700,
"text_loss": 0.580982506275177
@@ -25667,13 +25667,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008883943652106903,
"loss": 0.0086,
"macro_f1": 1.0,
"num_tokens": 4357323.0,
"repeat_count": 1.0,
- "routers_loss": 0.011802209541201591,
+ "routers_loss": 0.012347398325800896,
"skip_count": 2.0,
"step": 2702,
"text_loss": 0.2234988808631897
@@ -25686,13 +25686,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06982421875,
+ "grad_norm": 0.0673828125,
"learning_rate": 0.0008881993692884787,
- "loss": 0.0132,
+ "loss": 0.0128,
"macro_f1": 0.6666666865348816,
"num_tokens": 4360228.0,
"repeat_count": 0.0,
- "routers_loss": 0.0041528744623064995,
+ "routers_loss": 0.003574999049305916,
"skip_count": 1.0,
"step": 2704,
"text_loss": 0.4261806607246399
@@ -25705,13 +25705,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0008880042246075365,
- "loss": 0.0094,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 4363905.0,
"repeat_count": 0.0,
- "routers_loss": 0.003151095937937498,
+ "routers_loss": 0.0031574300955981016,
"skip_count": 0.0,
"step": 2706,
"text_loss": 0.691118061542511
@@ -25724,13 +25724,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008878089312426433,
"loss": 0.0091,
"macro_f1": 0.3333333432674408,
"num_tokens": 4366736.0,
"repeat_count": 0.0,
- "routers_loss": 0.003142676781862974,
+ "routers_loss": 0.003195564029738307,
"skip_count": 0.0,
"step": 2708,
"text_loss": 0.613926112651825
@@ -25743,13 +25743,13 @@
"f1_execute": 0.9583333134651184,
"f1_repeat": 0.0,
"f1_skip": 0.75,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.054443359375,
"learning_rate": 0.0008876134892686363,
"loss": 0.011,
"macro_f1": 0.5694444179534912,
"num_tokens": 4370146.0,
"repeat_count": 0.0,
- "routers_loss": 0.032964516431093216,
+ "routers_loss": 0.038784291595220566,
"skip_count": 5.0,
"step": 2710,
"text_loss": 0.2723451852798462
@@ -25762,13 +25762,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.0830078125,
"learning_rate": 0.000887417898760409,
- "loss": 0.0123,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 4373653.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006848900229670107,
+ "routers_loss": 0.0006457131239585578,
"skip_count": 0.0,
"step": 2712,
"text_loss": 0.31667640805244446
@@ -25781,13 +25781,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.10498046875,
"learning_rate": 0.000887222159792912,
- "loss": 0.0156,
+ "loss": 0.0155,
"macro_f1": 0.6603773832321167,
"num_tokens": 4376993.0,
"repeat_count": 1.0,
- "routers_loss": 0.04388813674449921,
+ "routers_loss": 0.045078590512275696,
"skip_count": 1.0,
"step": 2714,
"text_loss": 0.5872798562049866
@@ -25800,13 +25800,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.034912109375,
"learning_rate": 0.0008870262724411528,
- "loss": 0.0122,
+ "loss": 0.012,
"macro_f1": 0.3333333432674408,
"num_tokens": 4380160.0,
"repeat_count": 0.0,
- "routers_loss": 0.003538437420502305,
+ "routers_loss": 0.003628545207902789,
"skip_count": 0.0,
"step": 2716,
"text_loss": 0.7468157410621643
@@ -25819,13 +25819,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.1328125,
+ "grad_norm": 0.11181640625,
"learning_rate": 0.0008868302367801962,
- "loss": 0.0123,
+ "loss": 0.0118,
"macro_f1": 0.6598639488220215,
"num_tokens": 4383100.0,
"repeat_count": 1.0,
- "routers_loss": 0.05479869619011879,
+ "routers_loss": 0.05404464527964592,
"skip_count": 3.0,
"step": 2718,
"text_loss": 0.2970244884490967
@@ -25838,13 +25838,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0008866340528851629,
"loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 4386700.0,
"repeat_count": 0.0,
- "routers_loss": 0.0070296903140842915,
+ "routers_loss": 0.007000274024903774,
"skip_count": 0.0,
"step": 2720,
"text_loss": 0.34521186351776123
@@ -25857,13 +25857,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05810546875,
+ "grad_norm": 0.052978515625,
"learning_rate": 0.0008864377208312313,
- "loss": 0.0085,
+ "loss": 0.0082,
"macro_f1": 0.8823530077934265,
"num_tokens": 4390299.0,
"repeat_count": 1.0,
- "routers_loss": 0.02051853947341442,
+ "routers_loss": 0.02025366574525833,
"skip_count": 2.0,
"step": 2722,
"text_loss": 1.0536936521530151
@@ -25876,13 +25876,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.000886241240693636,
- "loss": 0.0096,
+ "loss": 0.0098,
"macro_f1": 0.3333333432674408,
"num_tokens": 4393353.0,
"repeat_count": 0.0,
- "routers_loss": 0.002662461483851075,
+ "routers_loss": 0.00251673418097198,
"skip_count": 0.0,
"step": 2724,
"text_loss": 0.5678093433380127
@@ -25895,13 +25895,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.05615234375,
+ "grad_norm": 0.052001953125,
"learning_rate": 0.0008860446125476686,
"loss": 0.0135,
"macro_f1": 0.6666666865348816,
"num_tokens": 4396446.0,
"repeat_count": 1.0,
- "routers_loss": 0.009321866557002068,
+ "routers_loss": 0.009532532654702663,
"skip_count": 0.0,
"step": 2726,
"text_loss": 0.23775041103363037
@@ -25914,13 +25914,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.109375,
+ "grad_norm": 0.091796875,
"learning_rate": 0.0008858478364686776,
- "loss": 0.0102,
+ "loss": 0.0099,
"macro_f1": 0.6666666865348816,
"num_tokens": 4399977.0,
"repeat_count": 1.0,
- "routers_loss": 0.01029124017804861,
+ "routers_loss": 0.008062181062996387,
"skip_count": 0.0,
"step": 2728,
"text_loss": 0.18888695538043976
@@ -25933,13 +25933,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.0008856509125320678,
- "loss": 0.0082,
+ "loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 4404406.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008023424888961017,
+ "routers_loss": 0.0007731119985692203,
"skip_count": 0.0,
"step": 2730,
"text_loss": 0.47331541776657104
@@ -25952,13 +25952,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0517578125,
+ "grad_norm": 0.0498046875,
"learning_rate": 0.0008854538408133006,
- "loss": 0.0115,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 4407165.0,
"repeat_count": 0.0,
- "routers_loss": 0.003058656118810177,
+ "routers_loss": 0.003115242812782526,
"skip_count": 1.0,
"step": 2732,
"text_loss": 0.491370290517807
@@ -25971,13 +25971,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0008852566213878947,
- "loss": 0.0082,
+ "loss": 0.0081,
"macro_f1": 0.3333333432674408,
"num_tokens": 4410101.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010282890871167183,
+ "routers_loss": 0.0008958528051152825,
"skip_count": 0.0,
"step": 2734,
"text_loss": 0.42188262939453125
@@ -25990,13 +25990,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.07421875,
+ "grad_norm": 0.07763671875,
"learning_rate": 0.0008850592543314246,
- "loss": 0.0123,
+ "loss": 0.0118,
"macro_f1": 1.0,
"num_tokens": 4413015.0,
"repeat_count": 1.0,
- "routers_loss": 0.014785367995500565,
+ "routers_loss": 0.01139112375676632,
"skip_count": 1.0,
"step": 2736,
"text_loss": 0.4716498553752899
@@ -26009,13 +26009,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0654296875,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0008848617397195218,
- "loss": 0.0089,
+ "loss": 0.0084,
"macro_f1": 0.6603773832321167,
"num_tokens": 4416404.0,
"repeat_count": 1.0,
- "routers_loss": 0.017717093229293823,
+ "routers_loss": 0.01609630137681961,
"skip_count": 1.0,
"step": 2738,
"text_loss": 0.19490821659564972
@@ -26028,13 +26028,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0008846640776278745,
- "loss": 0.0067,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 4419408.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011861984385177493,
+ "routers_loss": 0.001489170710556209,
"skip_count": 0.0,
"step": 2740,
"text_loss": 0.6443108320236206
@@ -26047,13 +26047,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0008844662681322269,
"loss": 0.0144,
"macro_f1": 0.6666666865348816,
"num_tokens": 4422067.0,
"repeat_count": 1.0,
- "routers_loss": 0.0013843412743881345,
+ "routers_loss": 0.0014755792217329144,
"skip_count": 0.0,
"step": 2742,
"text_loss": 0.9150356650352478
@@ -26068,11 +26068,11 @@
"f1_skip": 1.0,
"grad_norm": 0.05078125,
"learning_rate": 0.0008842683113083801,
- "loss": 0.0154,
+ "loss": 0.0149,
"macro_f1": 0.6666666865348816,
"num_tokens": 4425647.0,
"repeat_count": 0.0,
- "routers_loss": 0.010318896733224392,
+ "routers_loss": 0.008962674997746944,
"skip_count": 1.0,
"step": 2744,
"text_loss": 0.7103227972984314
@@ -26085,13 +26085,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07861328125,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0008840702072321915,
- "loss": 0.0108,
+ "loss": 0.0104,
"macro_f1": 0.6598639488220215,
"num_tokens": 4428855.0,
"repeat_count": 1.0,
- "routers_loss": 0.029359478503465652,
+ "routers_loss": 0.02554207295179367,
"skip_count": 3.0,
"step": 2746,
"text_loss": 0.27141591906547546
@@ -26104,13 +26104,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0234375,
+ "grad_norm": 0.0230712890625,
"learning_rate": 0.0008838719559795751,
"loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 4432838.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014995118835940957,
+ "routers_loss": 0.0011747616808861494,
"skip_count": 0.0,
"step": 2748,
"text_loss": 0.4007738530635834
@@ -26123,13 +26123,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.03515625,
+ "grad_norm": 0.03466796875,
"learning_rate": 0.0008836735576265009,
- "loss": 0.0074,
+ "loss": 0.0073,
"macro_f1": 0.5492662787437439,
"num_tokens": 4435793.0,
"repeat_count": 0.0,
- "routers_loss": 0.017950648441910744,
+ "routers_loss": 0.017564335837960243,
"skip_count": 2.0,
"step": 2750,
"text_loss": 0.5972410440444946
@@ -26142,13 +26142,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.044921875,
"learning_rate": 0.0008834750122489956,
- "loss": 0.0083,
+ "loss": 0.0086,
"macro_f1": 0.6666666865348816,
"num_tokens": 4438871.0,
"repeat_count": 1.0,
- "routers_loss": 0.0069067892618477345,
+ "routers_loss": 0.007004009559750557,
"skip_count": 0.0,
"step": 2752,
"text_loss": 0.2294853925704956
@@ -26161,13 +26161,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051513671875,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0008832763199231423,
- "loss": 0.0101,
+ "loss": 0.0107,
"macro_f1": 0.3333333432674408,
"num_tokens": 4441846.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013944554375484586,
+ "routers_loss": 0.0014562139986082911,
"skip_count": 0.0,
"step": 2754,
"text_loss": 0.722432017326355
@@ -26180,13 +26180,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0008830774807250802,
"loss": 0.013,
"macro_f1": 0.3272727429866791,
"num_tokens": 4444786.0,
"repeat_count": 1.0,
- "routers_loss": 0.025158623233437538,
+ "routers_loss": 0.024773593991994858,
"skip_count": 0.0,
"step": 2756,
"text_loss": 0.507905125617981
@@ -26199,13 +26199,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.05419921875,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0008828784947310049,
- "loss": 0.0131,
+ "loss": 0.0129,
"macro_f1": 0.8823530077934265,
"num_tokens": 4448442.0,
"repeat_count": 1.0,
- "routers_loss": 0.05205477401614189,
+ "routers_loss": 0.04959975928068161,
"skip_count": 2.0,
"step": 2758,
"text_loss": 0.3617522418498993
@@ -26218,13 +26218,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.10791015625,
+ "grad_norm": 0.1025390625,
"learning_rate": 0.000882679362017168,
"loss": 0.0149,
"macro_f1": 1.0,
"num_tokens": 4451401.0,
"repeat_count": 1.0,
- "routers_loss": 0.005898742936551571,
+ "routers_loss": 0.005783245898783207,
"skip_count": 2.0,
"step": 2760,
"text_loss": 0.49187400937080383
@@ -26237,13 +26237,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0008824800826598778,
- "loss": 0.0129,
+ "loss": 0.0127,
"macro_f1": 0.3333333432674408,
"num_tokens": 4454537.0,
"repeat_count": 0.0,
- "routers_loss": 0.006758298724889755,
+ "routers_loss": 0.00656260596588254,
"skip_count": 0.0,
"step": 2762,
"text_loss": 0.6823583245277405
@@ -26256,13 +26256,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.0546875,
"learning_rate": 0.0008822806567354983,
- "loss": 0.0109,
+ "loss": 0.0111,
"macro_f1": 0.6666666865348816,
"num_tokens": 4457706.0,
"repeat_count": 1.0,
- "routers_loss": 0.005730919074267149,
+ "routers_loss": 0.005298966076225042,
"skip_count": 0.0,
"step": 2764,
"text_loss": 0.554322361946106
@@ -26275,13 +26275,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051025390625,
+ "grad_norm": 0.046630859375,
"learning_rate": 0.0008820810843204501,
- "loss": 0.0098,
+ "loss": 0.0096,
"macro_f1": 0.3272727429866791,
"num_tokens": 4460710.0,
"repeat_count": 0.0,
- "routers_loss": 0.03390989825129509,
+ "routers_loss": 0.03164982795715332,
"skip_count": 1.0,
"step": 2766,
"text_loss": 0.1656961441040039
@@ -26294,13 +26294,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0849609375,
+ "grad_norm": 0.072265625,
"learning_rate": 0.0008818813654912095,
- "loss": 0.0165,
+ "loss": 0.0162,
"macro_f1": 0.3333333432674408,
"num_tokens": 4464001.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007058497285470366,
+ "routers_loss": 0.000715116853825748,
"skip_count": 0.0,
"step": 2768,
"text_loss": 0.5818144083023071
@@ -26313,13 +26313,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.058837890625,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0008816815003243093,
- "loss": 0.0136,
+ "loss": 0.0133,
"macro_f1": 0.3333333432674408,
"num_tokens": 4467364.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027468691114336252,
+ "routers_loss": 0.002851625671610236,
"skip_count": 0.0,
"step": 2770,
"text_loss": 0.6068631410598755
@@ -26332,13 +26332,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.033203125,
"learning_rate": 0.0008814814888963383,
"loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 4470681.0,
"repeat_count": 0.0,
- "routers_loss": 0.00443003186956048,
+ "routers_loss": 0.004729873035103083,
"skip_count": 1.0,
"step": 2772,
"text_loss": 0.5386646389961243
@@ -26351,13 +26351,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.04296875,
"learning_rate": 0.000881281331283941,
"loss": 0.0091,
"macro_f1": 0.6666666865348816,
"num_tokens": 4473734.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031219064258038998,
+ "routers_loss": 0.0031853127293288708,
"skip_count": 1.0,
"step": 2774,
"text_loss": 0.5695263147354126
@@ -26370,13 +26370,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03369140625,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0008810810275638182,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 4478404.0,
"repeat_count": 0.0,
- "routers_loss": 0.000846695271320641,
+ "routers_loss": 0.0008977465913631022,
"skip_count": 0.0,
"step": 2776,
"text_loss": 0.4750773310661316
@@ -26389,13 +26389,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0008808805778127269,
- "loss": 0.0075,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 4481287.0,
"repeat_count": 0.0,
- "routers_loss": 0.0074167875573039055,
+ "routers_loss": 0.00469845999032259,
"skip_count": 0.0,
"step": 2778,
"text_loss": 0.14078612625598907
@@ -26408,13 +26408,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.04296875,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.0008806799821074796,
- "loss": 0.0078,
+ "loss": 0.0079,
"macro_f1": 0.5492662787437439,
"num_tokens": 4483929.0,
"repeat_count": 0.0,
- "routers_loss": 0.018358726054430008,
+ "routers_loss": 0.01789761893451214,
"skip_count": 2.0,
"step": 2780,
"text_loss": 0.2167191207408905
@@ -26427,13 +26427,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0008804792405249451,
- "loss": 0.0124,
+ "loss": 0.0123,
"macro_f1": 0.3333333432674408,
"num_tokens": 4487468.0,
"repeat_count": 0.0,
- "routers_loss": 0.001094152103178203,
+ "routers_loss": 0.001018838956952095,
"skip_count": 0.0,
"step": 2782,
"text_loss": 0.5424665212631226
@@ -26446,13 +26446,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.000880278353142048,
- "loss": 0.0075,
+ "loss": 0.0077,
"macro_f1": 0.8200000524520874,
"num_tokens": 4490942.0,
"repeat_count": 1.0,
- "routers_loss": 0.03035641834139824,
+ "routers_loss": 0.03260354697704315,
"skip_count": 3.0,
"step": 2784,
"text_loss": 0.20994654297828674
@@ -26465,13 +26465,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05517578125,
+ "grad_norm": 0.05322265625,
"learning_rate": 0.0008800773200357683,
- "loss": 0.0123,
+ "loss": 0.0122,
"macro_f1": 0.3333333432674408,
"num_tokens": 4493986.0,
"repeat_count": 0.0,
- "routers_loss": 0.002394269686192274,
+ "routers_loss": 0.003019835101440549,
"skip_count": 0.0,
"step": 2786,
"text_loss": 0.5709528923034668
@@ -26484,13 +26484,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.034423828125,
"learning_rate": 0.0008798761412831429,
"loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 4498232.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028274122159928083,
+ "routers_loss": 0.00285192858427763,
"skip_count": 0.0,
"step": 2788,
"text_loss": 0.5103896260261536
@@ -26503,13 +26503,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.044921875,
"learning_rate": 0.0008796748169612634,
- "loss": 0.0088,
+ "loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 4501231.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012642849469557405,
+ "routers_loss": 0.0012469831854104996,
"skip_count": 0.0,
"step": 2790,
"text_loss": 0.43669697642326355
@@ -26522,13 +26522,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03662109375,
+ "grad_norm": 0.039794921875,
"learning_rate": 0.0008794733471472778,
"loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 4504208.0,
"repeat_count": 0.0,
- "routers_loss": 0.010966303758323193,
+ "routers_loss": 0.011512776836752892,
"skip_count": 1.0,
"step": 2792,
"text_loss": 0.2299770563840866
@@ -26541,13 +26541,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.03564453125,
"learning_rate": 0.0008792717319183899,
- "loss": 0.0064,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 4507013.0,
"repeat_count": 0.0,
- "routers_loss": 0.008194026537239552,
+ "routers_loss": 0.00834917277097702,
"skip_count": 0.0,
"step": 2794,
"text_loss": 0.2130603939294815
@@ -26560,13 +26560,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0283203125,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0008790699713518587,
- "loss": 0.008,
+ "loss": 0.0078,
"macro_f1": 0.6666666865348816,
"num_tokens": 4510286.0,
"repeat_count": 0.0,
- "routers_loss": 0.008828429505228996,
+ "routers_loss": 0.008616939187049866,
"skip_count": 2.0,
"step": 2796,
"text_loss": 0.4377101957798004
@@ -26579,13 +26579,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.02783203125,
"learning_rate": 0.0008788680655249994,
- "loss": 0.007,
+ "loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 4513762.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038230866193771362,
+ "routers_loss": 0.003408568911254406,
"skip_count": 0.0,
"step": 2798,
"text_loss": 0.435138463973999
@@ -26598,13 +26598,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0311279296875,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0008786660145151826,
- "loss": 0.009,
+ "loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 4516696.0,
"repeat_count": 1.0,
- "routers_loss": 0.0031088131945580244,
+ "routers_loss": 0.0029398901388049126,
"skip_count": 0.0,
"step": 2800,
"text_loss": 0.3195655047893524
@@ -26617,13 +26617,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.033203125,
"learning_rate": 0.0008784638183998348,
- "loss": 0.0083,
+ "loss": 0.0081,
"macro_f1": 0.3333333432674408,
"num_tokens": 4519760.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014194221002981067,
+ "routers_loss": 0.0013777425047010183,
"skip_count": 0.0,
"step": 2802,
"text_loss": 0.8129430413246155
@@ -26636,13 +26636,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.0008782614772564379,
- "loss": 0.0099,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 4522106.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031931858975440264,
+ "routers_loss": 0.0031694830395281315,
"skip_count": 0.0,
"step": 2804,
"text_loss": 0.18083660304546356
@@ -26655,13 +26655,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0008780589911625293,
- "loss": 0.0117,
+ "loss": 0.0114,
"macro_f1": 0.3333333432674408,
"num_tokens": 4525743.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021834284998476505,
+ "routers_loss": 0.002161208540201187,
"skip_count": 0.0,
"step": 2806,
"text_loss": 0.8228182792663574
@@ -26674,13 +26674,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0703125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0008778563601957021,
- "loss": 0.0098,
+ "loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 4529573.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035390176344662905,
+ "routers_loss": 0.0028444856870919466,
"skip_count": 1.0,
"step": 2808,
"text_loss": 0.3715563118457794
@@ -26693,13 +26693,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04296875,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0008776535844336049,
- "loss": 0.0095,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 4532452.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038604713045060635,
+ "routers_loss": 0.003807213855907321,
"skip_count": 0.0,
"step": 2810,
"text_loss": 0.6012523174285889
@@ -26712,13 +26712,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0008774506639539417,
- "loss": 0.0072,
+ "loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 4536077.0,
"repeat_count": 0.0,
- "routers_loss": 0.00669970503076911,
+ "routers_loss": 0.006698979996144772,
"skip_count": 0.0,
"step": 2812,
"text_loss": 0.27097949385643005
@@ -26731,13 +26731,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0008772475988344722,
- "loss": 0.0132,
+ "loss": 0.013,
"macro_f1": 0.6666666865348816,
"num_tokens": 4539057.0,
"repeat_count": 0.0,
- "routers_loss": 0.004594485275447369,
+ "routers_loss": 0.004849409218877554,
"skip_count": 1.0,
"step": 2814,
"text_loss": 1.026973843574524
@@ -26750,13 +26750,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0008770443891530109,
- "loss": 0.0116,
+ "loss": 0.0115,
"macro_f1": 0.5934640765190125,
"num_tokens": 4542253.0,
"repeat_count": 0.0,
- "routers_loss": 0.01891930215060711,
+ "routers_loss": 0.019148651510477066,
"skip_count": 3.0,
"step": 2816,
"text_loss": 0.2717585563659668
@@ -26769,13 +26769,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.054931640625,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0008768410349874286,
"loss": 0.0098,
"macro_f1": 0.6601307392120361,
"num_tokens": 4545047.0,
"repeat_count": 1.0,
- "routers_loss": 0.0247862096875906,
+ "routers_loss": 0.02231316640973091,
"skip_count": 2.0,
"step": 2818,
"text_loss": 0.274346262216568
@@ -26788,13 +26788,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0008766375364156508,
"loss": 0.0091,
"macro_f1": 0.6666666865348816,
"num_tokens": 4548371.0,
"repeat_count": 0.0,
- "routers_loss": 0.008566800504922867,
+ "routers_loss": 0.008014129474759102,
"skip_count": 2.0,
"step": 2820,
"text_loss": 0.22850871086120605
@@ -26807,13 +26807,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041748046875,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.0008764338935156586,
"loss": 0.0095,
"macro_f1": 0.3333333432674408,
"num_tokens": 4551276.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013546474510803819,
+ "routers_loss": 0.0014544493751600385,
"skip_count": 0.0,
"step": 2822,
"text_loss": 0.6308462023735046
@@ -26826,13 +26826,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.0390625,
"learning_rate": 0.000876230106365488,
- "loss": 0.0122,
+ "loss": 0.0123,
"macro_f1": 0.6666666865348816,
"num_tokens": 4554143.0,
"repeat_count": 0.0,
- "routers_loss": 0.009204468689858913,
+ "routers_loss": 0.00818584579974413,
"skip_count": 3.0,
"step": 2824,
"text_loss": 0.3484207093715668
@@ -26845,13 +26845,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03271484375,
+ "grad_norm": 0.0264892578125,
"learning_rate": 0.0008760261750432312,
- "loss": 0.0067,
+ "loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 4557256.0,
"repeat_count": 0.0,
- "routers_loss": 0.00787584763020277,
+ "routers_loss": 0.006275608204305172,
"skip_count": 3.0,
"step": 2826,
"text_loss": 0.1927330046892166
@@ -26864,13 +26864,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0008758220996270348,
- "loss": 0.0102,
+ "loss": 0.0103,
"macro_f1": 1.0,
"num_tokens": 4560202.0,
"repeat_count": 2.0,
- "routers_loss": 0.0057869357988238335,
+ "routers_loss": 0.0055974251590669155,
"skip_count": 2.0,
"step": 2828,
"text_loss": 0.7796496748924255
@@ -26883,13 +26883,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.046142578125,
"learning_rate": 0.0008756178801951007,
- "loss": 0.0128,
+ "loss": 0.0129,
"macro_f1": 0.3333333432674408,
"num_tokens": 4563508.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018274546600878239,
+ "routers_loss": 0.0019799957517534494,
"skip_count": 0.0,
"step": 2830,
"text_loss": 0.49633297324180603
@@ -26902,13 +26902,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.0458984375,
"learning_rate": 0.0008754135168256865,
- "loss": 0.0094,
+ "loss": 0.0095,
"macro_f1": 0.3333333432674408,
"num_tokens": 4566776.0,
"repeat_count": 0.0,
- "routers_loss": 0.004527154844254255,
+ "routers_loss": 0.004538947716355324,
"skip_count": 0.0,
"step": 2832,
"text_loss": 0.5346745252609253
@@ -26921,13 +26921,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0008752090095971044,
"loss": 0.0091,
"macro_f1": 0.3333333432674408,
"num_tokens": 4569787.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018263199599459767,
+ "routers_loss": 0.001663343166001141,
"skip_count": 0.0,
"step": 2834,
"text_loss": 0.5524004697799683
@@ -26940,13 +26940,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.000875004358587722,
- "loss": 0.0088,
+ "loss": 0.0087,
"macro_f1": 0.3333333432674408,
"num_tokens": 4572813.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022649941965937614,
+ "routers_loss": 0.0022988212294876575,
"skip_count": 0.0,
"step": 2836,
"text_loss": 0.4232870042324066
@@ -26959,13 +26959,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.038330078125,
"learning_rate": 0.000874799563875962,
"loss": 0.0083,
"macro_f1": 0.6666666865348816,
"num_tokens": 4575563.0,
"repeat_count": 0.0,
- "routers_loss": 0.00791149027645588,
+ "routers_loss": 0.007781553082168102,
"skip_count": 1.0,
"step": 2838,
"text_loss": 0.19239822030067444
@@ -26978,13 +26978,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.03515625,
"learning_rate": 0.0008745946255403021,
"loss": 0.0072,
"macro_f1": 0.5492662787437439,
"num_tokens": 4578117.0,
"repeat_count": 0.0,
- "routers_loss": 0.016813624650239944,
+ "routers_loss": 0.01872488670051098,
"skip_count": 2.0,
"step": 2840,
"text_loss": 0.2148810178041458
@@ -26999,11 +26999,11 @@
"f1_skip": 1.0,
"grad_norm": 0.04296875,
"learning_rate": 0.0008743895436592749,
- "loss": 0.0079,
+ "loss": 0.0078,
"macro_f1": 1.0,
"num_tokens": 4582330.0,
"repeat_count": 1.0,
- "routers_loss": 0.004429332446306944,
+ "routers_loss": 0.005634195636957884,
"skip_count": 1.0,
"step": 2842,
"text_loss": 0.4929640591144562
@@ -27016,13 +27016,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.048583984375,
"learning_rate": 0.0008741843183114685,
- "loss": 0.0084,
+ "loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4585765.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007147722644731402,
+ "routers_loss": 0.0008928569150157273,
"skip_count": 0.0,
"step": 2844,
"text_loss": 0.32702967524528503
@@ -27035,13 +27035,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.044189453125,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0008739789495755253,
- "loss": 0.0092,
+ "loss": 0.0094,
"macro_f1": 0.6666666865348816,
"num_tokens": 4589000.0,
"repeat_count": 0.0,
- "routers_loss": 0.015438012778759003,
+ "routers_loss": 0.014715569093823433,
"skip_count": 4.0,
"step": 2846,
"text_loss": 0.25125816464424133
@@ -27054,13 +27054,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.0008737734375301433,
- "loss": 0.0138,
+ "loss": 0.0135,
"macro_f1": 0.3333333432674408,
"num_tokens": 4592391.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015892626252025366,
+ "routers_loss": 0.0017551190685480833,
"skip_count": 0.0,
"step": 2848,
"text_loss": 0.6595172882080078
@@ -27073,13 +27073,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.027099609375,
"learning_rate": 0.0008735677822540749,
- "loss": 0.0086,
+ "loss": 0.0085,
"macro_f1": 0.3333333432674408,
"num_tokens": 4596662.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006934175617061555,
+ "routers_loss": 0.0006456313421949744,
"skip_count": 0.0,
"step": 2850,
"text_loss": 0.6290773153305054
@@ -27092,13 +27092,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.0008733619838261276,
"loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 4599682.0,
"repeat_count": 0.0,
- "routers_loss": 0.006811433006078005,
+ "routers_loss": 0.00765060493722558,
"skip_count": 2.0,
"step": 2852,
"text_loss": 0.3268161416053772
@@ -27111,13 +27111,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0008731560423251637,
- "loss": 0.0104,
+ "loss": 0.01,
"macro_f1": 1.0,
"num_tokens": 4603324.0,
"repeat_count": 1.0,
- "routers_loss": 0.012574959546327591,
+ "routers_loss": 0.01161442045122385,
"skip_count": 2.0,
"step": 2854,
"text_loss": 0.3029932975769043
@@ -27130,13 +27130,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 1.0,
"f1_skip": 0.888888955116272,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008729499578301005,
"loss": 0.0098,
"macro_f1": 0.9555556178092957,
"num_tokens": 4606975.0,
"repeat_count": 1.0,
- "routers_loss": 0.01913273334503174,
+ "routers_loss": 0.02055389992892742,
"skip_count": 5.0,
"step": 2856,
"text_loss": 0.6268532872200012
@@ -27149,13 +27149,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.05078125,
"learning_rate": 0.00087274373041991,
- "loss": 0.0082,
+ "loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 4609629.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012737065553665161,
+ "routers_loss": 0.0013911726418882608,
"skip_count": 0.0,
"step": 2858,
"text_loss": 0.534355640411377
@@ -27168,13 +27168,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0008725373601736188,
- "loss": 0.0079,
+ "loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 4612913.0,
"repeat_count": 2.0,
- "routers_loss": 0.009088932536542416,
+ "routers_loss": 0.01010701060295105,
"skip_count": 0.0,
"step": 2860,
"text_loss": 0.3391380310058594
@@ -27187,13 +27187,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0260009765625,
+ "grad_norm": 0.0255126953125,
"learning_rate": 0.0008723308471703085,
- "loss": 0.0078,
+ "loss": 0.008,
"macro_f1": 0.6666666865348816,
"num_tokens": 4616718.0,
"repeat_count": 0.0,
- "routers_loss": 0.006364458240568638,
+ "routers_loss": 0.005969462916254997,
"skip_count": 1.0,
"step": 2862,
"text_loss": 0.47250816226005554
@@ -27206,13 +27206,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047607421875,
+ "grad_norm": 0.046630859375,
"learning_rate": 0.0008721241914891152,
- "loss": 0.0084,
+ "loss": 0.0083,
"macro_f1": 0.3333333432674408,
"num_tokens": 4619680.0,
"repeat_count": 0.0,
- "routers_loss": 0.002686808817088604,
+ "routers_loss": 0.0027780034579336643,
"skip_count": 0.0,
"step": 2864,
"text_loss": 0.3249278664588928
@@ -27225,13 +27225,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0008719173932092295,
- "loss": 0.0047,
+ "loss": 0.0044,
"macro_f1": 0.3333333432674408,
"num_tokens": 4622700.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018892486114054918,
+ "routers_loss": 0.0015912104863673449,
"skip_count": 0.0,
"step": 2866,
"text_loss": 0.7789985537528992
@@ -27244,13 +27244,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.049072265625,
+ "grad_norm": 0.05126953125,
"learning_rate": 0.0008717104524098973,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 4626637.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035258810967206955,
+ "routers_loss": 0.0036539011634886265,
"skip_count": 0.0,
"step": 2868,
"text_loss": 0.619088351726532
@@ -27263,13 +27263,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.10400390625,
"learning_rate": 0.0008715033691704187,
- "loss": 0.0121,
+ "loss": 0.0118,
"macro_f1": 0.6666666865348816,
"num_tokens": 4629863.0,
"repeat_count": 0.0,
- "routers_loss": 0.007305602077394724,
+ "routers_loss": 0.008402476087212563,
"skip_count": 1.0,
"step": 2870,
"text_loss": 0.5550018548965454
@@ -27282,13 +27282,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0008712961435701479,
- "loss": 0.0162,
+ "loss": 0.0161,
"macro_f1": 0.6666666865348816,
"num_tokens": 4632657.0,
"repeat_count": 0.0,
- "routers_loss": 0.012898211367428303,
+ "routers_loss": 0.01400839351117611,
"skip_count": 1.0,
"step": 2872,
"text_loss": 0.17368625104427338
@@ -27301,13 +27301,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008710887756884947,
- "loss": 0.0088,
+ "loss": 0.0086,
"macro_f1": 0.3333333432674408,
"num_tokens": 4635885.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013437134912237525,
+ "routers_loss": 0.0014573842054232955,
"skip_count": 0.0,
"step": 2874,
"text_loss": 0.5138643383979797
@@ -27320,13 +27320,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0008708812656049225,
- "loss": 0.0091,
+ "loss": 0.009,
"macro_f1": 0.6666666865348816,
"num_tokens": 4639341.0,
"repeat_count": 0.0,
- "routers_loss": 0.002090727211907506,
+ "routers_loss": 0.002810224425047636,
"skip_count": 1.0,
"step": 2876,
"text_loss": 0.70310378074646
@@ -27341,11 +27341,11 @@
"f1_skip": 0.8571428656578064,
"grad_norm": 0.03564453125,
"learning_rate": 0.0008706736133989497,
- "loss": 0.0107,
+ "loss": 0.0105,
"macro_f1": 0.9449735879898071,
"num_tokens": 4642163.0,
"repeat_count": 2.0,
- "routers_loss": 0.030176319181919098,
+ "routers_loss": 0.029783209785819054,
"skip_count": 4.0,
"step": 2878,
"text_loss": 0.26898008584976196
@@ -27358,13 +27358,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.04150390625,
"learning_rate": 0.0008704658191501491,
- "loss": 0.0091,
+ "loss": 0.0095,
"macro_f1": 0.3333333432674408,
"num_tokens": 4645858.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009633690933696926,
+ "routers_loss": 0.0009193966398015618,
"skip_count": 0.0,
"step": 2880,
"text_loss": 0.6047570705413818
@@ -27377,13 +27377,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.060302734375,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0008702578829381475,
"loss": 0.0131,
"macro_f1": 0.8814815282821655,
"num_tokens": 4649237.0,
"repeat_count": 2.0,
- "routers_loss": 0.0568491593003273,
+ "routers_loss": 0.05698608607053757,
"skip_count": 4.0,
"step": 2882,
"text_loss": 0.10695219784975052
@@ -27396,13 +27396,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0306396484375,
+ "grad_norm": 0.0311279296875,
"learning_rate": 0.0008700498048426269,
- "loss": 0.0082,
+ "loss": 0.0083,
"macro_f1": 0.3333333432674408,
"num_tokens": 4652362.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012279651127755642,
+ "routers_loss": 0.0011786938412114978,
"skip_count": 0.0,
"step": 2884,
"text_loss": 0.4442957937717438
@@ -27415,13 +27415,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.046142578125,
"learning_rate": 0.0008698415849433229,
- "loss": 0.0097,
+ "loss": 0.0092,
"macro_f1": 0.5492662787437439,
"num_tokens": 4655616.0,
"repeat_count": 2.0,
- "routers_loss": 0.02166076935827732,
+ "routers_loss": 0.02142646163702011,
"skip_count": 0.0,
"step": 2886,
"text_loss": 0.5820964574813843
@@ -27434,13 +27434,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0008696332233200262,
- "loss": 0.012,
+ "loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 4659294.0,
"repeat_count": 0.0,
- "routers_loss": 0.003944257274270058,
+ "routers_loss": 0.004038636106997728,
"skip_count": 0.0,
"step": 2888,
"text_loss": 0.11847645789384842
@@ -27453,13 +27453,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.0478515625,
"learning_rate": 0.0008694247200525806,
- "loss": 0.0092,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 4662512.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013393335975706577,
+ "routers_loss": 0.0013256469974294305,
"skip_count": 0.0,
"step": 2890,
"text_loss": 0.4873582720756531
@@ -27472,13 +27472,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0008692160752208856,
- "loss": 0.0128,
+ "loss": 0.0129,
"macro_f1": 0.3272727429866791,
"num_tokens": 4666190.0,
"repeat_count": 0.0,
- "routers_loss": 0.0443510003387928,
+ "routers_loss": 0.04477972164750099,
"skip_count": 1.0,
"step": 2892,
"text_loss": 0.44243401288986206
@@ -27491,13 +27491,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.083984375,
+ "grad_norm": 0.09521484375,
"learning_rate": 0.0008690072889048941,
- "loss": 0.0125,
+ "loss": 0.0127,
"macro_f1": 1.0,
"num_tokens": 4668884.0,
"repeat_count": 1.0,
- "routers_loss": 0.0047337980940938,
+ "routers_loss": 0.004407547414302826,
"skip_count": 2.0,
"step": 2894,
"text_loss": 0.6847127079963684
@@ -27510,13 +27510,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.041015625,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0008687983611846133,
- "loss": 0.0082,
+ "loss": 0.008,
"macro_f1": 0.6666666865348816,
"num_tokens": 4672093.0,
"repeat_count": 0.0,
- "routers_loss": 0.0055244253017008305,
+ "routers_loss": 0.005245382897555828,
"skip_count": 1.0,
"step": 2896,
"text_loss": 0.25583332777023315
@@ -27529,13 +27529,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.0458984375,
"learning_rate": 0.0008685892921401049,
- "loss": 0.011,
+ "loss": 0.0108,
"macro_f1": 0.3333333432674408,
"num_tokens": 4674917.0,
"repeat_count": 0.0,
- "routers_loss": 0.001250729663297534,
+ "routers_loss": 0.0010470855049788952,
"skip_count": 0.0,
"step": 2898,
"text_loss": 0.41998377442359924
@@ -27548,13 +27548,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0008683800818514844,
- "loss": 0.0061,
+ "loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 4677739.0,
"repeat_count": 0.0,
- "routers_loss": 0.00974183902144432,
+ "routers_loss": 0.009026622399687767,
"skip_count": 2.0,
"step": 2900,
"text_loss": 0.303053081035614
@@ -27567,13 +27567,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.09619140625,
"learning_rate": 0.0008681707303989215,
- "loss": 0.0111,
+ "loss": 0.0108,
"macro_f1": 0.3333333432674408,
"num_tokens": 4680721.0,
"repeat_count": 0.0,
- "routers_loss": 0.004882345907390118,
+ "routers_loss": 0.004500916693359613,
"skip_count": 0.0,
"step": 2902,
"text_loss": 0.5573288798332214
@@ -27586,13 +27586,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0008679612378626404,
"loss": 0.0098,
"macro_f1": 0.6666666865348816,
"num_tokens": 4683339.0,
"repeat_count": 0.0,
- "routers_loss": 0.00568242697045207,
+ "routers_loss": 0.005047840531915426,
"skip_count": 1.0,
"step": 2904,
"text_loss": 0.321353554725647
@@ -27605,13 +27605,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0306396484375,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0008677516043229187,
- "loss": 0.0082,
+ "loss": 0.0083,
"macro_f1": 0.3272727429866791,
"num_tokens": 4686453.0,
"repeat_count": 0.0,
- "routers_loss": 0.010831202380359173,
+ "routers_loss": 0.010256914421916008,
"skip_count": 1.0,
"step": 2906,
"text_loss": 0.4300784468650818
@@ -27624,13 +27624,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.05615234375,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.0008675418298600883,
- "loss": 0.0087,
+ "loss": 0.0083,
"macro_f1": 0.6666666865348816,
"num_tokens": 4689645.0,
"repeat_count": 1.0,
- "routers_loss": 0.00235295994207263,
+ "routers_loss": 0.0022669637110084295,
"skip_count": 0.0,
"step": 2908,
"text_loss": 0.5064885020256042
@@ -27643,13 +27643,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0008673319145545358,
"loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 4692320.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011642680037766695,
+ "routers_loss": 0.0011188550852239132,
"skip_count": 0.0,
"step": 2910,
"text_loss": 0.7114819884300232
@@ -27662,13 +27662,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0008671218584867003,
- "loss": 0.0104,
+ "loss": 0.0102,
"macro_f1": 0.6666666865348816,
"num_tokens": 4695116.0,
"repeat_count": 0.0,
- "routers_loss": 0.00278888875618577,
+ "routers_loss": 0.002966561820358038,
"skip_count": 2.0,
"step": 2912,
"text_loss": 0.5662392973899841
@@ -27681,13 +27681,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.049560546875,
+ "grad_norm": 0.047607421875,
"learning_rate": 0.0008669116617370762,
- "loss": 0.008,
+ "loss": 0.0081,
"macro_f1": 0.3333333432674408,
"num_tokens": 4698040.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014630162622779608,
+ "routers_loss": 0.0012894890969619155,
"skip_count": 0.0,
"step": 2914,
"text_loss": 0.718977689743042
@@ -27700,13 +27700,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0986328125,
+ "grad_norm": 0.1552734375,
"learning_rate": 0.0008667013243862111,
- "loss": 0.0159,
+ "loss": 0.0162,
"macro_f1": 0.3333333432674408,
"num_tokens": 4700963.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011393720051273704,
+ "routers_loss": 0.0007232456118799746,
"skip_count": 0.0,
"step": 2916,
"text_loss": 0.3447718024253845
@@ -27719,13 +27719,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02978515625,
+ "grad_norm": 0.0289306640625,
"learning_rate": 0.000866490846514707,
- "loss": 0.0072,
+ "loss": 0.0075,
"macro_f1": 0.3272727429866791,
"num_tokens": 4704471.0,
"repeat_count": 1.0,
- "routers_loss": 0.014218449592590332,
+ "routers_loss": 0.015166680328547955,
"skip_count": 0.0,
"step": 2918,
"text_loss": 0.454946368932724
@@ -27738,13 +27738,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.052978515625,
+ "grad_norm": 0.04736328125,
"learning_rate": 0.000866280228203219,
"loss": 0.0073,
"macro_f1": 1.0,
"num_tokens": 4707238.0,
"repeat_count": 1.0,
- "routers_loss": 0.005367610137909651,
+ "routers_loss": 0.0061312485486269,
"skip_count": 1.0,
"step": 2920,
"text_loss": 0.721788227558136
@@ -27757,13 +27757,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048828125,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.0008660694695324564,
- "loss": 0.0124,
+ "loss": 0.0125,
"macro_f1": 0.3333333432674408,
"num_tokens": 4711323.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020303199999034405,
+ "routers_loss": 0.00169933564029634,
"skip_count": 0.0,
"step": 2922,
"text_loss": 0.7562121748924255
@@ -27776,13 +27776,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06201171875,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0008658585705831829,
- "loss": 0.0123,
+ "loss": 0.0128,
"macro_f1": 0.3333333432674408,
"num_tokens": 4714417.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022230520844459534,
+ "routers_loss": 0.0022731393110007048,
"skip_count": 0.0,
"step": 2924,
"text_loss": 0.5726147890090942
@@ -27795,13 +27795,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.06787109375,
+ "grad_norm": 0.068359375,
"learning_rate": 0.0008656475314362148,
- "loss": 0.0133,
+ "loss": 0.0131,
"macro_f1": 0.8817967176437378,
"num_tokens": 4717445.0,
"repeat_count": 2.0,
- "routers_loss": 0.06414645165205002,
+ "routers_loss": 0.06477782875299454,
"skip_count": 3.0,
"step": 2926,
"text_loss": 0.4505867660045624
@@ -27814,13 +27814,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 1.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.0625,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0008654363521724229,
- "loss": 0.0128,
+ "loss": 0.0129,
"macro_f1": 0.9449735879898071,
"num_tokens": 4722253.0,
"repeat_count": 2.0,
- "routers_loss": 0.022727061063051224,
+ "routers_loss": 0.027405790984630585,
"skip_count": 4.0,
"step": 2928,
"text_loss": 0.24767601490020752
@@ -27833,13 +27833,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.0537109375,
"learning_rate": 0.0008652250328727315,
- "loss": 0.0114,
+ "loss": 0.0112,
"macro_f1": 0.6666666865348816,
"num_tokens": 4725465.0,
"repeat_count": 0.0,
- "routers_loss": 0.006181784905493259,
+ "routers_loss": 0.006544729229062796,
"skip_count": 2.0,
"step": 2930,
"text_loss": 0.4478724002838135
@@ -27852,13 +27852,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.0517578125,
"learning_rate": 0.0008650135736181184,
- "loss": 0.0133,
+ "loss": 0.0134,
"macro_f1": 0.6666666865348816,
"num_tokens": 4729213.0,
"repeat_count": 1.0,
- "routers_loss": 0.005527070257812738,
+ "routers_loss": 0.0055119614116847515,
"skip_count": 0.0,
"step": 2932,
"text_loss": 0.6749323010444641
@@ -27871,13 +27871,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05517578125,
+ "grad_norm": 0.045166015625,
"learning_rate": 0.0008648019744896154,
- "loss": 0.0102,
+ "loss": 0.0101,
"macro_f1": 0.3333333432674408,
"num_tokens": 4732280.0,
"repeat_count": 0.0,
- "routers_loss": 0.008868738077580929,
+ "routers_loss": 0.008374541997909546,
"skip_count": 0.0,
"step": 2934,
"text_loss": 0.4647359251976013
@@ -27890,13 +27890,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.057373046875,
+ "grad_norm": 0.06201171875,
"learning_rate": 0.0008645902355683077,
- "loss": 0.0089,
+ "loss": 0.0091,
"macro_f1": 0.6595745086669922,
"num_tokens": 4736244.0,
"repeat_count": 1.0,
- "routers_loss": 0.07285884022712708,
+ "routers_loss": 0.068686343729496,
"skip_count": 4.0,
"step": 2936,
"text_loss": 0.5356017351150513
@@ -27909,13 +27909,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.042236328125,
"learning_rate": 0.0008643783569353339,
- "loss": 0.0072,
+ "loss": 0.007,
"macro_f1": 0.6666666865348816,
"num_tokens": 4739810.0,
"repeat_count": 2.0,
- "routers_loss": 0.019306030124425888,
+ "routers_loss": 0.017954571172595024,
"skip_count": 0.0,
"step": 2938,
"text_loss": 0.3145926296710968
@@ -27928,13 +27928,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.054443359375,
"learning_rate": 0.0008641663386718863,
- "loss": 0.0084,
+ "loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 4742720.0,
"repeat_count": 0.0,
- "routers_loss": 0.00626454409211874,
+ "routers_loss": 0.006261351052671671,
"skip_count": 1.0,
"step": 2940,
"text_loss": 0.3200613856315613
@@ -27949,11 +27949,11 @@
"f1_skip": 1.0,
"grad_norm": 0.04150390625,
"learning_rate": 0.0008639541808592109,
- "loss": 0.0091,
+ "loss": 0.0093,
"macro_f1": 1.0,
"num_tokens": 4745870.0,
"repeat_count": 1.0,
- "routers_loss": 0.0019172134343534708,
+ "routers_loss": 0.0025341357104480267,
"skip_count": 1.0,
"step": 2942,
"text_loss": 0.5020416378974915
@@ -27968,11 +27968,11 @@
"f1_skip": 1.0,
"grad_norm": 0.025634765625,
"learning_rate": 0.0008637418835786067,
- "loss": 0.0095,
+ "loss": 0.0094,
"macro_f1": 0.6666666865348816,
"num_tokens": 4748943.0,
"repeat_count": 0.0,
- "routers_loss": 0.009745351038873196,
+ "routers_loss": 0.008970048278570175,
"skip_count": 2.0,
"step": 2944,
"text_loss": 0.14517110586166382
@@ -27985,13 +27985,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.043701171875,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.0008635294469114265,
- "loss": 0.011,
+ "loss": 0.0112,
"macro_f1": 0.3333333432674408,
"num_tokens": 4751360.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020624736789613962,
+ "routers_loss": 0.002133632078766823,
"skip_count": 0.0,
"step": 2946,
"text_loss": 0.5367856025695801
@@ -28004,13 +28004,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.091796875,
+ "grad_norm": 0.08837890625,
"learning_rate": 0.0008633168709390766,
- "loss": 0.0118,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 4754403.0,
"repeat_count": 0.0,
- "routers_loss": 0.001082106726244092,
+ "routers_loss": 0.0011866620043292642,
"skip_count": 0.0,
"step": 2948,
"text_loss": 0.38302522897720337
@@ -28023,13 +28023,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.037109375,
"learning_rate": 0.0008631041557430163,
- "loss": 0.0061,
+ "loss": 0.0058,
"macro_f1": 0.6666666865348816,
"num_tokens": 4757867.0,
"repeat_count": 2.0,
- "routers_loss": 0.0026527612935751677,
+ "routers_loss": 0.0026854004245251417,
"skip_count": 0.0,
"step": 2950,
"text_loss": 0.43433454632759094
@@ -28042,13 +28042,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.05859375,
"learning_rate": 0.0008628913014047585,
"loss": 0.0102,
"macro_f1": 0.3333333432674408,
"num_tokens": 4761171.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027245471719652414,
+ "routers_loss": 0.002433479530736804,
"skip_count": 0.0,
"step": 2952,
"text_loss": 0.4725971519947052
@@ -28061,13 +28061,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0286865234375,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0008626783080058696,
- "loss": 0.0065,
+ "loss": 0.0066,
"macro_f1": 0.3272727429866791,
"num_tokens": 4764752.0,
"repeat_count": 1.0,
- "routers_loss": 0.01764744706451893,
+ "routers_loss": 0.017182493582367897,
"skip_count": 0.0,
"step": 2954,
"text_loss": 0.460641473531723
@@ -28080,13 +28080,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0859375,
+ "grad_norm": 0.12353515625,
"learning_rate": 0.0008624651756279687,
- "loss": 0.0196,
+ "loss": 0.0198,
"macro_f1": 0.3333333432674408,
"num_tokens": 4767453.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019560824148356915,
+ "routers_loss": 0.0018134774873033166,
"skip_count": 0.0,
"step": 2956,
"text_loss": 0.4091459810733795
@@ -28099,13 +28099,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
- "grad_norm": 0.051025390625,
+ "grad_norm": 0.053466796875,
"learning_rate": 0.000862251904352729,
"loss": 0.0108,
"macro_f1": 0.9259259104728699,
"num_tokens": 4771110.0,
"repeat_count": 3.0,
- "routers_loss": 0.03031078353524208,
+ "routers_loss": 0.0365753099322319,
"skip_count": 3.0,
"step": 2958,
"text_loss": 0.22408585250377655
@@ -28118,13 +28118,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.05029296875,
"learning_rate": 0.000862038494261876,
"loss": 0.0109,
"macro_f1": 0.3272727429866791,
"num_tokens": 4774464.0,
"repeat_count": 0.0,
- "routers_loss": 0.024790454655885696,
+ "routers_loss": 0.024343067780137062,
"skip_count": 1.0,
"step": 2960,
"text_loss": 0.16483014822006226
@@ -28137,13 +28137,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052490234375,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0008618249454371891,
- "loss": 0.0099,
+ "loss": 0.01,
"macro_f1": 0.3333333432674408,
"num_tokens": 4777894.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008704765350557864,
+ "routers_loss": 0.0008310087723657489,
"skip_count": 0.0,
"step": 2962,
"text_loss": 0.5573428869247437
@@ -28156,13 +28156,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0008616112579605006,
- "loss": 0.0116,
+ "loss": 0.0117,
"macro_f1": 0.3333333432674408,
"num_tokens": 4781116.0,
"repeat_count": 0.0,
- "routers_loss": 0.0066874073818326,
+ "routers_loss": 0.0065494864247739315,
"skip_count": 0.0,
"step": 2964,
"text_loss": 0.18816794455051422
@@ -28175,13 +28175,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.0008613974319136957,
- "loss": 0.0091,
+ "loss": 0.009,
"macro_f1": 0.3333333432674408,
"num_tokens": 4784886.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021798228845000267,
+ "routers_loss": 0.0019726944155991077,
"skip_count": 0.0,
"step": 2966,
"text_loss": 0.5097305774688721
@@ -28194,13 +28194,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.076171875,
+ "grad_norm": 0.0849609375,
"learning_rate": 0.0008611834673787134,
"loss": 0.0118,
"macro_f1": 0.3333333432674408,
"num_tokens": 4787563.0,
"repeat_count": 0.0,
- "routers_loss": 0.0063707553781569,
+ "routers_loss": 0.006327496841549873,
"skip_count": 0.0,
"step": 2968,
"text_loss": 0.6953814029693604
@@ -28213,13 +28213,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.5,
"f1_skip": 1.0,
- "grad_norm": 0.0595703125,
+ "grad_norm": 0.056884765625,
"learning_rate": 0.0008609693644375449,
- "loss": 0.0088,
+ "loss": 0.0086,
"macro_f1": 0.8200000524520874,
"num_tokens": 4790421.0,
"repeat_count": 3.0,
- "routers_loss": 0.044509731233119965,
+ "routers_loss": 0.042896661907434464,
"skip_count": 1.0,
"step": 2970,
"text_loss": 0.2573051154613495
@@ -28227,18 +28227,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 1.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 13.953331376577633,
- "f1_execute": 0.9795917868614197,
+ "f1_execute": 1.0,
"f1_repeat": 1.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.1640625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.14453125,
"learning_rate": 0.000860755123172235,
- "loss": 0.01,
- "macro_f1": 0.8820862174034119,
+ "loss": 0.0096,
+ "macro_f1": 1.0,
"num_tokens": 4793786.0,
"repeat_count": 2.0,
- "routers_loss": 0.01667599380016327,
+ "routers_loss": 0.013228793628513813,
"skip_count": 1.0,
"step": 2972,
"text_loss": 0.46614497900009155
@@ -28251,13 +28251,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.0296630859375,
"learning_rate": 0.0008605407436648815,
- "loss": 0.0069,
+ "loss": 0.007,
"macro_f1": 0.6666666865348816,
"num_tokens": 4796864.0,
"repeat_count": 0.0,
- "routers_loss": 0.008433761075139046,
+ "routers_loss": 0.007294759154319763,
"skip_count": 2.0,
"step": 2974,
"text_loss": 0.21555091440677643
@@ -28270,13 +28270,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.059814453125,
+ "grad_norm": 0.057861328125,
"learning_rate": 0.0008603262259976348,
- "loss": 0.0131,
+ "loss": 0.0129,
"macro_f1": 1.0,
"num_tokens": 4800080.0,
"repeat_count": 1.0,
- "routers_loss": 0.002439796691760421,
+ "routers_loss": 0.0024024227168411016,
"skip_count": 5.0,
"step": 2976,
"text_loss": 0.7855485081672668
@@ -28289,13 +28289,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05126953125,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0008601115702526987,
- "loss": 0.0112,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 4802899.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015027766348794103,
+ "routers_loss": 0.001433031284250319,
"skip_count": 0.0,
"step": 2978,
"text_loss": 0.6777765154838562
@@ -28308,13 +28308,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06103515625,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.0008598967765123293,
- "loss": 0.0091,
+ "loss": 0.0088,
"macro_f1": 0.3333333432674408,
"num_tokens": 4805835.0,
"repeat_count": 0.0,
- "routers_loss": 0.003235677955672145,
+ "routers_loss": 0.003073975909501314,
"skip_count": 0.0,
"step": 2980,
"text_loss": 0.5926910638809204
@@ -28322,18 +28322,18 @@
{
"acc_repeat": 1.0,
"acc_skip": 0.5,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 14.0,
- "f1_execute": 0.9090908765792847,
- "f1_repeat": 0.6666666865348816,
+ "f1_execute": 0.9333333373069763,
+ "f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.052734375,
+ "grad_norm": 0.05322265625,
"learning_rate": 0.0008596818448588364,
- "loss": 0.0141,
- "macro_f1": 0.7474747896194458,
+ "loss": 0.0139,
+ "macro_f1": 0.8666667342185974,
"num_tokens": 4809028.0,
"repeat_count": 1.0,
- "routers_loss": 0.063179150223732,
+ "routers_loss": 0.06438573449850082,
"skip_count": 6.0,
"step": 2982,
"text_loss": 0.23975612223148346
@@ -28346,13 +28346,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0299072265625,
+ "grad_norm": 0.0302734375,
"learning_rate": 0.0008594667753745821,
- "loss": 0.0055,
+ "loss": 0.0054,
"macro_f1": 0.3272727429866791,
"num_tokens": 4812831.0,
"repeat_count": 0.0,
- "routers_loss": 0.015444152988493443,
+ "routers_loss": 0.014817612245678902,
"skip_count": 1.0,
"step": 2984,
"text_loss": 0.17292268574237823
@@ -28365,13 +28365,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.060546875,
+ "grad_norm": 0.07421875,
"learning_rate": 0.0008592515681419813,
- "loss": 0.0079,
+ "loss": 0.0078,
"macro_f1": 0.5492662787437439,
"num_tokens": 4816005.0,
"repeat_count": 2.0,
- "routers_loss": 0.02485196851193905,
+ "routers_loss": 0.025407327339053154,
"skip_count": 0.0,
"step": 2986,
"text_loss": 0.6403061151504517
@@ -28384,13 +28384,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0008590362232435018,
- "loss": 0.0102,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 4818901.0,
"repeat_count": 0.0,
- "routers_loss": 0.006175600457936525,
+ "routers_loss": 0.006826757453382015,
"skip_count": 0.0,
"step": 2988,
"text_loss": 0.2572069466114044
@@ -28403,13 +28403,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041748046875,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0008588207407616644,
- "loss": 0.0085,
+ "loss": 0.0086,
"macro_f1": 0.3333333432674408,
"num_tokens": 4823120.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008576468680985272,
+ "routers_loss": 0.0009054148104041815,
"skip_count": 0.0,
"step": 2990,
"text_loss": 0.4827076196670532
@@ -28422,13 +28422,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02392578125,
+ "grad_norm": 0.0247802734375,
"learning_rate": 0.0008586051207790422,
- "loss": 0.0059,
+ "loss": 0.0055,
"macro_f1": 0.3333333432674408,
"num_tokens": 4825774.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011548360344022512,
+ "routers_loss": 0.0012294676853343844,
"skip_count": 0.0,
"step": 2992,
"text_loss": 0.40157821774482727
@@ -28441,13 +28441,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.056396484375,
+ "grad_norm": 0.052734375,
"learning_rate": 0.0008583893633782612,
- "loss": 0.0085,
+ "loss": 0.0084,
"macro_f1": 0.5492662787437439,
"num_tokens": 4828841.0,
"repeat_count": 0.0,
- "routers_loss": 0.01307896338403225,
+ "routers_loss": 0.011474622413516045,
"skip_count": 2.0,
"step": 2994,
"text_loss": 0.14842072129249573
@@ -28460,13 +28460,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0615234375,
+ "grad_norm": 0.058837890625,
"learning_rate": 0.0008581734686419999,
"loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 4831458.0,
"repeat_count": 0.0,
- "routers_loss": 0.009716883301734924,
+ "routers_loss": 0.009154081344604492,
"skip_count": 2.0,
"step": 2996,
"text_loss": 0.365400105714798
@@ -28479,13 +28479,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.00085795743665299,
"loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4834609.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026114562060683966,
+ "routers_loss": 0.002899336162954569,
"skip_count": 0.0,
"step": 2998,
"text_loss": 0.5574684143066406
@@ -28498,13 +28498,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052001953125,
+ "grad_norm": 0.0517578125,
"learning_rate": 0.0008577412674940152,
"loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 4838324.0,
"repeat_count": 0.0,
- "routers_loss": 0.003787368768826127,
+ "routers_loss": 0.0034664268605411053,
"skip_count": 0.0,
"step": 3000,
"text_loss": 0.6752855777740479
@@ -28517,13 +28517,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0281982421875,
+ "grad_norm": 0.03466796875,
"learning_rate": 0.0008575249612479117,
"loss": 0.0127,
"macro_f1": 0.6666666865348816,
"num_tokens": 4841877.0,
"repeat_count": 0.0,
- "routers_loss": 0.004202218260616064,
+ "routers_loss": 0.0036425739526748657,
"skip_count": 2.0,
"step": 3002,
"text_loss": 0.6332980394363403
@@ -28536,13 +28536,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0517578125,
+ "grad_norm": 0.048095703125,
"learning_rate": 0.0008573085179975685,
- "loss": 0.0066,
+ "loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 4845840.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012371218763291836,
+ "routers_loss": 0.0013783496106043458,
"skip_count": 0.0,
"step": 3004,
"text_loss": 0.4219617545604706
@@ -28555,13 +28555,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0008570919378259274,
"loss": 0.007,
"macro_f1": 0.6666666865348816,
"num_tokens": 4848766.0,
"repeat_count": 0.0,
- "routers_loss": 0.005013706628233194,
+ "routers_loss": 0.004823608323931694,
"skip_count": 1.0,
"step": 3006,
"text_loss": 0.7987180948257446
@@ -28574,13 +28574,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.029052734375,
+ "grad_norm": 0.0302734375,
"learning_rate": 0.000856875220815982,
- "loss": 0.0069,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 4852310.0,
"repeat_count": 0.0,
- "routers_loss": 0.001336073037236929,
+ "routers_loss": 0.0014760984340682626,
"skip_count": 0.0,
"step": 3008,
"text_loss": 0.35592713952064514
@@ -28593,13 +28593,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0008566583670507788,
"loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 4856146.0,
"repeat_count": 0.0,
- "routers_loss": 0.003256940981373191,
+ "routers_loss": 0.0031717263627797365,
"skip_count": 1.0,
"step": 3010,
"text_loss": 0.19379083812236786
@@ -28612,13 +28612,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041748046875,
+ "grad_norm": 0.0517578125,
"learning_rate": 0.0008564413766134164,
- "loss": 0.0091,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 4859386.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038389062974601984,
+ "routers_loss": 0.003361492184922099,
"skip_count": 0.0,
"step": 3012,
"text_loss": 0.39129266142845154
@@ -28631,13 +28631,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052734375,
+ "grad_norm": 0.048583984375,
"learning_rate": 0.0008562242495870463,
- "loss": 0.0119,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 4862661.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007799214799888432,
+ "routers_loss": 0.0010563990799710155,
"skip_count": 0.0,
"step": 3014,
"text_loss": 0.5966938734054565
@@ -28650,13 +28650,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0213623046875,
+ "grad_norm": 0.0234375,
"learning_rate": 0.0008560069860548716,
- "loss": 0.006,
+ "loss": 0.0059,
"macro_f1": 0.3333333432674408,
"num_tokens": 4865410.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010348912328481674,
+ "routers_loss": 0.001233913702890277,
"skip_count": 0.0,
"step": 3016,
"text_loss": 0.3386077880859375
@@ -28669,13 +28669,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056884765625,
+ "grad_norm": 0.055419921875,
"learning_rate": 0.0008557895861001484,
- "loss": 0.006,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 4868931.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018167694797739387,
+ "routers_loss": 0.0018066301709041,
"skip_count": 0.0,
"step": 3018,
"text_loss": 0.5222050547599792
@@ -28688,13 +28688,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0008555720498061845,
- "loss": 0.0078,
+ "loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 4873492.0,
"repeat_count": 0.0,
- "routers_loss": 0.005788089707493782,
+ "routers_loss": 0.0050385501235723495,
"skip_count": 1.0,
"step": 3020,
"text_loss": 0.4558849334716797
@@ -28707,13 +28707,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0008553543772563403,
- "loss": 0.0092,
+ "loss": 0.009,
"macro_f1": 0.3333333432674408,
"num_tokens": 4877026.0,
"repeat_count": 0.0,
- "routers_loss": 0.004194240085780621,
+ "routers_loss": 0.004828717093914747,
"skip_count": 0.0,
"step": 3022,
"text_loss": 0.36598992347717285
@@ -28726,13 +28726,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 1.0,
"f1_skip": 0.888888955116272,
- "grad_norm": 0.05712890625,
+ "grad_norm": 0.06103515625,
"learning_rate": 0.0008551365685340285,
"loss": 0.0084,
"macro_f1": 0.9555556178092957,
"num_tokens": 4879655.0,
"repeat_count": 1.0,
- "routers_loss": 0.019211066886782646,
+ "routers_loss": 0.02049369551241398,
"skip_count": 5.0,
"step": 3024,
"text_loss": 0.5069093704223633
@@ -28745,13 +28745,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0008549186237227138,
- "loss": 0.0092,
+ "loss": 0.0088,
"macro_f1": 0.8823530077934265,
"num_tokens": 4882606.0,
"repeat_count": 1.0,
- "routers_loss": 0.041074834764003754,
+ "routers_loss": 0.03947242721915245,
"skip_count": 2.0,
"step": 3026,
"text_loss": 0.2600715458393097
@@ -28764,13 +28764,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.031982421875,
+ "grad_norm": 0.030029296875,
"learning_rate": 0.0008547005429059128,
- "loss": 0.0075,
+ "loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 4885246.0,
"repeat_count": 2.0,
- "routers_loss": 0.0027008953038603067,
+ "routers_loss": 0.0026363315992057323,
"skip_count": 0.0,
"step": 3028,
"text_loss": 0.37642326951026917
@@ -28783,13 +28783,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.046630859375,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0008544823261671948,
- "loss": 0.0074,
+ "loss": 0.0073,
"macro_f1": 0.3333333432674408,
"num_tokens": 4888109.0,
"repeat_count": 0.0,
- "routers_loss": 0.00402502017095685,
+ "routers_loss": 0.003858231008052826,
"skip_count": 0.0,
"step": 3030,
"text_loss": 0.5875385999679565
@@ -28802,13 +28802,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0008542639735901804,
- "loss": 0.007,
+ "loss": 0.0067,
"macro_f1": 1.0,
"num_tokens": 4891168.0,
"repeat_count": 1.0,
- "routers_loss": 0.00628731120377779,
+ "routers_loss": 0.004789089784026146,
"skip_count": 1.0,
"step": 3032,
"text_loss": 0.6417325139045715
@@ -28821,32 +28821,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.0008540454852585434,
- "loss": 0.0117,
+ "loss": 0.0115,
"macro_f1": 0.6666666865348816,
"num_tokens": 4894355.0,
"repeat_count": 0.0,
- "routers_loss": 0.007284072227776051,
+ "routers_loss": 0.007334680762141943,
"skip_count": 2.0,
"step": 3034,
"text_loss": 0.23697198927402496
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.6666666865348816,
- "avg_layers": 26.0,
+ "acc_skip": 0.3333333432674408,
+ "avg_layers": 27.0,
"epoch": 14.253595538597006,
- "f1_execute": 0.9803921580314636,
+ "f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
- "f1_skip": 0.800000011920929,
- "grad_norm": 0.033203125,
+ "f1_skip": 0.5,
+ "grad_norm": 0.034423828125,
"learning_rate": 0.0008538268612560084,
- "loss": 0.0059,
- "macro_f1": 0.5934640765190125,
+ "loss": 0.0058,
+ "macro_f1": 0.4871794879436493,
"num_tokens": 4897543.0,
"repeat_count": 0.0,
- "routers_loss": 0.020328659564256668,
+ "routers_loss": 0.022096361964941025,
"skip_count": 3.0,
"step": 3036,
"text_loss": 0.1989550143480301
@@ -28859,13 +28859,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.047119140625,
"learning_rate": 0.0008536081016663527,
- "loss": 0.0102,
+ "loss": 0.0101,
"macro_f1": 1.0,
"num_tokens": 4900752.0,
"repeat_count": 1.0,
- "routers_loss": 0.002338571473956108,
+ "routers_loss": 0.0037680594250559807,
"skip_count": 2.0,
"step": 3038,
"text_loss": 0.5001366138458252
@@ -28878,13 +28878,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0008533892065734055,
- "loss": 0.0083,
+ "loss": 0.008,
"macro_f1": 0.6666666865348816,
"num_tokens": 4903581.0,
"repeat_count": 0.0,
- "routers_loss": 0.003033763263374567,
+ "routers_loss": 0.0032373068388551474,
"skip_count": 1.0,
"step": 3040,
"text_loss": 0.5019411444664001
@@ -28897,13 +28897,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.042724609375,
"learning_rate": 0.0008531701760610476,
- "loss": 0.012,
+ "loss": 0.0121,
"macro_f1": 1.0,
"num_tokens": 4907108.0,
"repeat_count": 1.0,
- "routers_loss": 0.00831629242748022,
+ "routers_loss": 0.0078013185411691666,
"skip_count": 2.0,
"step": 3042,
"text_loss": 0.3460627794265747
@@ -28916,13 +28916,13 @@
"f1_execute": 0.9600000381469727,
"f1_repeat": 1.0,
"f1_skip": 0.5,
- "grad_norm": 0.04736328125,
+ "grad_norm": 0.04833984375,
"learning_rate": 0.000852951010213212,
- "loss": 0.0087,
+ "loss": 0.0089,
"macro_f1": 0.8200000524520874,
"num_tokens": 4911269.0,
"repeat_count": 1.0,
- "routers_loss": 0.03200878947973251,
+ "routers_loss": 0.03576689213514328,
"skip_count": 3.0,
"step": 3044,
"text_loss": 0.268994003534317
@@ -28935,13 +28935,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0283203125,
+ "grad_norm": 0.02685546875,
"learning_rate": 0.0008527317091138835,
- "loss": 0.0068,
+ "loss": 0.0066,
"macro_f1": 1.0,
"num_tokens": 4914203.0,
"repeat_count": 1.0,
- "routers_loss": 0.003899211063981056,
+ "routers_loss": 0.0032140621915459633,
"skip_count": 1.0,
"step": 3046,
"text_loss": 0.9998719692230225
@@ -28954,13 +28954,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.040771484375,
"learning_rate": 0.0008525122728470987,
"loss": 0.0102,
"macro_f1": 1.0,
"num_tokens": 4918562.0,
"repeat_count": 1.0,
- "routers_loss": 0.00883556716144085,
+ "routers_loss": 0.008559177629649639,
"skip_count": 3.0,
"step": 3048,
"text_loss": 0.3062439560890198
@@ -28973,13 +28973,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03173828125,
+ "grad_norm": 0.03125,
"learning_rate": 0.0008522927014969459,
- "loss": 0.0064,
+ "loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 4921940.0,
"repeat_count": 0.0,
- "routers_loss": 0.009054492227733135,
+ "routers_loss": 0.008735597133636475,
"skip_count": 2.0,
"step": 3050,
"text_loss": 0.3637430965900421
@@ -28992,13 +28992,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.05517578125,
"learning_rate": 0.0008520729951475652,
- "loss": 0.0082,
+ "loss": 0.0085,
"macro_f1": 0.3333333432674408,
"num_tokens": 4925416.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011907420121133327,
+ "routers_loss": 0.0012709591537714005,
"skip_count": 0.0,
"step": 3052,
"text_loss": 0.542036235332489
@@ -29011,13 +29011,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0008518531538831488,
"loss": 0.0096,
"macro_f1": 0.6666666865348816,
"num_tokens": 4928695.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013618353987112641,
+ "routers_loss": 0.0010660928674042225,
"skip_count": 1.0,
"step": 3054,
"text_loss": 0.43144503235816956
@@ -29030,13 +29030,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.060546875,
+ "grad_norm": 0.059326171875,
"learning_rate": 0.00085163317778794,
- "loss": 0.0102,
+ "loss": 0.0096,
"macro_f1": 0.6666666865348816,
"num_tokens": 4931504.0,
"repeat_count": 0.0,
- "routers_loss": 0.004202015232294798,
+ "routers_loss": 0.004558971151709557,
"skip_count": 2.0,
"step": 3056,
"text_loss": 0.5257010459899902
@@ -29049,32 +29049,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.0008514130669462341,
- "loss": 0.0109,
+ "loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 4934935.0,
"repeat_count": 0.0,
- "routers_loss": 0.01060314942151308,
+ "routers_loss": 0.010774781927466393,
"skip_count": 2.0,
"step": 3058,
"text_loss": 0.26061776280403137
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.0,
"acc_skip": 1.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 14.366304666862343,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.0390625,
"learning_rate": 0.0008511928214423782,
"loss": 0.0103,
- "macro_f1": 1.0,
+ "macro_f1": 0.6601307392120361,
"num_tokens": 4938047.0,
"repeat_count": 1.0,
- "routers_loss": 0.012400983832776546,
+ "routers_loss": 0.014763157814741135,
"skip_count": 2.0,
"step": 3060,
"text_loss": 0.2856905460357666
@@ -29087,13 +29087,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.046875,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0008509724413607705,
"loss": 0.0087,
"macro_f1": 0.6666666865348816,
"num_tokens": 4941041.0,
"repeat_count": 1.0,
- "routers_loss": 0.004353851079940796,
+ "routers_loss": 0.004613345488905907,
"skip_count": 0.0,
"step": 3062,
"text_loss": 0.2870287001132965
@@ -29106,13 +29106,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.053955078125,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0008507519267858612,
- "loss": 0.0148,
+ "loss": 0.015,
"macro_f1": 1.0,
"num_tokens": 4944708.0,
"repeat_count": 1.0,
- "routers_loss": 0.009858032688498497,
+ "routers_loss": 0.008584189228713512,
"skip_count": 2.0,
"step": 3064,
"text_loss": 0.15828095376491547
@@ -29125,13 +29125,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0306396484375,
+ "grad_norm": 0.029052734375,
"learning_rate": 0.0008505312778021519,
"loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 4948295.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016502789221704006,
+ "routers_loss": 0.0014670816017314792,
"skip_count": 0.0,
"step": 3066,
"text_loss": 0.36697930097579956
@@ -29144,13 +29144,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08642578125,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0008503104944941958,
- "loss": 0.0108,
+ "loss": 0.0107,
"macro_f1": 0.6666666865348816,
"num_tokens": 4951983.0,
"repeat_count": 0.0,
- "routers_loss": 0.00573746208101511,
+ "routers_loss": 0.005348859820514917,
"skip_count": 2.0,
"step": 3068,
"text_loss": 0.21612997353076935
@@ -29163,13 +29163,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0008500895769465972,
- "loss": 0.0113,
+ "loss": 0.0111,
"macro_f1": 0.3333333432674408,
"num_tokens": 4955023.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012014979729428887,
+ "routers_loss": 0.0013203793205320835,
"skip_count": 0.0,
"step": 3070,
"text_loss": 0.9757798314094543
@@ -29182,13 +29182,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.0478515625,
"learning_rate": 0.0008498685252440124,
- "loss": 0.0067,
+ "loss": 0.0065,
"macro_f1": 0.3333333432674408,
"num_tokens": 4957600.0,
"repeat_count": 0.0,
- "routers_loss": 0.006400141399353743,
+ "routers_loss": 0.006907356437295675,
"skip_count": 0.0,
"step": 3072,
"text_loss": 0.356107234954834
@@ -29201,13 +29201,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.046630859375,
+ "grad_norm": 0.061279296875,
"learning_rate": 0.0008496473394711487,
- "loss": 0.0117,
+ "loss": 0.0116,
"macro_f1": 0.6666666865348816,
"num_tokens": 4960746.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030972862150520086,
+ "routers_loss": 0.0027704904787242413,
"skip_count": 1.0,
"step": 3074,
"text_loss": 0.6812908053398132
@@ -29220,13 +29220,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05517578125,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0008494260197127649,
- "loss": 0.0092,
+ "loss": 0.0093,
"macro_f1": 0.6666666865348816,
"num_tokens": 4963845.0,
"repeat_count": 0.0,
- "routers_loss": 0.004087577573955059,
+ "routers_loss": 0.0036796489730477333,
"skip_count": 2.0,
"step": 3076,
"text_loss": 0.7215370535850525
@@ -29239,13 +29239,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.0556640625,
"learning_rate": 0.0008492045660536712,
- "loss": 0.0085,
+ "loss": 0.0084,
"macro_f1": 0.6666666865348816,
"num_tokens": 4966887.0,
"repeat_count": 0.0,
- "routers_loss": 0.003797230776399374,
+ "routers_loss": 0.0037137691397219896,
"skip_count": 1.0,
"step": 3078,
"text_loss": 0.8700299859046936
@@ -29258,13 +29258,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0008489829785787291,
- "loss": 0.0081,
+ "loss": 0.0078,
"macro_f1": 0.8823530077934265,
"num_tokens": 4969859.0,
"repeat_count": 1.0,
- "routers_loss": 0.020377423614263535,
+ "routers_loss": 0.016492314636707306,
"skip_count": 2.0,
"step": 3080,
"text_loss": 0.6520360112190247
@@ -29277,13 +29277,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.043701171875,
"learning_rate": 0.0008487612573728513,
- "loss": 0.0096,
+ "loss": 0.0094,
"macro_f1": 0.6666666865348816,
"num_tokens": 4972628.0,
"repeat_count": 0.0,
- "routers_loss": 0.003695295425131917,
+ "routers_loss": 0.004022917244583368,
"skip_count": 2.0,
"step": 3082,
"text_loss": 0.17498187720775604
@@ -29296,13 +29296,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0008485394025210016,
- "loss": 0.0078,
+ "loss": 0.0076,
"macro_f1": 0.6666666865348816,
"num_tokens": 4975475.0,
"repeat_count": 0.0,
- "routers_loss": 0.008704355917870998,
+ "routers_loss": 0.009141159243881702,
"skip_count": 1.0,
"step": 3084,
"text_loss": 0.5975366234779358
@@ -29315,13 +29315,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.045166015625,
"learning_rate": 0.0008483174141081956,
- "loss": 0.0111,
+ "loss": 0.0113,
"macro_f1": 0.3333333432674408,
"num_tokens": 4978858.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031532018911093473,
+ "routers_loss": 0.0031561285723000765,
"skip_count": 0.0,
"step": 3086,
"text_loss": 0.18748866021633148
@@ -29334,13 +29334,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.04150390625,
"learning_rate": 0.0008480952922194991,
"loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 4982142.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007620530668646097,
+ "routers_loss": 0.0007894713780842721,
"skip_count": 0.0,
"step": 3088,
"text_loss": 0.42083197832107544
@@ -29353,13 +29353,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037841796875,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008478730369400302,
- "loss": 0.0086,
+ "loss": 0.0083,
"macro_f1": 0.3333333432674408,
"num_tokens": 4984872.0,
"repeat_count": 0.0,
- "routers_loss": 0.000692489615175873,
+ "routers_loss": 0.0005908289458602667,
"skip_count": 0.0,
"step": 3090,
"text_loss": 0.45337188243865967
@@ -29372,13 +29372,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0240478515625,
+ "grad_norm": 0.02392578125,
"learning_rate": 0.0008476506483549573,
- "loss": 0.0103,
+ "loss": 0.0101,
"macro_f1": 1.0,
"num_tokens": 4988137.0,
"repeat_count": 1.0,
- "routers_loss": 0.001856967923231423,
+ "routers_loss": 0.0016509373672306538,
"skip_count": 2.0,
"step": 3092,
"text_loss": 0.6397262811660767
@@ -29391,13 +29391,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.031982421875,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.0008474281265495002,
- "loss": 0.0075,
+ "loss": 0.0076,
"macro_f1": 0.6666666865348816,
"num_tokens": 4991164.0,
"repeat_count": 0.0,
- "routers_loss": 0.004027622286230326,
+ "routers_loss": 0.004088304936885834,
"skip_count": 1.0,
"step": 3094,
"text_loss": 0.18352322280406952
@@ -29410,32 +29410,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03857421875,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0008472054716089295,
- "loss": 0.0061,
+ "loss": 0.0059,
"macro_f1": 0.3333333432674408,
"num_tokens": 4993876.0,
"repeat_count": 0.0,
- "routers_loss": 0.004844399634748697,
+ "routers_loss": 0.005200014915317297,
"skip_count": 0.0,
"step": 3096,
"text_loss": 0.2776511013507843
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.0,
"acc_skip": 1.0,
- "avg_layers": 27.0,
+ "avg_layers": 26.0,
"epoch": 14.544760786615791,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9803921580314636,
+ "f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0286865234375,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.0008469826836185673,
"loss": 0.01,
- "macro_f1": 1.0,
+ "macro_f1": 0.6601307392120361,
"num_tokens": 4997068.0,
"repeat_count": 1.0,
- "routers_loss": 0.012379852123558521,
+ "routers_loss": 0.012686059810221195,
"skip_count": 2.0,
"step": 3098,
"text_loss": 0.23209233582019806
@@ -29448,13 +29448,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.055419921875,
"learning_rate": 0.0008467597626637858,
- "loss": 0.0076,
+ "loss": 0.0074,
"macro_f1": 1.0,
"num_tokens": 5000038.0,
"repeat_count": 1.0,
- "routers_loss": 0.00575951999053359,
+ "routers_loss": 0.006401528604328632,
"skip_count": 2.0,
"step": 3100,
"text_loss": 0.45936745405197144
@@ -29467,13 +29467,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0008465367088300093,
"loss": 0.0075,
"macro_f1": 0.3272727429866791,
"num_tokens": 5002870.0,
"repeat_count": 0.0,
- "routers_loss": 0.013157932087779045,
+ "routers_loss": 0.016640547662973404,
"skip_count": 1.0,
"step": 3102,
"text_loss": 0.44502779841423035
@@ -29486,13 +29486,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0283203125,
+ "grad_norm": 0.0272216796875,
"learning_rate": 0.0008463135222027124,
- "loss": 0.0052,
+ "loss": 0.0054,
"macro_f1": 0.6666666865348816,
"num_tokens": 5006357.0,
"repeat_count": 0.0,
- "routers_loss": 0.008679390884935856,
+ "routers_loss": 0.008411331102252007,
"skip_count": 2.0,
"step": 3104,
"text_loss": 0.3414570391178131
@@ -29505,13 +29505,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0008460902028674204,
- "loss": 0.0059,
+ "loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 5009059.0,
"repeat_count": 0.0,
- "routers_loss": 0.001076352084055543,
+ "routers_loss": 0.0010406570509076118,
"skip_count": 0.0,
"step": 3106,
"text_loss": 0.5931221842765808
@@ -29524,13 +29524,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.0008458667509097098,
- "loss": 0.0112,
+ "loss": 0.0115,
"macro_f1": 0.3333333432674408,
"num_tokens": 5012327.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021328055299818516,
+ "routers_loss": 0.001959054498001933,
"skip_count": 0.0,
"step": 3108,
"text_loss": 0.5191171169281006
@@ -29543,13 +29543,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07470703125,
+ "grad_norm": 0.06640625,
"learning_rate": 0.0008456431664152078,
- "loss": 0.0129,
+ "loss": 0.0127,
"macro_f1": 0.3333333432674408,
"num_tokens": 5015472.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010206506121903658,
+ "routers_loss": 0.000994380097836256,
"skip_count": 0.0,
"step": 3110,
"text_loss": 0.4455361068248749
@@ -29562,13 +29562,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0263671875,
+ "grad_norm": 0.0264892578125,
"learning_rate": 0.0008454194494695923,
- "loss": 0.0111,
+ "loss": 0.0109,
"macro_f1": 0.3333333432674408,
"num_tokens": 5018901.0,
"repeat_count": 0.0,
- "routers_loss": 0.0041310288943350315,
+ "routers_loss": 0.0037662344984710217,
"skip_count": 0.0,
"step": 3112,
"text_loss": 0.5335362553596497
@@ -29581,13 +29581,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0240478515625,
+ "grad_norm": 0.02294921875,
"learning_rate": 0.0008451956001585923,
- "loss": 0.0066,
+ "loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 5022520.0,
"repeat_count": 0.0,
- "routers_loss": 0.00994859915226698,
+ "routers_loss": 0.008664715103805065,
"skip_count": 3.0,
"step": 3114,
"text_loss": 0.16230148077011108
@@ -29600,13 +29600,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.0498046875,
"learning_rate": 0.000844971618567987,
- "loss": 0.0087,
+ "loss": 0.0086,
"macro_f1": 0.3333333432674408,
"num_tokens": 5025505.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016823343466967344,
+ "routers_loss": 0.0015904927859082818,
"skip_count": 0.0,
"step": 3116,
"text_loss": 0.6989432573318481
@@ -29619,13 +29619,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03369140625,
+ "grad_norm": 0.033935546875,
"learning_rate": 0.0008447475047836068,
- "loss": 0.0061,
+ "loss": 0.0064,
"macro_f1": 0.6666666865348816,
"num_tokens": 5028767.0,
"repeat_count": 0.0,
- "routers_loss": 0.005725692491978407,
+ "routers_loss": 0.005853322334587574,
"skip_count": 1.0,
"step": 3118,
"text_loss": 0.31420737504959106
@@ -29638,13 +29638,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05712890625,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0008445232588913325,
- "loss": 0.0116,
+ "loss": 0.0115,
"macro_f1": 0.3272727429866791,
"num_tokens": 5032577.0,
"repeat_count": 0.0,
- "routers_loss": 0.016534095630049706,
+ "routers_loss": 0.012760105542838573,
"skip_count": 0.0,
"step": 3120,
"text_loss": 0.5534627437591553
@@ -29657,13 +29657,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0008442988809770953,
- "loss": 0.0097,
+ "loss": 0.0095,
"macro_f1": 0.3333333432674408,
"num_tokens": 5035381.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023590524215251207,
+ "routers_loss": 0.0022257440723478794,
"skip_count": 0.0,
"step": 3122,
"text_loss": 0.42492759227752686
@@ -29676,13 +29676,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.03955078125,
"learning_rate": 0.0008440743711268775,
- "loss": 0.0084,
+ "loss": 0.0083,
"macro_f1": 0.3333333432674408,
"num_tokens": 5038743.0,
"repeat_count": 0.0,
- "routers_loss": 0.004739012103527784,
+ "routers_loss": 0.004648433532565832,
"skip_count": 0.0,
"step": 3124,
"text_loss": 0.16404685378074646
@@ -29695,13 +29695,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.03955078125,
"learning_rate": 0.0008438497294267117,
- "loss": 0.0069,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 5041492.0,
"repeat_count": 0.0,
- "routers_loss": 0.006212939508259296,
+ "routers_loss": 0.006313877180218697,
"skip_count": 0.0,
"step": 3126,
"text_loss": 0.23191484808921814
@@ -29714,13 +29714,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.068359375,
+ "grad_norm": 0.07666015625,
"learning_rate": 0.0008436249559626807,
"loss": 0.0046,
"macro_f1": 0.6666666865348816,
"num_tokens": 5043955.0,
"repeat_count": 1.0,
- "routers_loss": 0.0036408400628715754,
+ "routers_loss": 0.0036270488053560257,
"skip_count": 0.0,
"step": 3128,
"text_loss": 0.5782018303871155
@@ -29733,13 +29733,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.04345703125,
"learning_rate": 0.0008434000508209187,
"loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 5047571.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038875883910804987,
+ "routers_loss": 0.003809858812019229,
"skip_count": 1.0,
"step": 3130,
"text_loss": 0.7129825949668884
@@ -29752,13 +29752,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.03955078125,
"learning_rate": 0.0008431750140876092,
- "loss": 0.0129,
+ "loss": 0.0128,
"macro_f1": 0.3333333432674408,
"num_tokens": 5051608.0,
"repeat_count": 0.0,
- "routers_loss": 0.002172809559851885,
+ "routers_loss": 0.0022369057405740023,
"skip_count": 0.0,
"step": 3132,
"text_loss": 0.4433445930480957
@@ -29773,11 +29773,11 @@
"f1_skip": 0.0,
"grad_norm": 0.0654296875,
"learning_rate": 0.000842949845848987,
- "loss": 0.0134,
+ "loss": 0.0135,
"macro_f1": 0.32098764181137085,
"num_tokens": 5054656.0,
"repeat_count": 0.0,
- "routers_loss": 0.04427836462855339,
+ "routers_loss": 0.0425117202103138,
"skip_count": 2.0,
"step": 3134,
"text_loss": 0.38721024990081787
@@ -29790,13 +29790,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0008427245461913368,
"loss": 0.0121,
"macro_f1": 0.3333333432674408,
"num_tokens": 5059108.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016648605233058333,
+ "routers_loss": 0.0018077283166348934,
"skip_count": 0.0,
"step": 3136,
"text_loss": 0.7496368885040283
@@ -29809,13 +29809,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.1142578125,
+ "grad_norm": 0.12109375,
"learning_rate": 0.0008424991152009941,
- "loss": 0.0113,
+ "loss": 0.0111,
"macro_f1": 1.0,
"num_tokens": 5062371.0,
"repeat_count": 1.0,
- "routers_loss": 0.008457986637949944,
+ "routers_loss": 0.008801834657788277,
"skip_count": 2.0,
"step": 3138,
"text_loss": 0.5337086319923401
@@ -29828,13 +29828,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.04296875,
"learning_rate": 0.0008422735529643444,
- "loss": 0.0099,
+ "loss": 0.0097,
"macro_f1": 0.6666666865348816,
"num_tokens": 5065593.0,
"repeat_count": 0.0,
- "routers_loss": 0.004939604084938765,
+ "routers_loss": 0.00548676960170269,
"skip_count": 3.0,
"step": 3140,
"text_loss": 0.2561623156070709
@@ -29847,13 +29847,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031982421875,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.0008420478595678233,
- "loss": 0.0077,
+ "loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 5068271.0,
"repeat_count": 0.0,
- "routers_loss": 0.006254551466554403,
+ "routers_loss": 0.006389956455677748,
"skip_count": 0.0,
"step": 3142,
"text_loss": 0.15605193376541138
@@ -29866,13 +29866,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0693359375,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0008418220350979175,
"loss": 0.0128,
"macro_f1": 1.0,
"num_tokens": 5071358.0,
"repeat_count": 1.0,
- "routers_loss": 0.01132921315729618,
+ "routers_loss": 0.012387622147798538,
"skip_count": 2.0,
"step": 3144,
"text_loss": 0.3085838258266449
@@ -29885,13 +29885,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0008415960796411628,
"loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 5075584.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026424501556903124,
+ "routers_loss": 0.00311864772811532,
"skip_count": 1.0,
"step": 3146,
"text_loss": 0.4786977469921112
@@ -29904,13 +29904,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.103515625,
+ "grad_norm": 0.1591796875,
"learning_rate": 0.0008413699932841461,
- "loss": 0.0093,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 5078388.0,
"repeat_count": 0.0,
- "routers_loss": 0.0036633017007261515,
+ "routers_loss": 0.0030679800547659397,
"skip_count": 0.0,
"step": 3148,
"text_loss": 0.5222916603088379
@@ -29923,13 +29923,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.0390625,
"learning_rate": 0.0008411437761135039,
- "loss": 0.0112,
+ "loss": 0.011,
"macro_f1": 1.0,
"num_tokens": 5081584.0,
"repeat_count": 1.0,
- "routers_loss": 0.012777967378497124,
+ "routers_loss": 0.012907958589494228,
"skip_count": 2.0,
"step": 3150,
"text_loss": 0.5369884371757507
@@ -29942,13 +29942,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.03759765625,
"learning_rate": 0.0008409174282159232,
- "loss": 0.0074,
+ "loss": 0.0071,
"macro_f1": 0.6666666865348816,
"num_tokens": 5084450.0,
"repeat_count": 0.0,
- "routers_loss": 0.013694444671273232,
+ "routers_loss": 0.012314042076468468,
"skip_count": 2.0,
"step": 3152,
"text_loss": 0.25685277581214905
@@ -29961,13 +29961,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.041015625,
"learning_rate": 0.000840690949678141,
"loss": 0.0091,
"macro_f1": 0.6666666865348816,
"num_tokens": 5087865.0,
"repeat_count": 1.0,
- "routers_loss": 0.008412595838308334,
+ "routers_loss": 0.00899206381291151,
"skip_count": 0.0,
"step": 3154,
"text_loss": 0.1717093288898468
@@ -29980,13 +29980,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.049560546875,
+ "grad_norm": 0.06103515625,
"learning_rate": 0.0008404643405869441,
"loss": 0.0098,
"macro_f1": 0.3333333432674408,
"num_tokens": 5090857.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011648585787042975,
+ "routers_loss": 0.0013312003575265408,
"skip_count": 0.0,
"step": 3156,
"text_loss": 0.27446436882019043
@@ -29999,13 +29999,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.1630859375,
+ "grad_norm": 0.1533203125,
"learning_rate": 0.0008402376010291695,
- "loss": 0.0127,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 5093917.0,
"repeat_count": 0.0,
- "routers_loss": 0.002915408927947283,
+ "routers_loss": 0.002653320087119937,
"skip_count": 0.0,
"step": 3158,
"text_loss": 0.4237489402294159
@@ -30018,13 +30018,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0498046875,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0008400107310917045,
- "loss": 0.0096,
+ "loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 5096656.0,
"repeat_count": 0.0,
- "routers_loss": 0.013139770366251469,
+ "routers_loss": 0.012976993806660175,
"skip_count": 2.0,
"step": 3160,
"text_loss": 0.42361980676651
@@ -30037,13 +30037,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.054931640625,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.000839783730861486,
"loss": 0.0097,
"macro_f1": 0.6666666865348816,
"num_tokens": 5099582.0,
"repeat_count": 0.0,
- "routers_loss": 0.0070426687598228455,
+ "routers_loss": 0.006936746649444103,
"skip_count": 2.0,
"step": 3162,
"text_loss": 0.26656073331832886
@@ -30056,13 +30056,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0008395566004255008,
"loss": 0.0127,
"macro_f1": 0.6666666865348816,
"num_tokens": 5102908.0,
"repeat_count": 0.0,
- "routers_loss": 0.006271707359701395,
+ "routers_loss": 0.006619359832257032,
"skip_count": 1.0,
"step": 3164,
"text_loss": 0.590774416923523
@@ -30075,13 +30075,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.057373046875,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0008393293398707858,
"loss": 0.0076,
"macro_f1": 0.6666666865348816,
"num_tokens": 5105829.0,
"repeat_count": 0.0,
- "routers_loss": 0.010571467690169811,
+ "routers_loss": 0.010120268911123276,
"skip_count": 2.0,
"step": 3166,
"text_loss": 0.605930507183075
@@ -30094,13 +30094,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.0008391019492844275,
"loss": 0.0108,
"macro_f1": 0.6666666865348816,
"num_tokens": 5109850.0,
"repeat_count": 0.0,
- "routers_loss": 0.005877034272998571,
+ "routers_loss": 0.004940980114042759,
"skip_count": 2.0,
"step": 3168,
"text_loss": 0.12973152101039886
@@ -30115,11 +30115,11 @@
"f1_skip": 1.0,
"grad_norm": 0.037353515625,
"learning_rate": 0.0008388744287535627,
- "loss": 0.0093,
+ "loss": 0.0094,
"macro_f1": 0.6666666865348816,
"num_tokens": 5113353.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031909283716231585,
+ "routers_loss": 0.0031777634285390377,
"skip_count": 1.0,
"step": 3170,
"text_loss": 0.18577200174331665
@@ -30132,13 +30132,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.052734375,
"learning_rate": 0.0008386467783653775,
- "loss": 0.0104,
+ "loss": 0.0103,
"macro_f1": 0.3333333432674408,
"num_tokens": 5116421.0,
"repeat_count": 0.0,
- "routers_loss": 0.005338824819773436,
+ "routers_loss": 0.005431659985333681,
"skip_count": 0.0,
"step": 3172,
"text_loss": 0.2302747517824173
@@ -30151,13 +30151,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.046142578125,
"learning_rate": 0.000838418998207108,
- "loss": 0.0073,
+ "loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 5119457.0,
"repeat_count": 0.0,
- "routers_loss": 0.008522412739694118,
+ "routers_loss": 0.0077286697924137115,
"skip_count": 4.0,
"step": 3174,
"text_loss": 0.19606637954711914
@@ -30170,13 +30170,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.0008381910883660399,
- "loss": 0.0068,
+ "loss": 0.007,
"macro_f1": 0.3333333432674408,
"num_tokens": 5123201.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035330590326339006,
+ "routers_loss": 0.003982985392212868,
"skip_count": 0.0,
"step": 3176,
"text_loss": 0.716376006603241
@@ -30189,13 +30189,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.09375,
+ "grad_norm": 0.09423828125,
"learning_rate": 0.0008379630489295089,
- "loss": 0.0106,
+ "loss": 0.0109,
"macro_f1": 0.6666666865348816,
"num_tokens": 5126035.0,
"repeat_count": 0.0,
- "routers_loss": 0.006332095246762037,
+ "routers_loss": 0.005626026075333357,
"skip_count": 1.0,
"step": 3178,
"text_loss": 0.5144625902175903
@@ -30208,13 +30208,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0008377348799849,
"loss": 0.0086,
"macro_f1": 0.6666666865348816,
"num_tokens": 5129179.0,
"repeat_count": 0.0,
- "routers_loss": 0.017295993864536285,
+ "routers_loss": 0.015458245761692524,
"skip_count": 2.0,
"step": 3180,
"text_loss": 0.29887503385543823
@@ -30227,13 +30227,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0703125,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0008375065816196479,
- "loss": 0.0088,
+ "loss": 0.0086,
"macro_f1": 0.5492662787437439,
"num_tokens": 5132149.0,
"repeat_count": 0.0,
- "routers_loss": 0.017241213470697403,
+ "routers_loss": 0.012210468761622906,
"skip_count": 2.0,
"step": 3182,
"text_loss": 0.8981851935386658
@@ -30246,13 +30246,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0008372781539212371,
"loss": 0.0058,
"macro_f1": 0.3333333432674408,
"num_tokens": 5135287.0,
"repeat_count": 0.0,
- "routers_loss": 0.00516276340931654,
+ "routers_loss": 0.0052537876181304455,
"skip_count": 0.0,
"step": 3184,
"text_loss": 0.4245666563510895
@@ -30265,13 +30265,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.022705078125,
+ "grad_norm": 0.0240478515625,
"learning_rate": 0.0008370495969772014,
- "loss": 0.0077,
+ "loss": 0.0075,
"macro_f1": 0.6666666865348816,
"num_tokens": 5138589.0,
"repeat_count": 0.0,
- "routers_loss": 0.012517380528151989,
+ "routers_loss": 0.012873421423137188,
"skip_count": 2.0,
"step": 3186,
"text_loss": 0.40581050515174866
@@ -30284,13 +30284,13 @@
"f1_execute": 0.95652174949646,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07177734375,
+ "grad_norm": 0.07470703125,
"learning_rate": 0.0008368209108751244,
- "loss": 0.0129,
+ "loss": 0.0127,
"macro_f1": 0.6521739363670349,
"num_tokens": 5141635.0,
"repeat_count": 2.0,
- "routers_loss": 0.0810512825846672,
+ "routers_loss": 0.07720445841550827,
"skip_count": 4.0,
"step": 3188,
"text_loss": 0.3755173981189728
@@ -30303,13 +30303,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.01953125,
+ "grad_norm": 0.02197265625,
"learning_rate": 0.0008365920957026389,
- "loss": 0.0076,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 5144728.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014350182609632611,
+ "routers_loss": 0.001440995605662465,
"skip_count": 0.0,
"step": 3190,
"text_loss": 0.5067034363746643
@@ -30322,13 +30322,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.046142578125,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0008363631515474275,
- "loss": 0.0091,
+ "loss": 0.0089,
"macro_f1": 0.6538461446762085,
"num_tokens": 5147963.0,
"repeat_count": 1.0,
- "routers_loss": 0.018022676929831505,
+ "routers_loss": 0.018752984702587128,
"skip_count": 2.0,
"step": 3192,
"text_loss": 0.20224551856517792
@@ -30341,13 +30341,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042236328125,
+ "grad_norm": 0.037353515625,
"learning_rate": 0.0008361340784972217,
- "loss": 0.0092,
+ "loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 5151184.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005097229732200503,
+ "routers_loss": 0.0005360354552976787,
"skip_count": 0.0,
"step": 3194,
"text_loss": 0.4588058292865753
@@ -30360,13 +30360,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03173828125,
+ "grad_norm": 0.0390625,
"learning_rate": 0.0008359048766398031,
"loss": 0.0079,
"macro_f1": 0.6666666865348816,
"num_tokens": 5153889.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009840037673711777,
+ "routers_loss": 0.0009184491937048733,
"skip_count": 1.0,
"step": 3196,
"text_loss": 0.2980220317840576
@@ -30379,13 +30379,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02685546875,
+ "grad_norm": 0.027099609375,
"learning_rate": 0.000835675546063002,
- "loss": 0.0058,
+ "loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 5156758.0,
"repeat_count": 0.0,
- "routers_loss": 0.001269801170565188,
+ "routers_loss": 0.001252970308996737,
"skip_count": 0.0,
"step": 3198,
"text_loss": 0.6775755882263184
@@ -30398,13 +30398,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0008354460868546985,
- "loss": 0.0071,
+ "loss": 0.0072,
"macro_f1": 0.3333333432674408,
"num_tokens": 5160247.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034889329690486193,
+ "routers_loss": 0.0037315806839615107,
"skip_count": 0.0,
"step": 3200,
"text_loss": 0.35867011547088623
@@ -30417,13 +30417,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.034912109375,
"learning_rate": 0.0008352164991028217,
- "loss": 0.0091,
+ "loss": 0.0092,
"macro_f1": 0.6666666865348816,
"num_tokens": 5163456.0,
"repeat_count": 1.0,
- "routers_loss": 0.001520772697404027,
+ "routers_loss": 0.001497485558502376,
"skip_count": 0.0,
"step": 3202,
"text_loss": 0.690290093421936
@@ -30436,13 +30436,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03662109375,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.0008349867828953501,
"loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 5166139.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011800233041867614,
+ "routers_loss": 0.001051135826855898,
"skip_count": 0.0,
"step": 3204,
"text_loss": 0.3340415954589844
@@ -30455,13 +30455,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0008347569383203113,
- "loss": 0.01,
+ "loss": 0.0098,
"macro_f1": 0.3333333432674408,
"num_tokens": 5169009.0,
"repeat_count": 0.0,
- "routers_loss": 0.001043233904056251,
+ "routers_loss": 0.0010544003453105688,
"skip_count": 0.0,
"step": 3206,
"text_loss": 0.8584878444671631
@@ -30474,13 +30474,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0008345269654657823,
- "loss": 0.0084,
+ "loss": 0.0085,
"macro_f1": 1.0,
"num_tokens": 5172618.0,
"repeat_count": 1.0,
- "routers_loss": 0.007460868917405605,
+ "routers_loss": 0.007312417030334473,
"skip_count": 1.0,
"step": 3208,
"text_loss": 0.19500218331813812
@@ -30493,13 +30493,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0361328125,
+ "grad_norm": 0.03466796875,
"learning_rate": 0.0008342968644198892,
- "loss": 0.0067,
+ "loss": 0.0065,
"macro_f1": 0.3333333432674408,
"num_tokens": 5175857.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027419133111834526,
+ "routers_loss": 0.00276504410430789,
"skip_count": 0.0,
"step": 3210,
"text_loss": 0.5446314215660095
@@ -30512,13 +30512,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.037109375,
"learning_rate": 0.0008340666352708068,
- "loss": 0.0089,
+ "loss": 0.0088,
"macro_f1": 0.3333333432674408,
"num_tokens": 5178585.0,
"repeat_count": 0.0,
- "routers_loss": 0.002764733275398612,
+ "routers_loss": 0.002669303445145488,
"skip_count": 0.0,
"step": 3212,
"text_loss": 0.3687484860420227
@@ -30531,13 +30531,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0284423828125,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.0008338362781067596,
"loss": 0.0075,
"macro_f1": 0.3333333432674408,
"num_tokens": 5181777.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032288613729178905,
+ "routers_loss": 0.0031585274264216423,
"skip_count": 0.0,
"step": 3214,
"text_loss": 0.27325859665870667
@@ -30550,13 +30550,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.040283203125,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.000833605793016021,
"loss": 0.009,
"macro_f1": 0.6666666865348816,
"num_tokens": 5184312.0,
"repeat_count": 0.0,
- "routers_loss": 0.008322423323988914,
+ "routers_loss": 0.008807534351944923,
"skip_count": 2.0,
"step": 3216,
"text_loss": 0.4466548562049866
@@ -30569,13 +30569,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040283203125,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0008333751800869133,
- "loss": 0.0092,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 5187497.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034384531900286674,
+ "routers_loss": 0.003171310294419527,
"skip_count": 0.0,
"step": 3218,
"text_loss": 0.5423526763916016
@@ -30588,13 +30588,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0228271484375,
+ "grad_norm": 0.025634765625,
"learning_rate": 0.0008331444394078076,
- "loss": 0.0081,
+ "loss": 0.008,
"macro_f1": 0.6666666865348816,
"num_tokens": 5190982.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015023534651845694,
+ "routers_loss": 0.0016481258207932115,
"skip_count": 2.0,
"step": 3220,
"text_loss": 0.48984917998313904
@@ -30607,13 +30607,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.03173828125,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.000832913571067124,
- "loss": 0.0108,
+ "loss": 0.0107,
"macro_f1": 1.0,
"num_tokens": 5194044.0,
"repeat_count": 1.0,
- "routers_loss": 0.0043489462696015835,
+ "routers_loss": 0.003957313951104879,
"skip_count": 1.0,
"step": 3222,
"text_loss": 0.4533331096172333
@@ -30626,13 +30626,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.0008326825751533322,
- "loss": 0.0076,
+ "loss": 0.0075,
"macro_f1": 0.3333333432674408,
"num_tokens": 5197092.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012065734481438994,
+ "routers_loss": 0.0016904744552448392,
"skip_count": 0.0,
"step": 3224,
"text_loss": 0.5538802742958069
@@ -30645,13 +30645,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06005859375,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0008324514517549501,
- "loss": 0.0084,
+ "loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 5199941.0,
"repeat_count": 0.0,
- "routers_loss": 0.006849290337413549,
+ "routers_loss": 0.005608258303254843,
"skip_count": 1.0,
"step": 3226,
"text_loss": 0.416242778301239
@@ -30664,32 +30664,32 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.03857421875,
+ "grad_norm": 0.040771484375,
"learning_rate": 0.0008322202009605444,
- "loss": 0.0073,
+ "loss": 0.0072,
"macro_f1": 0.8823530077934265,
"num_tokens": 5202618.0,
"repeat_count": 1.0,
- "routers_loss": 0.020665202289819717,
+ "routers_loss": 0.020965175703167915,
"skip_count": 2.0,
"step": 3228,
"text_loss": 0.17496295273303986
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 1.0,
- "avg_layers": 23.0,
+ "avg_layers": 24.0,
"epoch": 15.164367478720282,
- "f1_execute": 0.9777777791023254,
- "f1_repeat": 0.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 1.0,
"grad_norm": 0.04052734375,
"learning_rate": 0.0008319888228587311,
"loss": 0.0063,
- "macro_f1": 0.6592592597007751,
+ "macro_f1": 1.0,
"num_tokens": 5206414.0,
"repeat_count": 1.0,
- "routers_loss": 0.026284674182534218,
+ "routers_loss": 0.021259209141135216,
"skip_count": 5.0,
"step": 3230,
"text_loss": 0.22471418976783752
@@ -30702,13 +30702,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03076171875,
+ "grad_norm": 0.029541015625,
"learning_rate": 0.0008317573175381745,
"loss": 0.0115,
"macro_f1": 0.3333333432674408,
"num_tokens": 5209768.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018494570394977927,
+ "routers_loss": 0.0018647604156285524,
"skip_count": 0.0,
"step": 3232,
"text_loss": 0.4415269196033478
@@ -30721,13 +30721,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.027099609375,
+ "grad_norm": 0.0283203125,
"learning_rate": 0.0008315256850875881,
- "loss": 0.0061,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 5213257.0,
"repeat_count": 0.0,
- "routers_loss": 0.002610588213428855,
+ "routers_loss": 0.002345515415072441,
"skip_count": 0.0,
"step": 3234,
"text_loss": 0.347247838973999
@@ -30740,13 +30740,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048828125,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0008312939255957336,
- "loss": 0.0084,
+ "loss": 0.0086,
"macro_f1": 0.6666666865348816,
"num_tokens": 5215800.0,
"repeat_count": 0.0,
- "routers_loss": 0.007061914075165987,
+ "routers_loss": 0.007112892810255289,
"skip_count": 3.0,
"step": 3236,
"text_loss": 0.31091734766960144
@@ -30759,13 +30759,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.033203125,
"learning_rate": 0.0008310620391514219,
- "loss": 0.0083,
+ "loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 5219205.0,
"repeat_count": 0.0,
- "routers_loss": 0.004094691481441259,
+ "routers_loss": 0.00432228296995163,
"skip_count": 0.0,
"step": 3238,
"text_loss": 0.3421775996685028
@@ -30778,13 +30778,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.024658203125,
+ "grad_norm": 0.027099609375,
"learning_rate": 0.0008308300258435124,
"loss": 0.0085,
"macro_f1": 0.6666666865348816,
"num_tokens": 5222422.0,
"repeat_count": 0.0,
- "routers_loss": 0.007662596181035042,
+ "routers_loss": 0.0076514314860105515,
"skip_count": 2.0,
"step": 3240,
"text_loss": 0.22378318011760712
@@ -30797,13 +30797,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0264892578125,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0008305978857609128,
- "loss": 0.0073,
+ "loss": 0.0072,
"macro_f1": 0.3333333432674408,
"num_tokens": 5225625.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008108283509500325,
+ "routers_loss": 0.0007617069641128182,
"skip_count": 0.0,
"step": 3242,
"text_loss": 0.5880323648452759
@@ -30816,13 +30816,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0281982421875,
+ "grad_norm": 0.02734375,
"learning_rate": 0.0008303656189925799,
- "loss": 0.0084,
+ "loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 5229113.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018137742299586535,
+ "routers_loss": 0.0017418119823560119,
"skip_count": 0.0,
"step": 3244,
"text_loss": 0.3302813768386841
@@ -30835,13 +30835,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.042724609375,
"learning_rate": 0.0008301332256275183,
"loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 5232061.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025301240384578705,
+ "routers_loss": 0.0026667986530810595,
"skip_count": 0.0,
"step": 3246,
"text_loss": 0.5679706335067749
@@ -30854,13 +30854,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.052001953125,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0008299007057547821,
- "loss": 0.0101,
+ "loss": 0.0106,
"macro_f1": 1.0,
"num_tokens": 5235279.0,
"repeat_count": 1.0,
- "routers_loss": 0.011231686919927597,
+ "routers_loss": 0.011016624979674816,
"skip_count": 2.0,
"step": 3248,
"text_loss": 0.5081504583358765
@@ -30873,13 +30873,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.033203125,
"learning_rate": 0.0008296680594634731,
- "loss": 0.0074,
+ "loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 5239655.0,
"repeat_count": 1.0,
- "routers_loss": 0.005881415214389563,
+ "routers_loss": 0.005492044147104025,
"skip_count": 0.0,
"step": 3250,
"text_loss": 0.14675180613994598
@@ -30892,13 +30892,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0277099609375,
+ "grad_norm": 0.0269775390625,
"learning_rate": 0.0008294352868427418,
- "loss": 0.0056,
+ "loss": 0.0055,
"macro_f1": 0.6666666865348816,
"num_tokens": 5243579.0,
"repeat_count": 0.0,
- "routers_loss": 0.004495301283895969,
+ "routers_loss": 0.00404445780441165,
"skip_count": 1.0,
"step": 3252,
"text_loss": 0.4201085865497589
@@ -30911,13 +30911,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0208740234375,
+ "grad_norm": 0.0242919921875,
"learning_rate": 0.0008292023879817871,
- "loss": 0.0052,
+ "loss": 0.0053,
"macro_f1": 0.6666666865348816,
"num_tokens": 5247059.0,
"repeat_count": 0.0,
- "routers_loss": 0.007394428364932537,
+ "routers_loss": 0.006886140909045935,
"skip_count": 1.0,
"step": 3254,
"text_loss": 0.2289208322763443
@@ -30930,32 +30930,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06201171875,
+ "grad_norm": 0.057861328125,
"learning_rate": 0.0008289693629698564,
- "loss": 0.0077,
+ "loss": 0.0073,
"macro_f1": 0.3333333432674408,
"num_tokens": 5249940.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006736332434229553,
+ "routers_loss": 0.0005736657767556608,
"skip_count": 0.0,
"step": 3256,
"text_loss": 0.5670450925827026
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 15.295861461696507,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.0224609375,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0277099609375,
"learning_rate": 0.0008287362118962452,
- "loss": 0.0062,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.006,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 5253580.0,
"repeat_count": 0.0,
- "routers_loss": 0.009847268462181091,
+ "routers_loss": 0.011349895037710667,
"skip_count": 1.0,
"step": 3258,
"text_loss": 0.5042323470115662
@@ -30968,13 +30968,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.026611328125,
+ "grad_norm": 0.0267333984375,
"learning_rate": 0.0008285029348502973,
"loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 5257080.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013670918997377157,
+ "routers_loss": 0.0013626761501654983,
"skip_count": 0.0,
"step": 3260,
"text_loss": 0.3227672874927521
@@ -30987,13 +30987,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02587890625,
+ "grad_norm": 0.0245361328125,
"learning_rate": 0.0008282695319214053,
"loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 5259951.0,
"repeat_count": 0.0,
- "routers_loss": 0.004696785472333431,
+ "routers_loss": 0.00471635302528739,
"skip_count": 0.0,
"step": 3262,
"text_loss": 0.20773714780807495
@@ -31006,13 +31006,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0008280360031990093,
- "loss": 0.0108,
+ "loss": 0.0107,
"macro_f1": 0.6666666865348816,
"num_tokens": 5263314.0,
"repeat_count": 0.0,
- "routers_loss": 0.010588239878416061,
+ "routers_loss": 0.010472415015101433,
"skip_count": 2.0,
"step": 3264,
"text_loss": 0.34397366642951965
@@ -31025,13 +31025,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.000827802348772598,
- "loss": 0.0084,
+ "loss": 0.0083,
"macro_f1": 0.3333333432674408,
"num_tokens": 5267358.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010326795745640993,
+ "routers_loss": 0.0007814752752892673,
"skip_count": 0.0,
"step": 3266,
"text_loss": 0.747342586517334
@@ -31044,13 +31044,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.049560546875,
+ "grad_norm": 0.0498046875,
"learning_rate": 0.0008275685687317084,
- "loss": 0.0087,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 5270400.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010199147509410977,
+ "routers_loss": 0.000902949133887887,
"skip_count": 0.0,
"step": 3268,
"text_loss": 0.43782034516334534
@@ -31063,13 +31063,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03173828125,
+ "grad_norm": 0.03564453125,
"learning_rate": 0.0008273346631659252,
- "loss": 0.0069,
+ "loss": 0.007,
"macro_f1": 0.3333333432674408,
"num_tokens": 5273147.0,
"repeat_count": 0.0,
- "routers_loss": 0.00046372212818823755,
+ "routers_loss": 0.00043462219764478505,
"skip_count": 0.0,
"step": 3270,
"text_loss": 0.6358205080032349
@@ -31082,13 +31082,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0380859375,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0008271006321648816,
- "loss": 0.0088,
+ "loss": 0.0085,
"macro_f1": 0.3333333432674408,
"num_tokens": 5277638.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022951713763177395,
+ "routers_loss": 0.002211218234151602,
"skip_count": 0.0,
"step": 3272,
"text_loss": 0.20220105350017548
@@ -31101,13 +31101,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.0008268664758182589,
- "loss": 0.0077,
+ "loss": 0.0075,
"macro_f1": 0.6666666865348816,
"num_tokens": 5280638.0,
"repeat_count": 1.0,
- "routers_loss": 0.008325734175741673,
+ "routers_loss": 0.010536720044910908,
"skip_count": 0.0,
"step": 3274,
"text_loss": 0.7579061388969421
@@ -31120,32 +31120,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0008266321942157859,
- "loss": 0.007,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 5283847.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017014809418469667,
+ "routers_loss": 0.0017158017726615071,
"skip_count": 0.0,
"step": 3276,
"text_loss": 0.669302761554718
},
{
- "acc_repeat": 1.0,
+ "acc_repeat": 0.800000011920929,
"acc_skip": 1.0,
- "avg_layers": 29.0,
+ "avg_layers": 28.0,
"epoch": 15.389785735250953,
- "f1_execute": 1.0,
- "f1_repeat": 1.0,
+ "f1_execute": 0.9743589162826538,
+ "f1_repeat": 0.888888955116272,
"f1_skip": 1.0,
- "grad_norm": 0.06787109375,
+ "grad_norm": 0.06005859375,
"learning_rate": 0.0008263977874472399,
- "loss": 0.0089,
- "macro_f1": 1.0,
+ "loss": 0.0088,
+ "macro_f1": 0.9544159770011902,
"num_tokens": 5286627.0,
"repeat_count": 5.0,
- "routers_loss": 0.009527196176350117,
+ "routers_loss": 0.011220700107514858,
"skip_count": 4.0,
"step": 3278,
"text_loss": 0.8703984022140503
@@ -31158,13 +31158,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.060546875,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0008261632556024461,
- "loss": 0.01,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 5289766.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025269081816077232,
+ "routers_loss": 0.0020442772656679153,
"skip_count": 0.0,
"step": 3280,
"text_loss": 0.5009346008300781
@@ -31177,13 +31177,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.11474609375,
+ "grad_norm": 0.10107421875,
"learning_rate": 0.0008259285987712774,
- "loss": 0.0108,
+ "loss": 0.0106,
"macro_f1": 0.3333333432674408,
"num_tokens": 5293010.0,
"repeat_count": 0.0,
- "routers_loss": 0.005710822530090809,
+ "routers_loss": 0.005645765457302332,
"skip_count": 0.0,
"step": 3282,
"text_loss": 0.2546011209487915
@@ -31196,13 +31196,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.042236328125,
"learning_rate": 0.0008256938170436549,
- "loss": 0.0114,
+ "loss": 0.0111,
"macro_f1": 0.6666666865348816,
"num_tokens": 5296732.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028946297243237495,
+ "routers_loss": 0.0027385836001485586,
"skip_count": 2.0,
"step": 3284,
"text_loss": 0.5244000554084778
@@ -31217,11 +31217,11 @@
"f1_skip": 1.0,
"grad_norm": 0.0296630859375,
"learning_rate": 0.0008254589105095473,
- "loss": 0.0059,
+ "loss": 0.0061,
"macro_f1": 1.0,
"num_tokens": 5299926.0,
"repeat_count": 1.0,
- "routers_loss": 0.007981270551681519,
+ "routers_loss": 0.007451715879142284,
"skip_count": 1.0,
"step": 3286,
"text_loss": 0.28979742527008057
@@ -31234,13 +31234,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0238037109375,
+ "grad_norm": 0.0218505859375,
"learning_rate": 0.0008252238792589711,
- "loss": 0.0085,
+ "loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 5303006.0,
"repeat_count": 0.0,
- "routers_loss": 0.005524218548089266,
+ "routers_loss": 0.004805843345820904,
"skip_count": 2.0,
"step": 3288,
"text_loss": 0.5131978392601013
@@ -31253,13 +31253,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03857421875,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.000824988723381991,
- "loss": 0.0092,
+ "loss": 0.0091,
"macro_f1": 0.3272727429866791,
"num_tokens": 5306953.0,
"repeat_count": 0.0,
- "routers_loss": 0.01160401664674282,
+ "routers_loss": 0.010639613494277,
"skip_count": 1.0,
"step": 3290,
"text_loss": 0.4901447296142578
@@ -31272,13 +31272,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.0008247534429687191,
- "loss": 0.0069,
+ "loss": 0.007,
"macro_f1": 0.5492662787437439,
"num_tokens": 5310516.0,
"repeat_count": 0.0,
- "routers_loss": 0.014068983495235443,
+ "routers_loss": 0.013625577092170715,
"skip_count": 2.0,
"step": 3292,
"text_loss": 0.2124534696340561
@@ -31291,13 +31291,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0008245180381093152,
- "loss": 0.0116,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 5313959.0,
"repeat_count": 0.0,
- "routers_loss": 0.00520911393687129,
+ "routers_loss": 0.004958513658493757,
"skip_count": 1.0,
"step": 3294,
"text_loss": 0.46682238578796387
@@ -31310,13 +31310,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0008242825088939867,
- "loss": 0.0085,
+ "loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 5316609.0,
"repeat_count": 0.0,
- "routers_loss": 0.004490343388170004,
+ "routers_loss": 0.003962756600230932,
"skip_count": 0.0,
"step": 3296,
"text_loss": 0.7010108232498169
@@ -31329,13 +31329,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0008240468554129892,
- "loss": 0.0078,
+ "loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 5319638.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006864524912089109,
+ "routers_loss": 0.0006996620795689523,
"skip_count": 0.0,
"step": 3298,
"text_loss": 0.4966355860233307
@@ -31348,13 +31348,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0008238110777566255,
"loss": 0.0101,
"macro_f1": 0.3333333432674408,
"num_tokens": 5323019.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017158432165160775,
+ "routers_loss": 0.0016031896229833364,
"skip_count": 0.0,
"step": 3300,
"text_loss": 0.38668957352638245
@@ -31367,13 +31367,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.0303955078125,
"learning_rate": 0.0008235751760152459,
- "loss": 0.0064,
+ "loss": 0.0063,
"macro_f1": 1.0,
"num_tokens": 5326099.0,
"repeat_count": 2.0,
- "routers_loss": 0.0037166383117437363,
+ "routers_loss": 0.00344281829893589,
"skip_count": 2.0,
"step": 3302,
"text_loss": 0.5330720543861389
@@ -31386,13 +31386,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05126953125,
+ "grad_norm": 0.06005859375,
"learning_rate": 0.0008233391502792484,
- "loss": 0.0073,
+ "loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 5328993.0,
"repeat_count": 0.0,
- "routers_loss": 0.008341175504028797,
+ "routers_loss": 0.007886730134487152,
"skip_count": 1.0,
"step": 3304,
"text_loss": 0.5470269322395325
@@ -31405,13 +31405,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03271484375,
+ "grad_norm": 0.034423828125,
"learning_rate": 0.0008231030006390786,
"loss": 0.0067,
"macro_f1": 0.6666666865348816,
"num_tokens": 5331554.0,
"repeat_count": 0.0,
- "routers_loss": 0.008380163460969925,
+ "routers_loss": 0.008180000819265842,
"skip_count": 1.0,
"step": 3306,
"text_loss": 0.4023340344429016
@@ -31424,13 +31424,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0242919921875,
+ "grad_norm": 0.02587890625,
"learning_rate": 0.0008228667271852294,
- "loss": 0.0062,
+ "loss": 0.0059,
"macro_f1": 0.3333333432674408,
"num_tokens": 5335712.0,
"repeat_count": 0.0,
- "routers_loss": 0.00030099941068328917,
+ "routers_loss": 0.0002942821884062141,
"skip_count": 0.0,
"step": 3308,
"text_loss": 0.5306711792945862
@@ -31443,13 +31443,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0615234375,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0008226303300082414,
- "loss": 0.0095,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 5338701.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006003376329317689,
+ "routers_loss": 0.0006134595023468137,
"skip_count": 0.0,
"step": 3310,
"text_loss": 0.5906263589859009
@@ -31462,13 +31462,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.02880859375,
"learning_rate": 0.0008223938091987022,
- "loss": 0.0073,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 5342274.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017984671285375953,
+ "routers_loss": 0.0016656654188409448,
"skip_count": 0.0,
"step": 3312,
"text_loss": 0.5201764106750488
@@ -31481,13 +31481,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.055419921875,
+ "grad_norm": 0.052001953125,
"learning_rate": 0.0008221571648472472,
- "loss": 0.0066,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 5345185.0,
"repeat_count": 0.0,
- "routers_loss": 0.003994898404926062,
+ "routers_loss": 0.0038612703792750835,
"skip_count": 0.0,
"step": 3314,
"text_loss": 0.36633720993995667
@@ -31500,13 +31500,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0008219203970445589,
"loss": 0.011,
"macro_f1": 0.3272727429866791,
"num_tokens": 5348804.0,
"repeat_count": 0.0,
- "routers_loss": 0.009415820240974426,
+ "routers_loss": 0.009782899171113968,
"skip_count": 1.0,
"step": 3316,
"text_loss": 0.3117460012435913
@@ -31519,13 +31519,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.053955078125,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.0008216835058813672,
- "loss": 0.0091,
+ "loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 5351896.0,
"repeat_count": 0.0,
- "routers_loss": 0.006483082659542561,
+ "routers_loss": 0.007713229861110449,
"skip_count": 0.0,
"step": 3318,
"text_loss": 0.253496378660202
@@ -31538,13 +31538,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02880859375,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0008214464914484492,
"loss": 0.0062,
"macro_f1": 0.6666666865348816,
"num_tokens": 5355058.0,
"repeat_count": 0.0,
- "routers_loss": 0.006275791209191084,
+ "routers_loss": 0.006227815989404917,
"skip_count": 2.0,
"step": 3320,
"text_loss": 0.32693132758140564
@@ -31557,13 +31557,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0008212093538366292,
"loss": 0.0099,
"macro_f1": 0.3333333432674408,
"num_tokens": 5358365.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027182933408766985,
+ "routers_loss": 0.002601418411359191,
"skip_count": 0.0,
"step": 3322,
"text_loss": 0.40394455194473267
@@ -31576,13 +31576,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.000820972093136779,
"loss": 0.0079,
"macro_f1": 0.6666666865348816,
"num_tokens": 5360981.0,
"repeat_count": 0.0,
- "routers_loss": 0.005600054748356342,
+ "routers_loss": 0.005545300897210836,
"skip_count": 3.0,
"step": 3324,
"text_loss": 0.6758295893669128
@@ -31595,13 +31595,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.05078125,
"learning_rate": 0.0008207347094398172,
"loss": 0.0096,
"macro_f1": 0.6666666865348816,
"num_tokens": 5364018.0,
"repeat_count": 1.0,
- "routers_loss": 0.0020965971052646637,
+ "routers_loss": 0.001924700103700161,
"skip_count": 0.0,
"step": 3326,
"text_loss": 0.5196860432624817
@@ -31614,13 +31614,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0311279296875,
+ "grad_norm": 0.0299072265625,
"learning_rate": 0.0008204972028367097,
- "loss": 0.006,
+ "loss": 0.0057,
"macro_f1": 0.6666666865348816,
"num_tokens": 5366986.0,
"repeat_count": 0.0,
- "routers_loss": 0.011729889549314976,
+ "routers_loss": 0.012254828587174416,
"skip_count": 1.0,
"step": 3328,
"text_loss": 0.24661913514137268
@@ -31633,13 +31633,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.0008202595734184694,
"loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 5371463.0,
"repeat_count": 0.0,
- "routers_loss": 0.004913534037768841,
+ "routers_loss": 0.005094083491712809,
"skip_count": 0.0,
"step": 3330,
"text_loss": 0.2525769770145416
@@ -31652,13 +31652,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.0008200218212761566,
- "loss": 0.0111,
+ "loss": 0.0108,
"macro_f1": 0.6666666865348816,
"num_tokens": 5374823.0,
"repeat_count": 1.0,
- "routers_loss": 0.0028079606126993895,
+ "routers_loss": 0.0025883198250085115,
"skip_count": 0.0,
"step": 3332,
"text_loss": 0.21849912405014038
@@ -31671,13 +31671,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031982421875,
+ "grad_norm": 0.030029296875,
"learning_rate": 0.000819783946500878,
"loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 5377640.0,
"repeat_count": 0.0,
- "routers_loss": 0.008404970169067383,
+ "routers_loss": 0.008240507915616035,
"skip_count": 0.0,
"step": 3334,
"text_loss": 0.2662734091281891
@@ -31690,13 +31690,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.048583984375,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.000819545949183788,
- "loss": 0.0101,
+ "loss": 0.01,
"macro_f1": 0.5934640765190125,
"num_tokens": 5380593.0,
"repeat_count": 0.0,
- "routers_loss": 0.040179044008255005,
+ "routers_loss": 0.038378193974494934,
"skip_count": 3.0,
"step": 3336,
"text_loss": 0.2431795746088028
@@ -31709,13 +31709,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.0008193078294160874,
- "loss": 0.0096,
+ "loss": 0.0097,
"macro_f1": 1.0,
"num_tokens": 5384487.0,
"repeat_count": 1.0,
- "routers_loss": 0.005122583359479904,
+ "routers_loss": 0.005926199723035097,
"skip_count": 1.0,
"step": 3338,
"text_loss": 0.5663705468177795
@@ -31728,13 +31728,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.0008190695872890242,
- "loss": 0.0056,
+ "loss": 0.0055,
"macro_f1": 0.6666666865348816,
"num_tokens": 5387511.0,
"repeat_count": 0.0,
- "routers_loss": 0.012232085689902306,
+ "routers_loss": 0.010842559859156609,
"skip_count": 2.0,
"step": 3340,
"text_loss": 0.11517292261123657
@@ -31747,13 +31747,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.029296875,
+ "grad_norm": 0.0283203125,
"learning_rate": 0.0008188312228938933,
- "loss": 0.009,
+ "loss": 0.0088,
"macro_f1": 0.3333333432674408,
"num_tokens": 5390698.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011168667115271091,
+ "routers_loss": 0.001304097007960081,
"skip_count": 0.0,
"step": 3342,
"text_loss": 0.4827076196670532
@@ -31766,13 +31766,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.03515625,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0008185927363220363,
- "loss": 0.0088,
+ "loss": 0.0087,
"macro_f1": 0.6666666865348816,
"num_tokens": 5393778.0,
"repeat_count": 1.0,
- "routers_loss": 0.005202370695769787,
+ "routers_loss": 0.005354117136448622,
"skip_count": 0.0,
"step": 3344,
"text_loss": 0.44467049837112427
@@ -31785,13 +31785,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.040771484375,
"learning_rate": 0.0008183541276648418,
- "loss": 0.0081,
+ "loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 5396925.0,
"repeat_count": 0.0,
- "routers_loss": 0.005000839475542307,
+ "routers_loss": 0.004800073802471161,
"skip_count": 2.0,
"step": 3346,
"text_loss": 0.2032834142446518
@@ -31804,13 +31804,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025634765625,
+ "grad_norm": 0.027587890625,
"learning_rate": 0.0008181153970137449,
- "loss": 0.0059,
+ "loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 5400522.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020684092305600643,
+ "routers_loss": 0.0021674633026123047,
"skip_count": 0.0,
"step": 3348,
"text_loss": 0.4507528841495514
@@ -31823,13 +31823,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0008178765444602278,
"loss": 0.0117,
"macro_f1": 0.8820862174034119,
"num_tokens": 5403526.0,
"repeat_count": 2.0,
- "routers_loss": 0.040753237903118134,
+ "routers_loss": 0.04263930395245552,
"skip_count": 2.0,
"step": 3350,
"text_loss": 0.3606615960597992
@@ -31842,13 +31842,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0008176375700958194,
- "loss": 0.0089,
+ "loss": 0.0087,
"macro_f1": 0.6666666865348816,
"num_tokens": 5407127.0,
"repeat_count": 1.0,
- "routers_loss": 0.007767915725708008,
+ "routers_loss": 0.006953123956918716,
"skip_count": 0.0,
"step": 3352,
"text_loss": 0.2290353775024414
@@ -31861,13 +31861,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0008173984740120948,
"loss": 0.0055,
"macro_f1": 0.3333333432674408,
"num_tokens": 5410829.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016073459992185235,
+ "routers_loss": 0.0014363783411681652,
"skip_count": 0.0,
"step": 3354,
"text_loss": 0.4220392405986786
@@ -31880,13 +31880,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02880859375,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0008171592563006762,
- "loss": 0.0078,
+ "loss": 0.0079,
"macro_f1": 0.6666666865348816,
"num_tokens": 5414152.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016132282325997949,
+ "routers_loss": 0.00202389364130795,
"skip_count": 1.0,
"step": 3356,
"text_loss": 0.37729766964912415
@@ -31899,13 +31899,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0008169199170532323,
- "loss": 0.007,
+ "loss": 0.0067,
"macro_f1": 0.6666666865348816,
"num_tokens": 5417312.0,
"repeat_count": 0.0,
- "routers_loss": 0.007077203597873449,
+ "routers_loss": 0.006253739818930626,
"skip_count": 2.0,
"step": 3358,
"text_loss": 0.1304289996623993
@@ -31918,13 +31918,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.07568359375,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0008166804563614785,
- "loss": 0.0088,
+ "loss": 0.0084,
"macro_f1": 1.0,
"num_tokens": 5421227.0,
"repeat_count": 2.0,
- "routers_loss": 0.01628093235194683,
+ "routers_loss": 0.01622140221297741,
"skip_count": 2.0,
"step": 3360,
"text_loss": 0.298664391040802
@@ -31937,13 +31937,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0250244140625,
+ "grad_norm": 0.024169921875,
"learning_rate": 0.0008164408743171763,
- "loss": 0.0064,
+ "loss": 0.0062,
"macro_f1": 1.0,
"num_tokens": 5424646.0,
"repeat_count": 1.0,
- "routers_loss": 0.003795142285525799,
+ "routers_loss": 0.0037176944315433502,
"skip_count": 2.0,
"step": 3362,
"text_loss": 0.12147632241249084
@@ -31956,13 +31956,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037841796875,
+ "grad_norm": 0.046630859375,
"learning_rate": 0.0008162011710121339,
"loss": 0.0076,
"macro_f1": 0.6666666865348816,
"num_tokens": 5427897.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024164009373635054,
+ "routers_loss": 0.0020403533708304167,
"skip_count": 1.0,
"step": 3364,
"text_loss": 0.2656533420085907
@@ -31975,32 +31975,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0008159613465382066,
- "loss": 0.0071,
+ "loss": 0.007,
"macro_f1": 0.3333333432674408,
"num_tokens": 5430474.0,
"repeat_count": 0.0,
- "routers_loss": 0.002314126119017601,
+ "routers_loss": 0.0018634048756211996,
"skip_count": 0.0,
"step": 3366,
"text_loss": 0.9133086204528809
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 15.812444966245964,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.058837890625,
+ "f1_skip": 1.0,
+ "grad_norm": 0.0634765625,
"learning_rate": 0.0008157214009872951,
- "loss": 0.008,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 5433113.0,
"repeat_count": 0.0,
- "routers_loss": 0.014630996622145176,
+ "routers_loss": 0.012944488786160946,
"skip_count": 2.0,
"step": 3368,
"text_loss": 0.24352453649044037
@@ -32013,13 +32013,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04833984375,
+ "grad_norm": 0.05712890625,
"learning_rate": 0.0008154813344513472,
- "loss": 0.0141,
+ "loss": 0.0143,
"macro_f1": 0.6666666865348816,
"num_tokens": 5436259.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023453824687749147,
+ "routers_loss": 0.002347963862121105,
"skip_count": 2.0,
"step": 3370,
"text_loss": 0.7601244449615479
@@ -32032,13 +32032,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0291748046875,
+ "grad_norm": 0.031494140625,
"learning_rate": 0.0008152411470223568,
- "loss": 0.0078,
+ "loss": 0.0077,
"macro_f1": 0.3333333432674408,
"num_tokens": 5439126.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015595925506204367,
+ "routers_loss": 0.0016609140438959002,
"skip_count": 0.0,
"step": 3372,
"text_loss": 0.5551947355270386
@@ -32051,13 +32051,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.04345703125,
"learning_rate": 0.0008150008387923643,
- "loss": 0.0067,
+ "loss": 0.0064,
"macro_f1": 0.3333333432674408,
"num_tokens": 5442739.0,
"repeat_count": 0.0,
- "routers_loss": 0.008187411352992058,
+ "routers_loss": 0.008321396075189114,
"skip_count": 0.0,
"step": 3374,
"text_loss": 0.25028282403945923
@@ -32070,13 +32070,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.10302734375,
+ "grad_norm": 0.08544921875,
"learning_rate": 0.000814760409853456,
- "loss": 0.0109,
+ "loss": 0.0105,
"macro_f1": 1.0,
"num_tokens": 5445247.0,
"repeat_count": 2.0,
- "routers_loss": 0.009705786593258381,
+ "routers_loss": 0.009738070890307426,
"skip_count": 1.0,
"step": 3376,
"text_loss": 0.37271201610565186
@@ -32089,13 +32089,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0439453125,
+ "grad_norm": 0.042236328125,
"learning_rate": 0.0008145198602977651,
- "loss": 0.0084,
+ "loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 5449044.0,
"repeat_count": 0.0,
- "routers_loss": 0.003062802366912365,
+ "routers_loss": 0.0028421466704458,
"skip_count": 0.0,
"step": 3378,
"text_loss": 0.1458655595779419
@@ -32108,13 +32108,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.095703125,
+ "grad_norm": 0.11474609375,
"learning_rate": 0.0008142791902174701,
- "loss": 0.008,
+ "loss": 0.0081,
"macro_f1": 0.3333333432674408,
"num_tokens": 5453063.0,
"repeat_count": 0.0,
- "routers_loss": 0.001539172139018774,
+ "routers_loss": 0.0015170135302469134,
"skip_count": 0.0,
"step": 3380,
"text_loss": 0.5548722743988037
@@ -32127,13 +32127,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0008140383997047966,
- "loss": 0.0082,
+ "loss": 0.008,
"macro_f1": 0.6666666865348816,
"num_tokens": 5455814.0,
"repeat_count": 0.0,
- "routers_loss": 0.002227923832833767,
+ "routers_loss": 0.0022444510832428932,
"skip_count": 1.0,
"step": 3382,
"text_loss": 0.8034513592720032
@@ -32146,13 +32146,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.000813797488852016,
- "loss": 0.0063,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 5459392.0,
"repeat_count": 0.0,
- "routers_loss": 0.0003921810712199658,
+ "routers_loss": 0.00038578867679461837,
"skip_count": 0.0,
"step": 3384,
"text_loss": 0.6940088868141174
@@ -32165,13 +32165,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0517578125,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0008135564577514458,
- "loss": 0.0116,
+ "loss": 0.011,
"macro_f1": 0.3333333432674408,
"num_tokens": 5462413.0,
"repeat_count": 0.0,
- "routers_loss": 0.001971066929399967,
+ "routers_loss": 0.0019727381877601147,
"skip_count": 0.0,
"step": 3386,
"text_loss": 0.5124650597572327
@@ -32184,13 +32184,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0869140625,
+ "grad_norm": 0.099609375,
"learning_rate": 0.0008133153064954495,
- "loss": 0.0108,
+ "loss": 0.0107,
"macro_f1": 0.3333333432674408,
"num_tokens": 5465552.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018206594977527857,
+ "routers_loss": 0.0019896167796105146,
"skip_count": 0.0,
"step": 3388,
"text_loss": 0.4292517900466919
@@ -32203,13 +32203,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0008130740351764367,
- "loss": 0.0068,
+ "loss": 0.007,
"macro_f1": 1.0,
"num_tokens": 5468573.0,
"repeat_count": 1.0,
- "routers_loss": 0.003323496552184224,
+ "routers_loss": 0.0030118159484118223,
"skip_count": 1.0,
"step": 3390,
"text_loss": 0.48903173208236694
@@ -32222,13 +32222,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.024658203125,
+ "grad_norm": 0.0216064453125,
"learning_rate": 0.000812832643886863,
- "loss": 0.0058,
+ "loss": 0.0057,
"macro_f1": 0.6666666865348816,
"num_tokens": 5471547.0,
"repeat_count": 0.0,
- "routers_loss": 0.006201856769621372,
+ "routers_loss": 0.005084246397018433,
"skip_count": 2.0,
"step": 3392,
"text_loss": 0.35789889097213745
@@ -32241,13 +32241,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.0390625,
"learning_rate": 0.0008125911327192299,
- "loss": 0.009,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 5474331.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009058464202098548,
+ "routers_loss": 0.0008874498889781535,
"skip_count": 0.0,
"step": 3394,
"text_loss": 0.6267408728599548
@@ -32260,13 +32260,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0008123495017660851,
- "loss": 0.0059,
+ "loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 5477633.0,
"repeat_count": 0.0,
- "routers_loss": 0.00202162005007267,
+ "routers_loss": 0.001794386887922883,
"skip_count": 0.0,
"step": 3396,
"text_loss": 0.3701885938644409
@@ -32279,13 +32279,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04296875,
+ "grad_norm": 0.042724609375,
"learning_rate": 0.0008121077511200221,
"loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 5481277.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022049983963370323,
+ "routers_loss": 0.002140481723472476,
"skip_count": 0.0,
"step": 3398,
"text_loss": 0.6362857818603516
@@ -32298,13 +32298,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05322265625,
+ "grad_norm": 0.0556640625,
"learning_rate": 0.00081186588087368,
- "loss": 0.0115,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 5484237.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008255304419435561,
+ "routers_loss": 0.000867189432028681,
"skip_count": 0.0,
"step": 3400,
"text_loss": 1.0847382545471191
@@ -32317,13 +32317,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.0296630859375,
"learning_rate": 0.0008116238911197442,
- "loss": 0.0067,
+ "loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 5487423.0,
"repeat_count": 0.0,
- "routers_loss": 0.0029532560147345066,
+ "routers_loss": 0.0029817656613886356,
"skip_count": 0.0,
"step": 3402,
"text_loss": 0.3813740313053131
@@ -32336,13 +32336,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.0008113817819509454,
"loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 5490155.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038054194301366806,
+ "routers_loss": 0.0035141287371516228,
"skip_count": 0.0,
"step": 3404,
"text_loss": 0.2113083451986313
@@ -32355,13 +32355,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042236328125,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0008111395534600603,
"loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 5493415.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034561967477202415,
+ "routers_loss": 0.003317659953609109,
"skip_count": 0.0,
"step": 3406,
"text_loss": 0.5869330167770386
@@ -32374,13 +32374,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.052001953125,
"learning_rate": 0.0008108972057399114,
- "loss": 0.0131,
+ "loss": 0.0123,
"macro_f1": 0.6666666865348816,
"num_tokens": 5496032.0,
"repeat_count": 0.0,
- "routers_loss": 0.0036799898371100426,
+ "routers_loss": 0.003833734430372715,
"skip_count": 2.0,
"step": 3408,
"text_loss": 0.2938928008079529
@@ -32393,13 +32393,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08740234375,
+ "grad_norm": 0.11328125,
"learning_rate": 0.0008106547388833669,
- "loss": 0.006,
+ "loss": 0.0061,
"macro_f1": 0.6666666865348816,
"num_tokens": 5498890.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026391225401312113,
+ "routers_loss": 0.002622978063300252,
"skip_count": 1.0,
"step": 3410,
"text_loss": 0.3130980432033539
@@ -32412,13 +32412,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.03564453125,
"learning_rate": 0.0008104121529833402,
"loss": 0.0062,
"macro_f1": 0.6666666865348816,
"num_tokens": 5502010.0,
"repeat_count": 1.0,
- "routers_loss": 0.00991886481642723,
+ "routers_loss": 0.007447598036378622,
"skip_count": 0.0,
"step": 3412,
"text_loss": 0.4413072466850281
@@ -32431,13 +32431,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.000810169448132791,
- "loss": 0.0096,
+ "loss": 0.0093,
"macro_f1": 0.6666666865348816,
"num_tokens": 5505212.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031243201810866594,
+ "routers_loss": 0.0031087708193808794,
"skip_count": 1.0,
"step": 3414,
"text_loss": 0.2910428047180176
@@ -32450,13 +32450,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.04345703125,
"learning_rate": 0.0008099266244247243,
- "loss": 0.0083,
+ "loss": 0.0082,
"macro_f1": 0.3272727429866791,
"num_tokens": 5508755.0,
"repeat_count": 0.0,
- "routers_loss": 0.02572118304669857,
+ "routers_loss": 0.02510393038392067,
"skip_count": 1.0,
"step": 3416,
"text_loss": 0.33022749423980713
@@ -32469,13 +32469,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0306396484375,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0008096836819521903,
"loss": 0.0057,
"macro_f1": 0.6666666865348816,
"num_tokens": 5512034.0,
"repeat_count": 0.0,
- "routers_loss": 0.001839894917793572,
+ "routers_loss": 0.0020537273958325386,
"skip_count": 1.0,
"step": 3418,
"text_loss": 0.4731218218803406
@@ -32488,32 +32488,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0008094406208082853,
"loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 5515707.0,
"repeat_count": 0.0,
- "routers_loss": 0.0039922320283949375,
+ "routers_loss": 0.004218162503093481,
"skip_count": 2.0,
"step": 3420,
"text_loss": 0.23429590463638306
},
{
"acc_repeat": 1.0,
- "acc_skip": 1.0,
- "avg_layers": 26.0,
+ "acc_skip": 0.6666666865348816,
+ "avg_layers": 27.0,
"epoch": 16.065746991488112,
- "f1_execute": 1.0,
+ "f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
- "f1_skip": 1.0,
- "grad_norm": 0.0703125,
+ "f1_skip": 0.800000011920929,
+ "grad_norm": 0.0869140625,
"learning_rate": 0.0008091974410861507,
- "loss": 0.0066,
- "macro_f1": 1.0,
+ "loss": 0.0069,
+ "macro_f1": 0.9265305995941162,
"num_tokens": 5518436.0,
"repeat_count": 1.0,
- "routers_loss": 0.012939191423356533,
+ "routers_loss": 0.013488355092704296,
"skip_count": 3.0,
"step": 3422,
"text_loss": 0.45768749713897705
@@ -32526,13 +32526,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0008089541428789733,
- "loss": 0.01,
+ "loss": 0.0097,
"macro_f1": 0.6666666865348816,
"num_tokens": 5522368.0,
"repeat_count": 0.0,
- "routers_loss": 0.001064157928340137,
+ "routers_loss": 0.0010335417464375496,
"skip_count": 1.0,
"step": 3424,
"text_loss": 0.43423423171043396
@@ -32545,13 +32545,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0299072265625,
+ "grad_norm": 0.0306396484375,
"learning_rate": 0.0008087107262799855,
- "loss": 0.0047,
+ "loss": 0.0046,
"macro_f1": 0.3333333432674408,
"num_tokens": 5526061.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024185231886804104,
+ "routers_loss": 0.002134323585778475,
"skip_count": 0.0,
"step": 3426,
"text_loss": 0.4031757414340973
@@ -32564,13 +32564,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08203125,
+ "grad_norm": 0.1318359375,
"learning_rate": 0.0008084671913824651,
"loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 5529284.0,
"repeat_count": 0.0,
- "routers_loss": 0.009645994752645493,
+ "routers_loss": 0.0097216060385108,
"skip_count": 2.0,
"step": 3428,
"text_loss": 0.2836039960384369
@@ -32583,13 +32583,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.022705078125,
+ "grad_norm": 0.0220947265625,
"learning_rate": 0.000808223538279735,
- "loss": 0.0051,
+ "loss": 0.0049,
"macro_f1": 0.3333333432674408,
"num_tokens": 5532159.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017972104251384735,
+ "routers_loss": 0.001684269867837429,
"skip_count": 0.0,
"step": 3430,
"text_loss": 0.5804527401924133
@@ -32602,13 +32602,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04248046875,
+ "grad_norm": 0.0390625,
"learning_rate": 0.0008079797670651637,
"loss": 0.008,
"macro_f1": 1.0,
"num_tokens": 5536050.0,
"repeat_count": 1.0,
- "routers_loss": 0.015138664282858372,
+ "routers_loss": 0.013918434269726276,
"skip_count": 1.0,
"step": 3432,
"text_loss": 0.31325826048851013
@@ -32621,13 +32621,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042724609375,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0008077358778321647,
- "loss": 0.0114,
+ "loss": 0.011,
"macro_f1": 0.3333333432674408,
"num_tokens": 5538885.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007666898309253156,
+ "routers_loss": 0.0007751787197776139,
"skip_count": 0.0,
"step": 3434,
"text_loss": 0.783108115196228
@@ -32640,13 +32640,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.033935546875,
"learning_rate": 0.0008074918706741966,
"loss": 0.0063,
"macro_f1": 0.9262410998344421,
"num_tokens": 5541909.0,
"repeat_count": 3.0,
- "routers_loss": 0.024132754653692245,
+ "routers_loss": 0.021819550544023514,
"skip_count": 2.0,
"step": 3436,
"text_loss": 0.6558083295822144
@@ -32659,13 +32659,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03173828125,
+ "grad_norm": 0.02880859375,
"learning_rate": 0.0008072477456847638,
- "loss": 0.0061,
+ "loss": 0.0057,
"macro_f1": 0.3272727429866791,
"num_tokens": 5545101.0,
"repeat_count": 1.0,
- "routers_loss": 0.03225114569067955,
+ "routers_loss": 0.03309348225593567,
"skip_count": 0.0,
"step": 3438,
"text_loss": 0.9877075552940369
@@ -32678,13 +32678,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.0008070035029574151,
- "loss": 0.0062,
+ "loss": 0.006,
"macro_f1": 1.0,
"num_tokens": 5548971.0,
"repeat_count": 1.0,
- "routers_loss": 0.008569693192839622,
+ "routers_loss": 0.008696741424500942,
"skip_count": 1.0,
"step": 3440,
"text_loss": 0.24766330420970917
@@ -32697,13 +32697,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.000806759142585745,
"loss": 0.0056,
"macro_f1": 0.6666666865348816,
"num_tokens": 5552174.0,
"repeat_count": 0.0,
- "routers_loss": 0.004438123665750027,
+ "routers_loss": 0.004240929149091244,
"skip_count": 3.0,
"step": 3442,
"text_loss": 0.37255001068115234
@@ -32716,13 +32716,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0615234375,
+ "grad_norm": 0.05322265625,
"learning_rate": 0.0008065146646633927,
- "loss": 0.0091,
+ "loss": 0.0088,
"macro_f1": 0.6666666865348816,
"num_tokens": 5555005.0,
"repeat_count": 0.0,
- "routers_loss": 0.013728363439440727,
+ "routers_loss": 0.014345484785735607,
"skip_count": 1.0,
"step": 3444,
"text_loss": 0.26157206296920776
@@ -32735,13 +32735,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.05810546875,
+ "grad_norm": 0.06005859375,
"learning_rate": 0.0008062700692840428,
"loss": 0.0083,
"macro_f1": 1.0,
"num_tokens": 5559127.0,
"repeat_count": 1.0,
- "routers_loss": 0.008383825421333313,
+ "routers_loss": 0.008315163664519787,
"skip_count": 2.0,
"step": 3446,
"text_loss": 0.21971040964126587
@@ -32754,13 +32754,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.056396484375,
"learning_rate": 0.0008060253565414246,
"loss": 0.009,
"macro_f1": 0.5934640765190125,
"num_tokens": 5562254.0,
"repeat_count": 0.0,
- "routers_loss": 0.009948022663593292,
+ "routers_loss": 0.009582413360476494,
"skip_count": 3.0,
"step": 3448,
"text_loss": 0.6758295893669128
@@ -32773,13 +32773,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0361328125,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.0008057805265293124,
"loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 5565515.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025822422467172146,
+ "routers_loss": 0.002429503947496414,
"skip_count": 0.0,
"step": 3450,
"text_loss": 0.696592390537262
@@ -32792,13 +32792,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0008055355793415257,
- "loss": 0.0091,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 5568392.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008777108159847558,
+ "routers_loss": 0.0007724192109890282,
"skip_count": 0.0,
"step": 3452,
"text_loss": 0.7092870473861694
@@ -32811,13 +32811,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0008052905150719285,
- "loss": 0.01,
+ "loss": 0.0099,
"macro_f1": 0.3333333432674408,
"num_tokens": 5571090.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009592860005795956,
+ "routers_loss": 0.0010859938338398933,
"skip_count": 0.0,
"step": 3454,
"text_loss": 0.6593860387802124
@@ -32832,11 +32832,11 @@
"f1_skip": 1.0,
"grad_norm": 0.04150390625,
"learning_rate": 0.0008050453338144301,
- "loss": 0.0077,
+ "loss": 0.0072,
"macro_f1": 1.0,
"num_tokens": 5574552.0,
"repeat_count": 1.0,
- "routers_loss": 0.0029973683413118124,
+ "routers_loss": 0.0030258705373853445,
"skip_count": 1.0,
"step": 3456,
"text_loss": 0.3479384481906891
@@ -32849,13 +32849,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0008048000356629844,
- "loss": 0.0068,
+ "loss": 0.0066,
"macro_f1": 0.6666666865348816,
"num_tokens": 5577484.0,
"repeat_count": 0.0,
- "routers_loss": 0.005223365034908056,
+ "routers_loss": 0.005052885971963406,
"skip_count": 2.0,
"step": 3458,
"text_loss": 0.21858671307563782
@@ -32868,13 +32868,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.029541015625,
"learning_rate": 0.0008045546207115901,
- "loss": 0.0074,
+ "loss": 0.0068,
"macro_f1": 1.0,
"num_tokens": 5581605.0,
"repeat_count": 1.0,
- "routers_loss": 0.010660176165401936,
+ "routers_loss": 0.009976249188184738,
"skip_count": 3.0,
"step": 3460,
"text_loss": 0.16868001222610474
@@ -32887,13 +32887,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.0008043090890542904,
- "loss": 0.008,
+ "loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 5584994.0,
"repeat_count": 0.0,
- "routers_loss": 0.003038279013708234,
+ "routers_loss": 0.00270817126147449,
"skip_count": 0.0,
"step": 3462,
"text_loss": 0.785690426826477
@@ -32906,13 +32906,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03125,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0008040634407851739,
- "loss": 0.0057,
+ "loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 5588067.0,
"repeat_count": 0.0,
- "routers_loss": 0.001855011098086834,
+ "routers_loss": 0.0018436965765431523,
"skip_count": 0.0,
"step": 3464,
"text_loss": 0.5006644129753113
@@ -32925,13 +32925,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.028076171875,
"learning_rate": 0.0008038176759983731,
- "loss": 0.0064,
+ "loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 5590789.0,
"repeat_count": 0.0,
- "routers_loss": 0.008276397362351418,
+ "routers_loss": 0.008516279980540276,
"skip_count": 2.0,
"step": 3466,
"text_loss": 0.20963478088378906
@@ -32944,13 +32944,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04052734375,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0008035717947880659,
- "loss": 0.0092,
+ "loss": 0.0091,
"macro_f1": 0.3333333432674408,
"num_tokens": 5593472.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016371201490983367,
+ "routers_loss": 0.0016293043736368418,
"skip_count": 0.0,
"step": 3468,
"text_loss": 0.7376078963279724
@@ -32963,13 +32963,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.0008033257972484742,
- "loss": 0.0081,
+ "loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 5596108.0,
"repeat_count": 0.0,
- "routers_loss": 0.002605364890769124,
+ "routers_loss": 0.002364142332226038,
"skip_count": 0.0,
"step": 3470,
"text_loss": 0.5156455039978027
@@ -32982,13 +32982,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0008030796834738649,
- "loss": 0.0083,
+ "loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 5599103.0,
"repeat_count": 0.0,
- "routers_loss": 0.00892016664147377,
+ "routers_loss": 0.008872323669493198,
"skip_count": 0.0,
"step": 3472,
"text_loss": 0.2996419668197632
@@ -33001,13 +33001,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037841796875,
+ "grad_norm": 0.043701171875,
"learning_rate": 0.0008028334535585491,
- "loss": 0.0089,
+ "loss": 0.0087,
"macro_f1": 0.6666666865348816,
"num_tokens": 5602410.0,
"repeat_count": 0.0,
- "routers_loss": 0.01095602847635746,
+ "routers_loss": 0.011508257128298283,
"skip_count": 3.0,
"step": 3474,
"text_loss": 0.25438693165779114
@@ -33020,13 +33020,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.038330078125,
"learning_rate": 0.0008025871075968827,
- "loss": 0.0105,
+ "loss": 0.0106,
"macro_f1": 1.0,
"num_tokens": 5605424.0,
"repeat_count": 2.0,
- "routers_loss": 0.016052749007940292,
+ "routers_loss": 0.017225435003638268,
"skip_count": 2.0,
"step": 3476,
"text_loss": 0.2549574077129364
@@ -33039,13 +33039,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
- "grad_norm": 0.02880859375,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0008023406456832657,
- "loss": 0.0116,
+ "loss": 0.0111,
"macro_f1": 0.9262410998344421,
"num_tokens": 5608266.0,
"repeat_count": 3.0,
- "routers_loss": 0.04047509655356407,
+ "routers_loss": 0.039165645837783813,
"skip_count": 2.0,
"step": 3478,
"text_loss": 0.1797947734594345
@@ -33058,13 +33058,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0272216796875,
+ "grad_norm": 0.026123046875,
"learning_rate": 0.0008020940679121429,
- "loss": 0.0073,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 5611471.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010115962941199541,
+ "routers_loss": 0.0009718866203911602,
"skip_count": 0.0,
"step": 3480,
"text_loss": 0.8267702460289001
@@ -33077,13 +33077,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0008018473743780036,
- "loss": 0.0095,
+ "loss": 0.0093,
"macro_f1": 0.6666666865348816,
"num_tokens": 5615046.0,
"repeat_count": 0.0,
- "routers_loss": 0.006490753497928381,
+ "routers_loss": 0.006087122485041618,
"skip_count": 2.0,
"step": 3482,
"text_loss": 0.7267677187919617
@@ -33096,13 +33096,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.000801600565175381,
- "loss": 0.0088,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 5618350.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008378152851946652,
+ "routers_loss": 0.0007539413054473698,
"skip_count": 0.0,
"step": 3484,
"text_loss": 0.5910211801528931
@@ -33115,13 +33115,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.048583984375,
+ "grad_norm": 0.046142578125,
"learning_rate": 0.0008013536403988529,
- "loss": 0.0087,
+ "loss": 0.0085,
"macro_f1": 0.3333333432674408,
"num_tokens": 5621381.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007683819276280701,
+ "routers_loss": 0.0008076327503658831,
"skip_count": 0.0,
"step": 3486,
"text_loss": 0.30616798996925354
@@ -33134,13 +33134,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.047607421875,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0008011066001430412,
"loss": 0.0086,
"macro_f1": 0.6122449040412903,
"num_tokens": 5624617.0,
"repeat_count": 0.0,
- "routers_loss": 0.02481125481426716,
+ "routers_loss": 0.023835813626646996,
"skip_count": 4.0,
"step": 3488,
"text_loss": 0.3376443088054657
@@ -33153,13 +33153,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0311279296875,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0008008594445026122,
- "loss": 0.0082,
+ "loss": 0.0083,
"macro_f1": 0.6666666865348816,
"num_tokens": 5627989.0,
"repeat_count": 0.0,
- "routers_loss": 0.005174005404114723,
+ "routers_loss": 0.004226419143378735,
"skip_count": 2.0,
"step": 3490,
"text_loss": 0.8185343146324158
@@ -33172,13 +33172,13 @@
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0008006121735722767,
"loss": 0.0084,
"macro_f1": 0.32098764181137085,
"num_tokens": 5632286.0,
"repeat_count": 0.0,
- "routers_loss": 0.03602224588394165,
+ "routers_loss": 0.0366671048104763,
"skip_count": 2.0,
"step": 3492,
"text_loss": 0.2209547609090805
@@ -33191,13 +33191,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.03466796875,
"learning_rate": 0.0008003647874467892,
- "loss": 0.0087,
+ "loss": 0.0084,
"macro_f1": 0.6666666865348816,
"num_tokens": 5635368.0,
"repeat_count": 1.0,
- "routers_loss": 0.012145630083978176,
+ "routers_loss": 0.012956378981471062,
"skip_count": 0.0,
"step": 3494,
"text_loss": 0.20468664169311523
@@ -33210,13 +33210,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.057861328125,
+ "grad_norm": 0.059814453125,
"learning_rate": 0.0008001172862209485,
"loss": 0.0103,
"macro_f1": 0.6666666865348816,
"num_tokens": 5638440.0,
"repeat_count": 1.0,
- "routers_loss": 0.001456267898902297,
+ "routers_loss": 0.0017375422175973654,
"skip_count": 0.0,
"step": 3496,
"text_loss": 0.6647221446037292
@@ -33229,13 +33229,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0247802734375,
+ "grad_norm": 0.0244140625,
"learning_rate": 0.0007998696699895976,
- "loss": 0.0093,
+ "loss": 0.0091,
"macro_f1": 0.6592592597007751,
"num_tokens": 5641996.0,
"repeat_count": 1.0,
- "routers_loss": 0.028984347358345985,
+ "routers_loss": 0.025240756571292877,
"skip_count": 5.0,
"step": 3498,
"text_loss": 0.23892143368721008
@@ -33248,13 +33248,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02294921875,
+ "grad_norm": 0.021728515625,
"learning_rate": 0.0007996219388476236,
- "loss": 0.0077,
+ "loss": 0.0075,
"macro_f1": 0.6666666865348816,
"num_tokens": 5645071.0,
"repeat_count": 0.0,
- "routers_loss": 0.006859986111521721,
+ "routers_loss": 0.007436830550432205,
"skip_count": 1.0,
"step": 3500,
"text_loss": 0.7580804228782654
@@ -33267,13 +33267,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.024169921875,
+ "grad_norm": 0.0242919921875,
"learning_rate": 0.0007993740928899571,
- "loss": 0.0055,
+ "loss": 0.0054,
"macro_f1": 0.3333333432674408,
"num_tokens": 5648175.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011989293852820992,
+ "routers_loss": 0.001126602990552783,
"skip_count": 0.0,
"step": 3502,
"text_loss": 0.5281378626823425
@@ -33286,13 +33286,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031982421875,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0007991261322115737,
- "loss": 0.0056,
+ "loss": 0.0055,
"macro_f1": 0.3333333432674408,
"num_tokens": 5650973.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007974735926836729,
+ "routers_loss": 0.0007907263352535665,
"skip_count": 0.0,
"step": 3504,
"text_loss": 0.25220927596092224
@@ -33305,13 +33305,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.0262451171875,
"learning_rate": 0.000798878056907492,
- "loss": 0.0049,
+ "loss": 0.0048,
"macro_f1": 1.0,
"num_tokens": 5654252.0,
"repeat_count": 2.0,
- "routers_loss": 0.007121780421584845,
+ "routers_loss": 0.006263538729399443,
"skip_count": 2.0,
"step": 3506,
"text_loss": 0.46569153666496277
@@ -33324,13 +33324,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.0703125,
"learning_rate": 0.0007986298670727752,
- "loss": 0.0101,
+ "loss": 0.0098,
"macro_f1": 0.6666666865348816,
"num_tokens": 5657229.0,
"repeat_count": 0.0,
- "routers_loss": 0.00414140522480011,
+ "routers_loss": 0.004049144219607115,
"skip_count": 3.0,
"step": 3508,
"text_loss": 0.15174436569213867
@@ -33343,13 +33343,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.058837890625,
+ "grad_norm": 0.0791015625,
"learning_rate": 0.0007983815628025301,
- "loss": 0.0073,
+ "loss": 0.0074,
"macro_f1": 0.9262410998344421,
"num_tokens": 5659974.0,
"repeat_count": 2.0,
- "routers_loss": 0.04618353769183159,
+ "routers_loss": 0.0471976138651371,
"skip_count": 3.0,
"step": 3510,
"text_loss": 0.39072203636169434
@@ -33362,13 +33362,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.000798133144191907,
- "loss": 0.0084,
+ "loss": 0.0082,
"macro_f1": 0.3272727429866791,
"num_tokens": 5662893.0,
"repeat_count": 0.0,
- "routers_loss": 0.04054548963904381,
+ "routers_loss": 0.04030488431453705,
"skip_count": 1.0,
"step": 3512,
"text_loss": 0.3562147617340088
@@ -33381,13 +33381,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.0595703125,
"learning_rate": 0.0007978846113361009,
- "loss": 0.0067,
+ "loss": 0.0069,
"macro_f1": 0.6666666865348816,
"num_tokens": 5666476.0,
"repeat_count": 0.0,
- "routers_loss": 0.007785080466419458,
+ "routers_loss": 0.007475079502910376,
"skip_count": 1.0,
"step": 3514,
"text_loss": 0.26518192887306213
@@ -33400,13 +33400,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0400390625,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.0007976359643303497,
- "loss": 0.0128,
+ "loss": 0.013,
"macro_f1": 0.6666666865348816,
"num_tokens": 5669647.0,
"repeat_count": 0.0,
- "routers_loss": 0.0057366108521819115,
+ "routers_loss": 0.00558585487306118,
"skip_count": 2.0,
"step": 3516,
"text_loss": 0.29284560680389404
@@ -33419,13 +33419,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0458984375,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0007973872032699354,
- "loss": 0.0088,
+ "loss": 0.0082,
"macro_f1": 1.0,
"num_tokens": 5673491.0,
"repeat_count": 1.0,
- "routers_loss": 0.002753519220277667,
+ "routers_loss": 0.0026981087867170572,
"skip_count": 1.0,
"step": 3518,
"text_loss": 0.35089045763015747
@@ -33438,32 +33438,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.033203125,
"learning_rate": 0.000797138328250184,
"loss": 0.0058,
"macro_f1": 0.6666666865348816,
"num_tokens": 5676529.0,
"repeat_count": 1.0,
- "routers_loss": 0.0027982397004961967,
+ "routers_loss": 0.0027328627184033394,
"skip_count": 0.0,
"step": 3520,
"text_loss": 0.41077399253845215
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.800000011920929,
- "avg_layers": 24.0,
+ "acc_skip": 1.0,
+ "avg_layers": 23.0,
"epoch": 16.535368359260346,
- "f1_execute": 0.95652174949646,
+ "f1_execute": 0.9777777791023254,
"f1_repeat": 0.0,
- "f1_skip": 0.888888955116272,
- "grad_norm": 0.055419921875,
+ "f1_skip": 1.0,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0007968893393664646,
- "loss": 0.0105,
- "macro_f1": 0.6151369214057922,
+ "loss": 0.01,
+ "macro_f1": 0.6592592597007751,
"num_tokens": 5679987.0,
"repeat_count": 1.0,
- "routers_loss": 0.03294458985328674,
+ "routers_loss": 0.02695014327764511,
"skip_count": 5.0,
"step": 3522,
"text_loss": 0.44942837953567505
@@ -33476,13 +33476,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0007966402367141903,
- "loss": 0.0073,
+ "loss": 0.0072,
"macro_f1": 0.6666666865348816,
"num_tokens": 5683185.0,
"repeat_count": 0.0,
- "routers_loss": 0.007946476340293884,
+ "routers_loss": 0.00817026849836111,
"skip_count": 2.0,
"step": 3524,
"text_loss": 0.14528048038482666
@@ -33495,13 +33495,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.021240234375,
+ "grad_norm": 0.0216064453125,
"learning_rate": 0.0007963910203888176,
- "loss": 0.0043,
+ "loss": 0.0042,
"macro_f1": 0.3333333432674408,
"num_tokens": 5686544.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021326798014342785,
+ "routers_loss": 0.0021973433904349804,
"skip_count": 0.0,
"step": 3526,
"text_loss": 0.22358648478984833
@@ -33514,13 +33514,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0556640625,
+ "grad_norm": 0.050048828125,
"learning_rate": 0.0007961416904858469,
- "loss": 0.0079,
+ "loss": 0.0078,
"macro_f1": 0.3272727429866791,
"num_tokens": 5689579.0,
"repeat_count": 0.0,
- "routers_loss": 0.03373958170413971,
+ "routers_loss": 0.033712416887283325,
"skip_count": 1.0,
"step": 3528,
"text_loss": 0.3083649277687073
@@ -33533,13 +33533,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033203125,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0007958922471008217,
- "loss": 0.007,
+ "loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 5692869.0,
"repeat_count": 0.0,
- "routers_loss": 0.010963297449052334,
+ "routers_loss": 0.011182719841599464,
"skip_count": 2.0,
"step": 3530,
"text_loss": 0.21288011968135834
@@ -33552,13 +33552,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0286865234375,
+ "grad_norm": 0.0267333984375,
"learning_rate": 0.0007956426903293292,
"loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 5696007.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014243065379559994,
+ "routers_loss": 0.0015808293828740716,
"skip_count": 0.0,
"step": 3532,
"text_loss": 0.6068631410598755
@@ -33571,13 +33571,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
- "grad_norm": 0.059326171875,
+ "grad_norm": 0.052734375,
"learning_rate": 0.0007953930202670001,
- "loss": 0.0066,
+ "loss": 0.0062,
"macro_f1": 0.5492662787437439,
"num_tokens": 5699474.0,
"repeat_count": 2.0,
- "routers_loss": 0.038375116884708405,
+ "routers_loss": 0.03205178305506706,
"skip_count": 0.0,
"step": 3534,
"text_loss": 0.4317135512828827
@@ -33590,13 +33590,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.062255859375,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0007951432370095084,
"loss": 0.0105,
"macro_f1": 0.3333333432674408,
"num_tokens": 5703483.0,
"repeat_count": 0.0,
- "routers_loss": 0.0041501945815980434,
+ "routers_loss": 0.003518853336572647,
"skip_count": 0.0,
"step": 3536,
"text_loss": 0.5432273149490356
@@ -33609,13 +33609,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.08349609375,
+ "grad_norm": 0.11083984375,
"learning_rate": 0.0007948933406525715,
"loss": 0.01,
"macro_f1": 1.0,
"num_tokens": 5707301.0,
"repeat_count": 1.0,
- "routers_loss": 0.00536845438182354,
+ "routers_loss": 0.004982157610356808,
"skip_count": 1.0,
"step": 3538,
"text_loss": 0.40061065554618835
@@ -33628,13 +33628,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0007946433312919502,
- "loss": 0.0076,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 5710847.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030090278014540672,
+ "routers_loss": 0.003067734418436885,
"skip_count": 0.0,
"step": 3540,
"text_loss": 0.5396234393119812
@@ -33647,13 +33647,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.055419921875,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0007943932090234486,
- "loss": 0.0098,
+ "loss": 0.0097,
"macro_f1": 0.5492662787437439,
"num_tokens": 5713683.0,
"repeat_count": 0.0,
- "routers_loss": 0.03756432980298996,
+ "routers_loss": 0.03728383034467697,
"skip_count": 2.0,
"step": 3542,
"text_loss": 0.18310914933681488
@@ -33666,13 +33666,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.027587890625,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0007941429739429138,
- "loss": 0.0037,
+ "loss": 0.0036,
"macro_f1": 0.6666666865348816,
"num_tokens": 5716397.0,
"repeat_count": 0.0,
- "routers_loss": 0.002606320893391967,
+ "routers_loss": 0.0025092530995607376,
"skip_count": 3.0,
"step": 3544,
"text_loss": 0.5806207060813904
@@ -33685,13 +33685,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0361328125,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.0007938926261462366,
- "loss": 0.007,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 5719984.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025650030001997948,
+ "routers_loss": 0.002493767999112606,
"skip_count": 0.0,
"step": 3546,
"text_loss": 0.38606807589530945
@@ -33704,13 +33704,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.05078125,
"learning_rate": 0.0007936421657293507,
"loss": 0.0094,
"macro_f1": 0.8823530077934265,
"num_tokens": 5723571.0,
"repeat_count": 1.0,
- "routers_loss": 0.013521218672394753,
+ "routers_loss": 0.014810923486948013,
"skip_count": 2.0,
"step": 3548,
"text_loss": 0.49558472633361816
@@ -33723,13 +33723,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0240478515625,
+ "grad_norm": 0.0284423828125,
"learning_rate": 0.0007933915927882327,
- "loss": 0.0071,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 5726405.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014581449795514345,
+ "routers_loss": 0.00152928801253438,
"skip_count": 0.0,
"step": 3550,
"text_loss": 0.8674797415733337
@@ -33742,13 +33742,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.0390625,
"learning_rate": 0.000793140907418903,
- "loss": 0.0077,
+ "loss": 0.0075,
"macro_f1": 0.6666666865348816,
"num_tokens": 5729955.0,
"repeat_count": 0.0,
- "routers_loss": 0.005775467026978731,
+ "routers_loss": 0.005522782914340496,
"skip_count": 2.0,
"step": 3552,
"text_loss": 0.3274473249912262
@@ -33761,13 +33761,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.0007928901097174248,
- "loss": 0.0083,
+ "loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 5733030.0,
"repeat_count": 0.0,
- "routers_loss": 0.008668854832649231,
+ "routers_loss": 0.009207013063132763,
"skip_count": 2.0,
"step": 3554,
"text_loss": 0.18237128853797913
@@ -33780,13 +33780,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.056884765625,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0007926391997799039,
- "loss": 0.0068,
+ "loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 5735978.0,
"repeat_count": 0.0,
- "routers_loss": 0.007210119627416134,
+ "routers_loss": 0.00695531303063035,
"skip_count": 0.0,
"step": 3556,
"text_loss": 0.3266434967517853
@@ -33799,13 +33799,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048583984375,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0007923881777024898,
- "loss": 0.0065,
+ "loss": 0.0061,
"macro_f1": 0.6666666865348816,
"num_tokens": 5738901.0,
"repeat_count": 0.0,
- "routers_loss": 0.00165808224119246,
+ "routers_loss": 0.002743212040513754,
"skip_count": 1.0,
"step": 3558,
"text_loss": 0.4971913695335388
@@ -33818,13 +33818,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.049560546875,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.0007921370435813741,
- "loss": 0.0081,
+ "loss": 0.0082,
"macro_f1": 0.6666666865348816,
"num_tokens": 5741946.0,
"repeat_count": 1.0,
- "routers_loss": 0.007618873380124569,
+ "routers_loss": 0.007037297356873751,
"skip_count": 0.0,
"step": 3560,
"text_loss": 0.5645473599433899
@@ -33837,13 +33837,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047607421875,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0007918857975127924,
"loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 5744987.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031584161333739758,
+ "routers_loss": 0.0030746585689485073,
"skip_count": 0.0,
"step": 3562,
"text_loss": 0.17717665433883667
@@ -33856,13 +33856,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0537109375,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0007916344395930224,
- "loss": 0.0079,
+ "loss": 0.0077,
"macro_f1": 0.3333333432674408,
"num_tokens": 5747837.0,
"repeat_count": 0.0,
- "routers_loss": 0.005207436624914408,
+ "routers_loss": 0.004522138275206089,
"skip_count": 0.0,
"step": 3564,
"text_loss": 0.7676118612289429
@@ -33875,13 +33875,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.000791382969918385,
- "loss": 0.0074,
+ "loss": 0.0075,
"macro_f1": 0.3333333432674408,
"num_tokens": 5750716.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023729163222014904,
+ "routers_loss": 0.0026240211445838213,
"skip_count": 0.0,
"step": 3566,
"text_loss": 0.4975173771381378
@@ -33894,13 +33894,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.061767578125,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.000791131388585244,
- "loss": 0.0115,
+ "loss": 0.011,
"macro_f1": 0.8820862174034119,
"num_tokens": 5754368.0,
"repeat_count": 2.0,
- "routers_loss": 0.021537931635975838,
+ "routers_loss": 0.021831991150975227,
"skip_count": 2.0,
"step": 3568,
"text_loss": 0.9670342206954956
@@ -33913,13 +33913,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0007908796956900055,
"loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 5757076.0,
"repeat_count": 1.0,
- "routers_loss": 0.001752255018800497,
+ "routers_loss": 0.0017586691537871957,
"skip_count": 0.0,
"step": 3570,
"text_loss": 0.3057977259159088
@@ -33932,13 +33932,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.043701171875,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.000790627891329119,
- "loss": 0.006,
+ "loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 5760613.0,
"repeat_count": 0.0,
- "routers_loss": 0.00557586969807744,
+ "routers_loss": 0.005515786819159985,
"skip_count": 0.0,
"step": 3572,
"text_loss": 0.5860086679458618
@@ -33951,13 +33951,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.04296875,
"learning_rate": 0.0007903759755990763,
"loss": 0.0061,
"macro_f1": 0.3333333432674408,
"num_tokens": 5763557.0,
"repeat_count": 0.0,
- "routers_loss": 0.004236271139234304,
+ "routers_loss": 0.004096484277397394,
"skip_count": 0.0,
"step": 3574,
"text_loss": 0.17175781726837158
@@ -33970,13 +33970,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.000790123948596412,
"loss": 0.0119,
"macro_f1": 0.6666666865348816,
"num_tokens": 5767430.0,
"repeat_count": 1.0,
- "routers_loss": 0.003505093976855278,
+ "routers_loss": 0.005216122139245272,
"skip_count": 0.0,
"step": 3576,
"text_loss": 0.7520374059677124
@@ -33989,13 +33989,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06640625,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0007898718104177031,
- "loss": 0.011,
+ "loss": 0.0108,
"macro_f1": 0.3333333432674408,
"num_tokens": 5770175.0,
"repeat_count": 0.0,
- "routers_loss": 0.0039036881644278765,
+ "routers_loss": 0.0037980107590556145,
"skip_count": 0.0,
"step": 3578,
"text_loss": 0.18117885291576385
@@ -34008,13 +34008,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.0007896195611595699,
"loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 5773032.0,
"repeat_count": 0.0,
- "routers_loss": 0.00450134975835681,
+ "routers_loss": 0.003672175807878375,
"skip_count": 2.0,
"step": 3580,
"text_loss": 0.7241058349609375
@@ -34027,13 +34027,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0007893672009186744,
- "loss": 0.0082,
+ "loss": 0.0083,
"macro_f1": 1.0,
"num_tokens": 5776077.0,
"repeat_count": 1.0,
- "routers_loss": 0.01287894882261753,
+ "routers_loss": 0.01229850109666586,
"skip_count": 3.0,
"step": 3582,
"text_loss": 0.29140418767929077
@@ -34046,13 +34046,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0284423828125,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0007891147297917216,
"loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 5779088.0,
"repeat_count": 1.0,
- "routers_loss": 0.003500303253531456,
+ "routers_loss": 0.0035251814406365156,
"skip_count": 0.0,
"step": 3584,
"text_loss": 0.1727485954761505
@@ -34065,13 +34065,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05615234375,
+ "grad_norm": 0.055908203125,
"learning_rate": 0.000788862147875459,
- "loss": 0.0093,
+ "loss": 0.0094,
"macro_f1": 0.6666666865348816,
"num_tokens": 5782201.0,
"repeat_count": 0.0,
- "routers_loss": 0.0042770374566316605,
+ "routers_loss": 0.004725661128759384,
"skip_count": 2.0,
"step": 3586,
"text_loss": 0.43512848019599915
@@ -34084,13 +34084,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.057861328125,
+ "grad_norm": 0.06396484375,
"learning_rate": 0.0007886094552666765,
- "loss": 0.0107,
+ "loss": 0.0106,
"macro_f1": 0.3333333432674408,
"num_tokens": 5785039.0,
"repeat_count": 0.0,
- "routers_loss": 0.005349197890609503,
+ "routers_loss": 0.005632172804325819,
"skip_count": 0.0,
"step": 3588,
"text_loss": 0.3534786105155945
@@ -34103,13 +34103,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0654296875,
+ "grad_norm": 0.0556640625,
"learning_rate": 0.0007883566520622062,
- "loss": 0.0114,
+ "loss": 0.0109,
"macro_f1": 0.6666666865348816,
"num_tokens": 5788017.0,
"repeat_count": 0.0,
- "routers_loss": 0.008142824284732342,
+ "routers_loss": 0.006249965168535709,
"skip_count": 1.0,
"step": 3590,
"text_loss": 0.2089710384607315
@@ -34122,13 +34122,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0302734375,
+ "grad_norm": 0.02978515625,
"learning_rate": 0.0007881037383589229,
- "loss": 0.0071,
+ "loss": 0.0073,
"macro_f1": 0.3333333432674408,
"num_tokens": 5791168.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013415004359558225,
+ "routers_loss": 0.0013797614956274629,
"skip_count": 0.0,
"step": 3592,
"text_loss": 0.4349329471588135
@@ -34141,13 +34141,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07666015625,
+ "grad_norm": 0.06982421875,
"learning_rate": 0.0007878507142537436,
- "loss": 0.0089,
+ "loss": 0.0091,
"macro_f1": 0.6666666865348816,
"num_tokens": 5793927.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022349755745381117,
+ "routers_loss": 0.0019719740375876427,
"skip_count": 1.0,
"step": 3594,
"text_loss": 0.6087368726730347
@@ -34160,13 +34160,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.0007875975798436274,
- "loss": 0.0058,
+ "loss": 0.0059,
"macro_f1": 0.6666666865348816,
"num_tokens": 5797214.0,
"repeat_count": 1.0,
- "routers_loss": 0.0037436108104884624,
+ "routers_loss": 0.0037070370744913816,
"skip_count": 0.0,
"step": 3596,
"text_loss": 0.4258122444152832
@@ -34179,13 +34179,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0458984375,
+ "grad_norm": 0.048583984375,
"learning_rate": 0.0007873443352255764,
- "loss": 0.009,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 5800691.0,
"repeat_count": 0.0,
- "routers_loss": 0.008491694927215576,
+ "routers_loss": 0.008431311696767807,
"skip_count": 0.0,
"step": 3598,
"text_loss": 0.6006711721420288
@@ -34198,13 +34198,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052490234375,
+ "grad_norm": 0.055419921875,
"learning_rate": 0.0007870909804966337,
- "loss": 0.0075,
+ "loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 5804712.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020895113702863455,
+ "routers_loss": 0.0017720256000757217,
"skip_count": 0.0,
"step": 3600,
"text_loss": 0.6055042743682861
@@ -34217,13 +34217,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.053955078125,
+ "grad_norm": 0.0517578125,
"learning_rate": 0.0007868375157538861,
- "loss": 0.0086,
+ "loss": 0.0083,
"macro_f1": 0.3272727429866791,
"num_tokens": 5807670.0,
"repeat_count": 1.0,
- "routers_loss": 0.01193003449589014,
+ "routers_loss": 0.010697763413190842,
"skip_count": 0.0,
"step": 3602,
"text_loss": 0.8039056658744812
@@ -34236,13 +34236,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.04150390625,
"learning_rate": 0.0007865839410944611,
- "loss": 0.008,
+ "loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 5810880.0,
"repeat_count": 1.0,
- "routers_loss": 0.003107197815552354,
+ "routers_loss": 0.0030022128485143185,
"skip_count": 0.0,
"step": 3604,
"text_loss": 0.596110463142395
@@ -34255,13 +34255,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0007863302566155295,
- "loss": 0.0098,
+ "loss": 0.0093,
"macro_f1": 0.6666666865348816,
"num_tokens": 5814171.0,
"repeat_count": 0.0,
- "routers_loss": 0.0075443098321557045,
+ "routers_loss": 0.006257854867726564,
"skip_count": 2.0,
"step": 3606,
"text_loss": 0.5700319409370422
@@ -34274,13 +34274,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.0294189453125,
"learning_rate": 0.0007860764624143031,
- "loss": 0.0053,
+ "loss": 0.0054,
"macro_f1": 0.6666666865348816,
"num_tokens": 5817607.0,
"repeat_count": 1.0,
- "routers_loss": 0.005313992965966463,
+ "routers_loss": 0.004838473163545132,
"skip_count": 0.0,
"step": 3608,
"text_loss": 0.8319530487060547
@@ -34293,13 +34293,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.09716796875,
+ "grad_norm": 0.08154296875,
"learning_rate": 0.0007858225585880369,
- "loss": 0.0069,
+ "loss": 0.0067,
"macro_f1": 0.8823530077934265,
"num_tokens": 5821452.0,
"repeat_count": 1.0,
- "routers_loss": 0.020901991054415703,
+ "routers_loss": 0.02173662930727005,
"skip_count": 2.0,
"step": 3610,
"text_loss": 0.3738477826118469
@@ -34312,13 +34312,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0007855685452340269,
- "loss": 0.0078,
+ "loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 5824683.0,
"repeat_count": 0.0,
- "routers_loss": 0.002484811469912529,
+ "routers_loss": 0.0032719180453568697,
"skip_count": 0.0,
"step": 3612,
"text_loss": 0.4054839015007019
@@ -34331,13 +34331,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0007853144224496118,
- "loss": 0.0094,
+ "loss": 0.0093,
"macro_f1": 0.3272727429866791,
"num_tokens": 5827860.0,
"repeat_count": 1.0,
- "routers_loss": 0.032128892838954926,
+ "routers_loss": 0.032171256840229034,
"skip_count": 0.0,
"step": 3614,
"text_loss": 0.18112395703792572
@@ -34350,13 +34350,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05322265625,
+ "grad_norm": 0.0458984375,
"learning_rate": 0.0007850601903321716,
- "loss": 0.0062,
+ "loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 5831651.0,
"repeat_count": 0.0,
- "routers_loss": 0.0136244622990489,
+ "routers_loss": 0.013230946846306324,
"skip_count": 1.0,
"step": 3616,
"text_loss": 0.2698844075202942
@@ -34369,13 +34369,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.000784805848979129,
- "loss": 0.0057,
+ "loss": 0.0058,
"macro_f1": 0.3333333432674408,
"num_tokens": 5834369.0,
"repeat_count": 0.0,
- "routers_loss": 0.001705345930531621,
+ "routers_loss": 0.00162619655020535,
"skip_count": 0.0,
"step": 3618,
"text_loss": 0.2430931180715561
@@ -34388,13 +34388,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0546875,
+ "grad_norm": 0.0498046875,
"learning_rate": 0.0007845513984879477,
- "loss": 0.0066,
+ "loss": 0.0069,
"macro_f1": 0.6666666865348816,
"num_tokens": 5838102.0,
"repeat_count": 1.0,
- "routers_loss": 0.002594438148662448,
+ "routers_loss": 0.002781603019684553,
"skip_count": 0.0,
"step": 3620,
"text_loss": 0.4968300759792328
@@ -34407,13 +34407,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.031005859375,
"learning_rate": 0.0007842968389561337,
- "loss": 0.0049,
+ "loss": 0.0048,
"macro_f1": 0.3333333432674408,
"num_tokens": 5841029.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019142795354127884,
+ "routers_loss": 0.0023873315658420324,
"skip_count": 0.0,
"step": 3622,
"text_loss": 0.5842974781990051
@@ -34426,13 +34426,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.03955078125,
"learning_rate": 0.0007840421704812346,
- "loss": 0.0093,
+ "loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 5845158.0,
"repeat_count": 0.0,
- "routers_loss": 0.004223407246172428,
+ "routers_loss": 0.00400173757225275,
"skip_count": 1.0,
"step": 3624,
"text_loss": 0.8312450647354126
@@ -34445,13 +34445,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03466796875,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.00078378739316084,
- "loss": 0.0092,
+ "loss": 0.0094,
"macro_f1": 0.3333333432674408,
"num_tokens": 5849175.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005486982990987599,
+ "routers_loss": 0.0004974664188921452,
"skip_count": 0.0,
"step": 3626,
"text_loss": 0.48637253046035767
@@ -34464,13 +34464,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 1.0,
"f1_skip": 0.888888955116272,
- "grad_norm": 0.0654296875,
+ "grad_norm": 0.10693359375,
"learning_rate": 0.000783532507092581,
- "loss": 0.0077,
+ "loss": 0.0079,
"macro_f1": 0.9555556178092957,
"num_tokens": 5852020.0,
"repeat_count": 1.0,
- "routers_loss": 0.025490080937743187,
+ "routers_loss": 0.02555239573121071,
"skip_count": 5.0,
"step": 3628,
"text_loss": 0.5407033562660217
@@ -34483,13 +34483,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.041259765625,
"learning_rate": 0.0007832775123741306,
- "loss": 0.0104,
+ "loss": 0.0106,
"macro_f1": 0.3333333432674408,
"num_tokens": 5854873.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026199028361588717,
+ "routers_loss": 0.0025962977670133114,
"skip_count": 0.0,
"step": 3630,
"text_loss": 0.618230938911438
@@ -34502,13 +34502,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0225830078125,
+ "grad_norm": 0.0234375,
"learning_rate": 0.000783022409103203,
"loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 5858086.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028729604091495275,
+ "routers_loss": 0.0029271875973790884,
"skip_count": 0.0,
"step": 3632,
"text_loss": 0.21259798109531403
@@ -34521,13 +34521,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05419921875,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0007827671973775542,
- "loss": 0.0069,
+ "loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 5860886.0,
"repeat_count": 0.0,
- "routers_loss": 0.004097428172826767,
+ "routers_loss": 0.004102068953216076,
"skip_count": 0.0,
"step": 3634,
"text_loss": 0.4991208016872406
@@ -34540,13 +34540,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0302734375,
+ "grad_norm": 0.033203125,
"learning_rate": 0.0007825118772949819,
"loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 5864291.0,
"repeat_count": 0.0,
- "routers_loss": 0.002142589772120118,
+ "routers_loss": 0.0023497689981013536,
"skip_count": 1.0,
"step": 3636,
"text_loss": 0.3878401517868042
@@ -34559,13 +34559,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0206298828125,
+ "grad_norm": 0.0216064453125,
"learning_rate": 0.0007822564489533255,
- "loss": 0.005,
+ "loss": 0.0051,
"macro_f1": 0.6666666865348816,
"num_tokens": 5867155.0,
"repeat_count": 0.0,
- "routers_loss": 0.006497112102806568,
+ "routers_loss": 0.007680345326662064,
"skip_count": 2.0,
"step": 3638,
"text_loss": 0.6132124066352844
@@ -34578,13 +34578,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.053466796875,
"learning_rate": 0.0007820009124504653,
- "loss": 0.0095,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 5870325.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008698388119228184,
+ "routers_loss": 0.0008242831099778414,
"skip_count": 0.0,
"step": 3640,
"text_loss": 0.3552473187446594
@@ -34597,13 +34597,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.04296875,
"learning_rate": 0.0007817452678843236,
- "loss": 0.0071,
+ "loss": 0.0073,
"macro_f1": 0.6601307392120361,
"num_tokens": 5873301.0,
"repeat_count": 1.0,
- "routers_loss": 0.022245829924941063,
+ "routers_loss": 0.023831043392419815,
"skip_count": 2.0,
"step": 3642,
"text_loss": 0.18363867700099945
@@ -34616,13 +34616,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.024658203125,
+ "grad_norm": 0.0260009765625,
"learning_rate": 0.0007814895153528635,
- "loss": 0.0071,
+ "loss": 0.007,
"macro_f1": 0.3333333432674408,
"num_tokens": 5876225.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020051905885338783,
+ "routers_loss": 0.001999989850446582,
"skip_count": 0.0,
"step": 3644,
"text_loss": 0.17581747472286224
@@ -34635,13 +34635,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025146484375,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0007812336549540903,
- "loss": 0.0071,
+ "loss": 0.007,
"macro_f1": 0.3333333432674408,
"num_tokens": 5879501.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014994015218690038,
+ "routers_loss": 0.001098626758903265,
"skip_count": 0.0,
"step": 3646,
"text_loss": 0.5040884613990784
@@ -34654,13 +34654,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0294189453125,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0007809776867860499,
- "loss": 0.0051,
+ "loss": 0.005,
"macro_f1": 0.3272727429866791,
"num_tokens": 5882608.0,
"repeat_count": 0.0,
- "routers_loss": 0.010847748257219791,
+ "routers_loss": 0.012210183776915073,
"skip_count": 1.0,
"step": 3648,
"text_loss": 0.27114811539649963
@@ -34673,13 +34673,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0262451171875,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.00078072161094683,
- "loss": 0.006,
+ "loss": 0.0059,
"macro_f1": 0.6666666865348816,
"num_tokens": 5886106.0,
"repeat_count": 0.0,
- "routers_loss": 0.005927151069045067,
+ "routers_loss": 0.005191771313548088,
"skip_count": 2.0,
"step": 3650,
"text_loss": 0.5167917609214783
@@ -34692,13 +34692,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.024169921875,
+ "grad_norm": 0.0235595703125,
"learning_rate": 0.0007804654275345591,
- "loss": 0.0061,
+ "loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 5889122.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019531139405444264,
+ "routers_loss": 0.0016411367105320096,
"skip_count": 1.0,
"step": 3652,
"text_loss": 0.7691274285316467
@@ -34711,13 +34711,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.0277099609375,
+ "grad_norm": 0.03515625,
"learning_rate": 0.0007802091366474074,
- "loss": 0.0052,
+ "loss": 0.005,
"macro_f1": 0.8823530077934265,
"num_tokens": 5892313.0,
"repeat_count": 2.0,
- "routers_loss": 0.015216727741062641,
+ "routers_loss": 0.015627093613147736,
"skip_count": 1.0,
"step": 3654,
"text_loss": 0.4646325409412384
@@ -34730,13 +34730,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0311279296875,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0007799527383835858,
- "loss": 0.0067,
+ "loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 5895577.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009810501942411065,
+ "routers_loss": 0.0009879748104140162,
"skip_count": 0.0,
"step": 3656,
"text_loss": 0.5587969422340393
@@ -34749,13 +34749,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.072265625,
+ "grad_norm": 0.0986328125,
"learning_rate": 0.0007796962328413469,
- "loss": 0.0093,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 5898546.0,
"repeat_count": 0.0,
- "routers_loss": 0.00458681071177125,
+ "routers_loss": 0.004864919930696487,
"skip_count": 0.0,
"step": 3658,
"text_loss": 0.6981375813484192
@@ -34768,13 +34768,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.029052734375,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0007794396201189839,
- "loss": 0.0076,
+ "loss": 0.0078,
"macro_f1": 1.0,
"num_tokens": 5901618.0,
"repeat_count": 1.0,
- "routers_loss": 0.006519644521176815,
+ "routers_loss": 0.006617432460188866,
"skip_count": 2.0,
"step": 3660,
"text_loss": 0.22521957755088806
@@ -34787,13 +34787,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.0007791829003148312,
- "loss": 0.0097,
+ "loss": 0.0098,
"macro_f1": 0.6601307392120361,
"num_tokens": 5904540.0,
"repeat_count": 1.0,
- "routers_loss": 0.0783558189868927,
+ "routers_loss": 0.0782252699136734,
"skip_count": 2.0,
"step": 3662,
"text_loss": 0.2649642825126648
@@ -34806,13 +34806,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06884765625,
+ "grad_norm": 0.06494140625,
"learning_rate": 0.0007789260735272647,
- "loss": 0.0115,
+ "loss": 0.0114,
"macro_f1": 0.3333333432674408,
"num_tokens": 5907827.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012588179670274258,
+ "routers_loss": 0.0012057392159476876,
"skip_count": 0.0,
"step": 3664,
"text_loss": 0.6943771243095398
@@ -34825,13 +34825,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0177001953125,
+ "grad_norm": 0.018310546875,
"learning_rate": 0.0007786691398547005,
"loss": 0.0048,
"macro_f1": 0.6666666865348816,
"num_tokens": 5911163.0,
"repeat_count": 0.0,
- "routers_loss": 0.0075621698051691055,
+ "routers_loss": 0.007476957980543375,
"skip_count": 2.0,
"step": 3666,
"text_loss": 0.1502683162689209
@@ -34844,13 +34844,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0303955078125,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.0007784120993955962,
- "loss": 0.0056,
+ "loss": 0.0055,
"macro_f1": 0.6666666865348816,
"num_tokens": 5913948.0,
"repeat_count": 1.0,
- "routers_loss": 0.00408853217959404,
+ "routers_loss": 0.004082011990249157,
"skip_count": 0.0,
"step": 3668,
"text_loss": 0.4127517640590668
@@ -34863,13 +34863,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.041259765625,
"learning_rate": 0.0007781549522484503,
- "loss": 0.0067,
+ "loss": 0.0066,
"macro_f1": 0.9265305995941162,
"num_tokens": 5917360.0,
"repeat_count": 3.0,
- "routers_loss": 0.02851647138595581,
+ "routers_loss": 0.027505695819854736,
"skip_count": 1.0,
"step": 3670,
"text_loss": 0.23892618715763092
@@ -34882,13 +34882,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.031005859375,
+ "grad_norm": 0.0306396484375,
"learning_rate": 0.0007778976985118018,
- "loss": 0.0086,
+ "loss": 0.0083,
"macro_f1": 0.6666666865348816,
"num_tokens": 5920524.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030399872921407223,
+ "routers_loss": 0.0024977331049740314,
"skip_count": 2.0,
"step": 3672,
"text_loss": 0.5076471567153931
@@ -34901,13 +34901,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05908203125,
+ "grad_norm": 0.0576171875,
"learning_rate": 0.0007776403382842312,
- "loss": 0.0061,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 5923632.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014176326803863049,
+ "routers_loss": 0.0015700991498306394,
"skip_count": 0.0,
"step": 3674,
"text_loss": 0.6287924647331238
@@ -34920,13 +34920,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06591796875,
+ "grad_norm": 0.05810546875,
"learning_rate": 0.0007773828716643591,
- "loss": 0.0084,
+ "loss": 0.0085,
"macro_f1": 0.3272727429866791,
"num_tokens": 5926438.0,
"repeat_count": 1.0,
- "routers_loss": 0.0505419559776783,
+ "routers_loss": 0.05108916014432907,
"skip_count": 0.0,
"step": 3676,
"text_loss": 0.26517006754875183
@@ -34939,13 +34939,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0007771252987508474,
"loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 5930081.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034831957891583443,
+ "routers_loss": 0.003439917229115963,
"skip_count": 0.0,
"step": 3678,
"text_loss": 0.5189079642295837
@@ -34958,13 +34958,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.056884765625,
"learning_rate": 0.0007768676196423984,
"loss": 0.0064,
"macro_f1": 1.0,
"num_tokens": 5933463.0,
"repeat_count": 1.0,
- "routers_loss": 0.0020620382856577635,
+ "routers_loss": 0.001935846172273159,
"skip_count": 1.0,
"step": 3680,
"text_loss": 0.6703575849533081
@@ -34972,18 +34972,18 @@
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 27.0,
"epoch": 17.286469034341064,
- "f1_execute": 0.9629629254341125,
+ "f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0007766098344377553,
- "loss": 0.0084,
- "macro_f1": 0.32098764181137085,
+ "loss": 0.0082,
+ "macro_f1": 0.31446540355682373,
"num_tokens": 5937098.0,
"repeat_count": 0.0,
- "routers_loss": 0.03850153833627701,
+ "routers_loss": 0.0384826585650444,
"skip_count": 2.0,
"step": 3682,
"text_loss": 0.6424444913864136
@@ -34996,13 +34996,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031982421875,
+ "grad_norm": 0.0301513671875,
"learning_rate": 0.0007763519432357018,
- "loss": 0.0065,
+ "loss": 0.0063,
"macro_f1": 0.3333333432674408,
"num_tokens": 5940436.0,
"repeat_count": 0.0,
- "routers_loss": 0.000853471748996526,
+ "routers_loss": 0.0008654671837575734,
"skip_count": 0.0,
"step": 3684,
"text_loss": 0.4189988672733307
@@ -35015,13 +35015,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05712890625,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.0007760939461350623,
- "loss": 0.0107,
+ "loss": 0.0111,
"macro_f1": 0.6666666865348816,
"num_tokens": 5943731.0,
"repeat_count": 0.0,
- "routers_loss": 0.007630084175616503,
+ "routers_loss": 0.007468715775758028,
"skip_count": 2.0,
"step": 3686,
"text_loss": 0.2875453233718872
@@ -35036,11 +35036,11 @@
"f1_skip": 0.0,
"grad_norm": 0.041259765625,
"learning_rate": 0.0007758358432347019,
- "loss": 0.0061,
+ "loss": 0.0058,
"macro_f1": 0.3333333432674408,
"num_tokens": 5946707.0,
"repeat_count": 0.0,
- "routers_loss": 0.001303135184571147,
+ "routers_loss": 0.001252831774763763,
"skip_count": 0.0,
"step": 3688,
"text_loss": 0.5093055367469788
@@ -35053,13 +35053,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0007755776346335259,
- "loss": 0.0058,
+ "loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 5949833.0,
"repeat_count": 0.0,
- "routers_loss": 0.001894078915938735,
+ "routers_loss": 0.001680848654359579,
"skip_count": 0.0,
"step": 3690,
"text_loss": 0.4031114876270294
@@ -35072,13 +35072,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.025146484375,
+ "grad_norm": 0.0255126953125,
"learning_rate": 0.0007753193204304807,
- "loss": 0.0056,
+ "loss": 0.0058,
"macro_f1": 0.6666666865348816,
"num_tokens": 5953095.0,
"repeat_count": 0.0,
- "routers_loss": 0.005708714015781879,
+ "routers_loss": 0.0047258250415325165,
"skip_count": 2.0,
"step": 3692,
"text_loss": 0.17632785439491272
@@ -35091,13 +35091,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.0007750609007245524,
"loss": 0.0062,
"macro_f1": 1.0,
"num_tokens": 5955971.0,
"repeat_count": 2.0,
- "routers_loss": 0.0019924843218177557,
+ "routers_loss": 0.001980359200388193,
"skip_count": 4.0,
"step": 3694,
"text_loss": 0.3423727750778198
@@ -35110,13 +35110,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0255126953125,
+ "grad_norm": 0.0238037109375,
"learning_rate": 0.0007748023756147679,
- "loss": 0.007,
+ "loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 5958948.0,
"repeat_count": 0.0,
- "routers_loss": 0.005303190555423498,
+ "routers_loss": 0.00511702848598361,
"skip_count": 0.0,
"step": 3696,
"text_loss": 0.28279972076416016
@@ -35129,13 +35129,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0007745437452001949,
- "loss": 0.0063,
+ "loss": 0.0064,
"macro_f1": 0.3333333432674408,
"num_tokens": 5961819.0,
"repeat_count": 0.0,
- "routers_loss": 0.0004839526955038309,
+ "routers_loss": 0.0005220443126745522,
"skip_count": 0.0,
"step": 3698,
"text_loss": 0.4793325662612915
@@ -35148,13 +35148,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0007742850095799408,
- "loss": 0.0083,
+ "loss": 0.0084,
"macro_f1": 0.3272727429866791,
"num_tokens": 5964625.0,
"repeat_count": 1.0,
- "routers_loss": 0.06377380341291428,
+ "routers_loss": 0.06411020457744598,
"skip_count": 0.0,
"step": 3700,
"text_loss": 0.2825184464454651
@@ -35167,13 +35167,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0654296875,
+ "grad_norm": 0.0751953125,
"learning_rate": 0.0007740261688531536,
- "loss": 0.007,
+ "loss": 0.0068,
"macro_f1": 0.6666666865348816,
"num_tokens": 5967134.0,
"repeat_count": 0.0,
- "routers_loss": 0.00462002120912075,
+ "routers_loss": 0.004408109001815319,
"skip_count": 3.0,
"step": 3702,
"text_loss": 0.690429151058197
@@ -35186,13 +35186,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0228271484375,
+ "grad_norm": 0.0279541015625,
"learning_rate": 0.0007737672231190215,
- "loss": 0.0033,
+ "loss": 0.0034,
"macro_f1": 0.3333333432674408,
"num_tokens": 5969831.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006775400252081454,
+ "routers_loss": 0.0006747521692886949,
"skip_count": 0.0,
"step": 3704,
"text_loss": 0.32556024193763733
@@ -35205,13 +35205,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02783203125,
+ "grad_norm": 0.031005859375,
"learning_rate": 0.0007735081724767732,
- "loss": 0.0061,
+ "loss": 0.0059,
"macro_f1": 0.3333333432674408,
"num_tokens": 5973015.0,
"repeat_count": 0.0,
- "routers_loss": 0.001372992410324514,
+ "routers_loss": 0.0020414739847183228,
"skip_count": 0.0,
"step": 3706,
"text_loss": 0.5876469612121582
@@ -35224,13 +35224,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.072265625,
"learning_rate": 0.0007732490170256769,
"loss": 0.0071,
"macro_f1": 0.6666666865348816,
"num_tokens": 5975778.0,
"repeat_count": 1.0,
- "routers_loss": 0.005310074891895056,
+ "routers_loss": 0.005610425490885973,
"skip_count": 0.0,
"step": 3708,
"text_loss": 0.2968577444553375
@@ -35243,13 +35243,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05078125,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0007729897568650422,
- "loss": 0.01,
+ "loss": 0.0097,
"macro_f1": 0.3333333432674408,
"num_tokens": 5979115.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012178041506558657,
+ "routers_loss": 0.001248046406544745,
"skip_count": 0.0,
"step": 3710,
"text_loss": 0.626361608505249
@@ -35262,13 +35262,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0595703125,
+ "grad_norm": 0.06787109375,
"learning_rate": 0.0007727303920942176,
- "loss": 0.01,
+ "loss": 0.0102,
"macro_f1": 0.6666666865348816,
"num_tokens": 5982213.0,
"repeat_count": 0.0,
- "routers_loss": 0.004617640748620033,
+ "routers_loss": 0.005791695322841406,
"skip_count": 2.0,
"step": 3712,
"text_loss": 0.4133484661579132
@@ -35281,13 +35281,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0791015625,
+ "grad_norm": 0.08740234375,
"learning_rate": 0.0007724709228125922,
- "loss": 0.0106,
+ "loss": 0.0105,
"macro_f1": 0.5492662787437439,
"num_tokens": 5984930.0,
"repeat_count": 0.0,
- "routers_loss": 0.020924020558595657,
+ "routers_loss": 0.02114664763212204,
"skip_count": 2.0,
"step": 3714,
"text_loss": 0.4646461308002472
@@ -35300,13 +35300,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.0007722113491195952,
- "loss": 0.0059,
+ "loss": 0.0058,
"macro_f1": 1.0,
"num_tokens": 5988017.0,
"repeat_count": 2.0,
- "routers_loss": 0.0053578754886984825,
+ "routers_loss": 0.005913930479437113,
"skip_count": 5.0,
"step": 3716,
"text_loss": 0.15474505722522736
@@ -35319,13 +35319,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.026123046875,
+ "grad_norm": 0.02685546875,
"learning_rate": 0.0007719516711146957,
- "loss": 0.0075,
+ "loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 5991562.0,
"repeat_count": 0.0,
- "routers_loss": 0.006991801783442497,
+ "routers_loss": 0.0075925313867628574,
"skip_count": 2.0,
"step": 3718,
"text_loss": 0.5293686985969543
@@ -35338,13 +35338,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031005859375,
+ "grad_norm": 0.037353515625,
"learning_rate": 0.000771691888897403,
- "loss": 0.0054,
+ "loss": 0.0051,
"macro_f1": 0.3333333432674408,
"num_tokens": 5994675.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011527709430083632,
+ "routers_loss": 0.0012335237115621567,
"skip_count": 0.0,
"step": 3720,
"text_loss": 0.5210637450218201
@@ -35357,13 +35357,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.09521484375,
+ "grad_norm": 0.0771484375,
"learning_rate": 0.0007714320025672657,
- "loss": 0.008,
+ "loss": 0.0077,
"macro_f1": 0.6666666865348816,
"num_tokens": 5999070.0,
"repeat_count": 0.0,
- "routers_loss": 0.011113573797047138,
+ "routers_loss": 0.010582062415778637,
"skip_count": 2.0,
"step": 3722,
"text_loss": 0.2783571779727936
@@ -35376,13 +35376,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03369140625,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.000771172012223873,
- "loss": 0.008,
+ "loss": 0.0078,
"macro_f1": 0.6598639488220215,
"num_tokens": 6002702.0,
"repeat_count": 1.0,
- "routers_loss": 0.014584671705961227,
+ "routers_loss": 0.015008784830570221,
"skip_count": 3.0,
"step": 3724,
"text_loss": 0.358705073595047
@@ -35395,13 +35395,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05419921875,
+ "grad_norm": 0.052734375,
"learning_rate": 0.0007709119179668538,
"loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 6005517.0,
"repeat_count": 0.0,
- "routers_loss": 0.001164636923931539,
+ "routers_loss": 0.00111615180503577,
"skip_count": 0.0,
"step": 3726,
"text_loss": 0.45202162861824036
@@ -35414,13 +35414,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.034912109375,
"learning_rate": 0.0007706517198958764,
- "loss": 0.0098,
+ "loss": 0.0096,
"macro_f1": 0.6595745086669922,
"num_tokens": 6009111.0,
"repeat_count": 1.0,
- "routers_loss": 0.05235295370221138,
+ "routers_loss": 0.05215252563357353,
"skip_count": 4.0,
"step": 3728,
"text_loss": 0.20360413193702698
@@ -35433,13 +35433,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0007703914181106497,
- "loss": 0.0077,
+ "loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 6012989.0,
"repeat_count": 0.0,
- "routers_loss": 0.01087163109332323,
+ "routers_loss": 0.010039499960839748,
"skip_count": 3.0,
"step": 3730,
"text_loss": 0.20334361493587494
@@ -35452,13 +35452,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07177734375,
+ "grad_norm": 0.08203125,
"learning_rate": 0.0007701310127109211,
- "loss": 0.0063,
+ "loss": 0.0062,
"macro_f1": 0.3272727429866791,
"num_tokens": 6016420.0,
"repeat_count": 0.0,
- "routers_loss": 0.010110805742442608,
+ "routers_loss": 0.01090205181390047,
"skip_count": 1.0,
"step": 3732,
"text_loss": 0.47959551215171814
@@ -35471,13 +35471,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.0,
"f1_skip": 0.888888955116272,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0007698705037964791,
- "loss": 0.0078,
+ "loss": 0.0076,
"macro_f1": 0.6225374937057495,
"num_tokens": 6019551.0,
"repeat_count": 0.0,
- "routers_loss": 0.026909299194812775,
+ "routers_loss": 0.02677762135863304,
"skip_count": 5.0,
"step": 3734,
"text_loss": 0.2621438801288605
@@ -35490,13 +35490,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.056640625,
"learning_rate": 0.000769609891467151,
- "loss": 0.0122,
+ "loss": 0.0119,
"macro_f1": 0.6666666865348816,
"num_tokens": 6022262.0,
"repeat_count": 1.0,
- "routers_loss": 0.003602684009820223,
+ "routers_loss": 0.00460716662928462,
"skip_count": 0.0,
"step": 3736,
"text_loss": 0.3433022201061249
@@ -35509,13 +35509,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.037109375,
"learning_rate": 0.0007693491758228037,
- "loss": 0.005,
+ "loss": 0.0052,
"macro_f1": 0.6666666865348816,
"num_tokens": 6025723.0,
"repeat_count": 0.0,
- "routers_loss": 0.00290105608291924,
+ "routers_loss": 0.0036111194640398026,
"skip_count": 2.0,
"step": 3738,
"text_loss": 0.38703784346580505
@@ -35528,13 +35528,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0007690883569633442,
"loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 6028652.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031469720415771008,
+ "routers_loss": 0.003299296135082841,
"skip_count": 0.0,
"step": 3740,
"text_loss": 0.24203069508075714
@@ -35547,13 +35547,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.027587890625,
+ "grad_norm": 0.0277099609375,
"learning_rate": 0.0007688274349887188,
- "loss": 0.0048,
+ "loss": 0.0047,
"macro_f1": 0.3333333432674408,
"num_tokens": 6032280.0,
"repeat_count": 0.0,
- "routers_loss": 0.0029467069543898106,
+ "routers_loss": 0.003173880511894822,
"skip_count": 0.0,
"step": 3742,
"text_loss": 0.2827291488647461
@@ -35566,13 +35566,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.0302734375,
"learning_rate": 0.0007685664099989131,
- "loss": 0.0074,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 6035111.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009511710377410054,
+ "routers_loss": 0.0008576177642680705,
"skip_count": 0.0,
"step": 3744,
"text_loss": 0.43613526225090027
@@ -35585,13 +35585,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0252685546875,
+ "grad_norm": 0.0274658203125,
"learning_rate": 0.0007683052820939524,
"loss": 0.0072,
"macro_f1": 0.6666666865348816,
"num_tokens": 6038428.0,
"repeat_count": 0.0,
- "routers_loss": 0.004079817794263363,
+ "routers_loss": 0.004335585981607437,
"skip_count": 2.0,
"step": 3746,
"text_loss": 1.0385624170303345
@@ -35604,13 +35604,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0007680440513739015,
"loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 6041185.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007996217464096844,
+ "routers_loss": 0.0008210531086660922,
"skip_count": 0.0,
"step": 3748,
"text_loss": 0.7070431709289551
@@ -35623,13 +35623,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.041015625,
+ "grad_norm": 0.056640625,
"learning_rate": 0.0007677827179388646,
- "loss": 0.0088,
+ "loss": 0.0089,
"macro_f1": 1.0,
"num_tokens": 6046333.0,
"repeat_count": 1.0,
- "routers_loss": 0.0047629233449697495,
+ "routers_loss": 0.003778942162171006,
"skip_count": 1.0,
"step": 3750,
"text_loss": 0.3682238757610321
@@ -35642,13 +35642,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.05908203125,
+ "grad_norm": 0.08984375,
"learning_rate": 0.000767521281888985,
- "loss": 0.0087,
+ "loss": 0.009,
"macro_f1": 1.0,
"num_tokens": 6049528.0,
"repeat_count": 1.0,
- "routers_loss": 0.0039178295992314816,
+ "routers_loss": 0.002767334459349513,
"skip_count": 1.0,
"step": 3752,
"text_loss": 0.7619418501853943
@@ -35661,13 +35661,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03662109375,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0007672597433244455,
- "loss": 0.0109,
+ "loss": 0.0108,
"macro_f1": 0.6666666865348816,
"num_tokens": 6053202.0,
"repeat_count": 0.0,
- "routers_loss": 0.004995788913220167,
+ "routers_loss": 0.004796457476913929,
"skip_count": 2.0,
"step": 3754,
"text_loss": 0.4157083034515381
@@ -35680,13 +35680,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.062255859375,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0007669981023454682,
- "loss": 0.0125,
+ "loss": 0.0126,
"macro_f1": 0.3333333432674408,
"num_tokens": 6056609.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012595724547281861,
+ "routers_loss": 0.0013067846884950995,
"skip_count": 0.0,
"step": 3756,
"text_loss": 0.4529118537902832
@@ -35699,13 +35699,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0284423828125,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0007667363590523142,
"loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 6060504.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012152433628216386,
+ "routers_loss": 0.0010285493917763233,
"skip_count": 0.0,
"step": 3758,
"text_loss": 0.8363246321678162
@@ -35718,13 +35718,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.055419921875,
"learning_rate": 0.0007664745135452844,
- "loss": 0.0093,
+ "loss": 0.0092,
"macro_f1": 0.6666666865348816,
"num_tokens": 6063526.0,
"repeat_count": 0.0,
- "routers_loss": 0.006478998344391584,
+ "routers_loss": 0.006289863493293524,
"skip_count": 3.0,
"step": 3760,
"text_loss": 0.5313657522201538
@@ -35737,13 +35737,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.05517578125,
"learning_rate": 0.0007662125659247183,
- "loss": 0.0096,
+ "loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 6067147.0,
"repeat_count": 0.0,
- "routers_loss": 0.003008047351613641,
+ "routers_loss": 0.0028537956532090902,
"skip_count": 0.0,
"step": 3762,
"text_loss": 0.5668109059333801
@@ -35756,13 +35756,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03369140625,
+ "grad_norm": 0.039794921875,
"learning_rate": 0.0007659505162909949,
"loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 6070350.0,
"repeat_count": 0.0,
- "routers_loss": 0.002841299632564187,
+ "routers_loss": 0.0026814753655344248,
"skip_count": 0.0,
"step": 3764,
"text_loss": 0.4983512759208679
@@ -35775,13 +35775,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.055419921875,
+ "grad_norm": 0.056884765625,
"learning_rate": 0.0007656883647445318,
- "loss": 0.01,
+ "loss": 0.0099,
"macro_f1": 0.6666666865348816,
"num_tokens": 6073091.0,
"repeat_count": 0.0,
- "routers_loss": 0.006070348434150219,
+ "routers_loss": 0.005981382913887501,
"skip_count": 1.0,
"step": 3766,
"text_loss": 0.30372318625450134
@@ -35794,13 +35794,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0289306640625,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0007654261113857863,
- "loss": 0.0073,
+ "loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 6076244.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008278369787149131,
+ "routers_loss": 0.000803640519734472,
"skip_count": 0.0,
"step": 3768,
"text_loss": 0.6100738048553467
@@ -35813,13 +35813,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02392578125,
+ "grad_norm": 0.027587890625,
"learning_rate": 0.0007651637563152539,
"loss": 0.0055,
"macro_f1": 0.3333333432674408,
"num_tokens": 6078936.0,
"repeat_count": 0.0,
- "routers_loss": 0.001354316365905106,
+ "routers_loss": 0.0013324898900464177,
"skip_count": 0.0,
"step": 3770,
"text_loss": 0.4733821153640747
@@ -35832,13 +35832,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0242919921875,
+ "grad_norm": 0.029541015625,
"learning_rate": 0.0007649012996334701,
- "loss": 0.0051,
+ "loss": 0.0054,
"macro_f1": 0.6666666865348816,
"num_tokens": 6081951.0,
"repeat_count": 1.0,
- "routers_loss": 0.0019684957806020975,
+ "routers_loss": 0.0021543330512940884,
"skip_count": 0.0,
"step": 3772,
"text_loss": 0.6794875860214233
@@ -35851,13 +35851,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.0007646387414410085,
- "loss": 0.0076,
+ "loss": 0.0073,
"macro_f1": 0.3333333432674408,
"num_tokens": 6085165.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005270782858133316,
+ "routers_loss": 0.0005426189745776355,
"skip_count": 0.0,
"step": 3774,
"text_loss": 0.5886107683181763
@@ -35870,13 +35870,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.028076171875,
+ "grad_norm": 0.0262451171875,
"learning_rate": 0.0007643760818384819,
"loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 6088370.0,
"repeat_count": 0.0,
- "routers_loss": 0.0029050554148852825,
+ "routers_loss": 0.002537576947361231,
"skip_count": 0.0,
"step": 3776,
"text_loss": 0.23591920733451843
@@ -35889,13 +35889,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.03564453125,
"learning_rate": 0.0007641133209265423,
- "loss": 0.0064,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 6092319.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026071348693221807,
+ "routers_loss": 0.002613696036860347,
"skip_count": 0.0,
"step": 3778,
"text_loss": 0.3217754662036896
@@ -35908,13 +35908,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051025390625,
+ "grad_norm": 0.052978515625,
"learning_rate": 0.0007638504588058796,
- "loss": 0.0101,
+ "loss": 0.0105,
"macro_f1": 0.3333333432674408,
"num_tokens": 6095799.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008351493743248284,
+ "routers_loss": 0.0007219464750960469,
"skip_count": 0.0,
"step": 3780,
"text_loss": 0.4276983141899109
@@ -35927,13 +35927,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.026611328125,
+ "grad_norm": 0.0263671875,
"learning_rate": 0.0007635874955772234,
- "loss": 0.007,
+ "loss": 0.0069,
"macro_f1": 0.6666666865348816,
"num_tokens": 6098789.0,
"repeat_count": 0.0,
- "routers_loss": 0.005872148554772139,
+ "routers_loss": 0.005965052172541618,
"skip_count": 3.0,
"step": 3782,
"text_loss": 0.30936646461486816
@@ -35946,13 +35946,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0703125,
+ "grad_norm": 0.07177734375,
"learning_rate": 0.0007633244313413417,
"loss": 0.0077,
"macro_f1": 0.3333333432674408,
"num_tokens": 6101631.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007862916099838912,
+ "routers_loss": 0.0007469559786841273,
"skip_count": 0.0,
"step": 3784,
"text_loss": 0.44460123777389526
@@ -35965,13 +35965,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0478515625,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0007630612661990412,
- "loss": 0.0098,
+ "loss": 0.0097,
"macro_f1": 0.6666666865348816,
"num_tokens": 6105097.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037640000227838755,
+ "routers_loss": 0.004300760570913553,
"skip_count": 1.0,
"step": 3786,
"text_loss": 0.41950157284736633
@@ -35984,13 +35984,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0007627980002511672,
- "loss": 0.0068,
+ "loss": 0.0069,
"macro_f1": 0.6666666865348816,
"num_tokens": 6107847.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023107193410396576,
+ "routers_loss": 0.0023050960153341293,
"skip_count": 1.0,
"step": 3788,
"text_loss": 0.48561373353004456
@@ -36003,13 +36003,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03271484375,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.0007625346335986039,
- "loss": 0.0066,
+ "loss": 0.0063,
"macro_f1": 0.3333333432674408,
"num_tokens": 6110546.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017923865234479308,
+ "routers_loss": 0.0018124044872820377,
"skip_count": 0.0,
"step": 3790,
"text_loss": 0.20882295072078705
@@ -36022,13 +36022,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0007622711663422735,
"loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 6113600.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007700122077949345,
+ "routers_loss": 0.0007613401976414025,
"skip_count": 0.0,
"step": 3792,
"text_loss": 0.31751760840415955
@@ -36041,13 +36041,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04248046875,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0007620075985831375,
- "loss": 0.009,
+ "loss": 0.0092,
"macro_f1": 0.6666666865348816,
"num_tokens": 6116916.0,
"repeat_count": 0.0,
- "routers_loss": 0.004986821208149195,
+ "routers_loss": 0.005452962126582861,
"skip_count": 2.0,
"step": 3794,
"text_loss": 0.3246645927429199
@@ -36060,13 +36060,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0272216796875,
+ "grad_norm": 0.0306396484375,
"learning_rate": 0.0007617439304221956,
"loss": 0.0055,
"macro_f1": 0.6666666865348816,
"num_tokens": 6120056.0,
"repeat_count": 2.0,
- "routers_loss": 0.004177430644631386,
+ "routers_loss": 0.0043787881731987,
"skip_count": 0.0,
"step": 3796,
"text_loss": 0.4859195947647095
@@ -36079,13 +36079,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0228271484375,
+ "grad_norm": 0.02294921875,
"learning_rate": 0.0007614801619604856,
- "loss": 0.0065,
+ "loss": 0.0064,
"macro_f1": 0.3333333432674408,
"num_tokens": 6122668.0,
"repeat_count": 0.0,
- "routers_loss": 0.003494138829410076,
+ "routers_loss": 0.0033891722559928894,
"skip_count": 0.0,
"step": 3798,
"text_loss": 0.48194369673728943
@@ -36098,13 +36098,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0252685546875,
+ "grad_norm": 0.02587890625,
"learning_rate": 0.0007612162932990845,
- "loss": 0.0063,
+ "loss": 0.0061,
"macro_f1": 0.3333333432674408,
"num_tokens": 6126792.0,
"repeat_count": 0.0,
- "routers_loss": 0.001831608940847218,
+ "routers_loss": 0.001883238204754889,
"skip_count": 0.0,
"step": 3800,
"text_loss": 0.3740062117576599
@@ -36117,13 +36117,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0007609523245391068,
- "loss": 0.0078,
+ "loss": 0.0076,
"macro_f1": 0.6666666865348816,
"num_tokens": 6129801.0,
"repeat_count": 0.0,
- "routers_loss": 0.010433467105031013,
+ "routers_loss": 0.00882677361369133,
"skip_count": 2.0,
"step": 3802,
"text_loss": 0.5759486556053162
@@ -36136,13 +36136,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0007606882557817062,
- "loss": 0.0057,
+ "loss": 0.0058,
"macro_f1": 0.6666666865348816,
"num_tokens": 6133613.0,
"repeat_count": 0.0,
- "routers_loss": 0.009141471236944199,
+ "routers_loss": 0.009537030011415482,
"skip_count": 2.0,
"step": 3804,
"text_loss": 0.3217554986476898
@@ -36155,13 +36155,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0235595703125,
+ "grad_norm": 0.0220947265625,
"learning_rate": 0.0007604240871280742,
- "loss": 0.0055,
+ "loss": 0.0053,
"macro_f1": 0.3333333432674408,
"num_tokens": 6137784.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024337477516382933,
+ "routers_loss": 0.0023913346230983734,
"skip_count": 0.0,
"step": 3806,
"text_loss": 0.3718445599079132
@@ -36174,13 +36174,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0390625,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0007601598186794407,
- "loss": 0.0083,
+ "loss": 0.0081,
"macro_f1": 0.6603773832321167,
"num_tokens": 6141356.0,
"repeat_count": 1.0,
- "routers_loss": 0.03635421022772789,
+ "routers_loss": 0.033796411007642746,
"skip_count": 1.0,
"step": 3808,
"text_loss": 0.2717749774456024
@@ -36193,13 +36193,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.000759895450537074,
- "loss": 0.0101,
+ "loss": 0.01,
"macro_f1": 0.6666666865348816,
"num_tokens": 6144448.0,
"repeat_count": 0.0,
- "routers_loss": 0.002765925833955407,
+ "routers_loss": 0.0037919918540865183,
"skip_count": 2.0,
"step": 3810,
"text_loss": 0.5935076475143433
@@ -36212,13 +36212,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03369140625,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0007596309828022803,
- "loss": 0.0072,
+ "loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 6147526.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009747639996930957,
+ "routers_loss": 0.0008182782912626863,
"skip_count": 0.0,
"step": 3812,
"text_loss": 0.449336439371109
@@ -36231,13 +36231,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.03125,
"learning_rate": 0.0007593664155764044,
"loss": 0.0061,
"macro_f1": 0.6666666865348816,
"num_tokens": 6150620.0,
"repeat_count": 1.0,
- "routers_loss": 0.001395601429976523,
+ "routers_loss": 0.001734903547912836,
"skip_count": 0.0,
"step": 3814,
"text_loss": 0.6647221446037292
@@ -36250,13 +36250,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.037353515625,
"learning_rate": 0.0007591017489608286,
- "loss": 0.0092,
+ "loss": 0.0088,
"macro_f1": 0.3272727429866791,
"num_tokens": 6153714.0,
"repeat_count": 1.0,
- "routers_loss": 0.048050083220005035,
+ "routers_loss": 0.04721754416823387,
"skip_count": 0.0,
"step": 3816,
"text_loss": 0.25481200218200684
@@ -36269,13 +36269,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03662109375,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0007588369830569738,
- "loss": 0.0062,
+ "loss": 0.0061,
"macro_f1": 0.3333333432674408,
"num_tokens": 6156974.0,
"repeat_count": 0.0,
- "routers_loss": 0.00022119733330328017,
+ "routers_loss": 0.0002484306460246444,
"skip_count": 0.0,
"step": 3818,
"text_loss": 0.7195295691490173
@@ -36288,13 +36288,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02783203125,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0007585721179662988,
"loss": 0.0072,
"macro_f1": 0.6666666865348816,
"num_tokens": 6159660.0,
"repeat_count": 0.0,
- "routers_loss": 0.005448841955512762,
+ "routers_loss": 0.0051363613456487656,
"skip_count": 2.0,
"step": 3820,
"text_loss": 0.5073586702346802
@@ -36307,13 +36307,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0458984375,
+ "grad_norm": 0.052734375,
"learning_rate": 0.0007583071537903005,
- "loss": 0.0067,
+ "loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 6163146.0,
"repeat_count": 0.0,
- "routers_loss": 0.007093957159668207,
+ "routers_loss": 0.006719176657497883,
"skip_count": 0.0,
"step": 3822,
"text_loss": 0.6950558423995972
@@ -36326,13 +36326,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.025634765625,
+ "grad_norm": 0.0269775390625,
"learning_rate": 0.0007580420906305136,
- "loss": 0.007,
+ "loss": 0.0073,
"macro_f1": 1.0,
"num_tokens": 6166257.0,
"repeat_count": 1.0,
- "routers_loss": 0.008060536347329617,
+ "routers_loss": 0.00871267355978489,
"skip_count": 3.0,
"step": 3824,
"text_loss": 0.2549148201942444
@@ -36345,13 +36345,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025146484375,
+ "grad_norm": 0.022705078125,
"learning_rate": 0.0007577769285885109,
- "loss": 0.004,
+ "loss": 0.0039,
"macro_f1": 0.3333333432674408,
"num_tokens": 6169624.0,
"repeat_count": 0.0,
- "routers_loss": 0.001302229124121368,
+ "routers_loss": 0.0015642556827515364,
"skip_count": 0.0,
"step": 3826,
"text_loss": 0.3720305860042572
@@ -36364,13 +36364,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.038330078125,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0007575116677659029,
- "loss": 0.0076,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 6172673.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010101167717948556,
+ "routers_loss": 0.0011551049537956715,
"skip_count": 0.0,
"step": 3828,
"text_loss": 0.6819429397583008
@@ -36383,13 +36383,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.040771484375,
"learning_rate": 0.0007572463082643377,
- "loss": 0.0083,
+ "loss": 0.0084,
"macro_f1": 0.3333333432674408,
"num_tokens": 6175414.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009081853204406798,
+ "routers_loss": 0.0008922060951590538,
"skip_count": 0.0,
"step": 3830,
"text_loss": 0.5424665212631226
@@ -36402,13 +36402,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03515625,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0007569808501855023,
"loss": 0.0044,
"macro_f1": 0.6666666865348816,
"num_tokens": 6178701.0,
"repeat_count": 0.0,
- "routers_loss": 0.0040206871926784515,
+ "routers_loss": 0.004167596809566021,
"skip_count": 1.0,
"step": 3832,
"text_loss": 0.4429764151573181
@@ -36421,13 +36421,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.00075671529363112,
"loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 6183036.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009683453245088458,
+ "routers_loss": 0.0008732969872653484,
"skip_count": 0.0,
"step": 3834,
"text_loss": 0.8015334010124207
@@ -36440,13 +36440,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0277099609375,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0007564496387029531,
- "loss": 0.0056,
+ "loss": 0.0055,
"macro_f1": 0.6666666865348816,
"num_tokens": 6186325.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021183546632528305,
+ "routers_loss": 0.0021374202333390713,
"skip_count": 1.0,
"step": 3836,
"text_loss": 0.4233771562576294
@@ -36459,13 +36459,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.000756183885502801,
- "loss": 0.0059,
+ "loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 6189919.0,
"repeat_count": 1.0,
- "routers_loss": 0.0034987039398401976,
+ "routers_loss": 0.004017227329313755,
"skip_count": 0.0,
"step": 3838,
"text_loss": 0.33691394329071045
@@ -36478,13 +36478,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.01953125,
+ "grad_norm": 0.018310546875,
"learning_rate": 0.0007559180341325005,
- "loss": 0.0048,
+ "loss": 0.0046,
"macro_f1": 0.3333333432674408,
"num_tokens": 6193412.0,
"repeat_count": 0.0,
- "routers_loss": 0.001348655903711915,
+ "routers_loss": 0.0013120946241542697,
"skip_count": 0.0,
"step": 3840,
"text_loss": 0.14970099925994873
@@ -36497,13 +36497,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.029541015625,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0007556520846939265,
"loss": 0.0061,
"macro_f1": 0.5492662787437439,
"num_tokens": 6196588.0,
"repeat_count": 0.0,
- "routers_loss": 0.011758741922676563,
+ "routers_loss": 0.011793316341936588,
"skip_count": 2.0,
"step": 3842,
"text_loss": 0.2714047133922577
@@ -36516,13 +36516,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.03466796875,
+ "grad_norm": 0.031494140625,
"learning_rate": 0.0007553860372889914,
- "loss": 0.0064,
+ "loss": 0.0062,
"macro_f1": 1.0,
"num_tokens": 6200841.0,
"repeat_count": 1.0,
- "routers_loss": 0.022454025223851204,
+ "routers_loss": 0.019968654960393906,
"skip_count": 4.0,
"step": 3844,
"text_loss": 0.23680976033210754
@@ -36535,13 +36535,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.049560546875,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0007551198920196452,
"loss": 0.0079,
"macro_f1": 0.5492662787437439,
"num_tokens": 6203797.0,
"repeat_count": 0.0,
- "routers_loss": 0.012088865973055363,
+ "routers_loss": 0.013615630567073822,
"skip_count": 2.0,
"step": 3846,
"text_loss": 0.25839608907699585
@@ -36554,13 +36554,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.057373046875,
+ "grad_norm": 0.0546875,
"learning_rate": 0.000754853648987875,
- "loss": 0.0073,
+ "loss": 0.0072,
"macro_f1": 0.6666666865348816,
"num_tokens": 6206790.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025066444650292397,
+ "routers_loss": 0.002420815173536539,
"skip_count": 1.0,
"step": 3848,
"text_loss": 0.5358025431632996
@@ -36573,13 +36573,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.0007545873082957057,
- "loss": 0.0073,
+ "loss": 0.0072,
"macro_f1": 0.9265305995941162,
"num_tokens": 6209791.0,
"repeat_count": 1.0,
- "routers_loss": 0.01811581663787365,
+ "routers_loss": 0.018236197531223297,
"skip_count": 3.0,
"step": 3850,
"text_loss": 0.1463700383901596
@@ -36592,13 +36592,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0390625,
+ "grad_norm": 0.034423828125,
"learning_rate": 0.0007543208700451998,
"loss": 0.0052,
"macro_f1": 0.6666666865348816,
"num_tokens": 6212792.0,
"repeat_count": 0.0,
- "routers_loss": 0.005889591295272112,
+ "routers_loss": 0.006242573726922274,
"skip_count": 3.0,
"step": 3852,
"text_loss": 0.9441591501235962
@@ -36611,13 +36611,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0299072265625,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0007540543343384565,
- "loss": 0.0064,
+ "loss": 0.0062,
"macro_f1": 0.3272727429866791,
"num_tokens": 6215747.0,
"repeat_count": 0.0,
- "routers_loss": 0.015324318781495094,
+ "routers_loss": 0.01451140083372593,
"skip_count": 1.0,
"step": 3854,
"text_loss": 0.41610902547836304
@@ -36630,13 +36630,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0007537877012776132,
"loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 6218593.0,
"repeat_count": 0.0,
- "routers_loss": 0.0003138817264698446,
+ "routers_loss": 0.00037674361374229193,
"skip_count": 0.0,
"step": 3856,
"text_loss": 0.6048852205276489
@@ -36649,13 +36649,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0269775390625,
+ "grad_norm": 0.0255126953125,
"learning_rate": 0.0007535209709648439,
- "loss": 0.0044,
+ "loss": 0.0045,
"macro_f1": 1.0,
"num_tokens": 6221315.0,
"repeat_count": 1.0,
- "routers_loss": 0.006152884569019079,
+ "routers_loss": 0.005776284262537956,
"skip_count": 3.0,
"step": 3858,
"text_loss": 0.35627537965774536
@@ -36668,13 +36668,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025634765625,
+ "grad_norm": 0.0245361328125,
"learning_rate": 0.0007532541435023605,
- "loss": 0.0048,
+ "loss": 0.0049,
"macro_f1": 0.3333333432674408,
"num_tokens": 6225012.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009145989897660911,
+ "routers_loss": 0.0009280376834794879,
"skip_count": 0.0,
"step": 3860,
"text_loss": 0.6440183520317078
@@ -36687,13 +36687,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025146484375,
+ "grad_norm": 0.0224609375,
"learning_rate": 0.0007529872189924114,
"loss": 0.0046,
"macro_f1": 0.3333333432674408,
"num_tokens": 6227650.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010246031451970339,
+ "routers_loss": 0.0009876530384644866,
"skip_count": 0.0,
"step": 3862,
"text_loss": 0.35507893562316895
@@ -36706,13 +36706,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0007527201975372827,
- "loss": 0.0046,
+ "loss": 0.0045,
"macro_f1": 0.6603773832321167,
"num_tokens": 6230557.0,
"repeat_count": 1.0,
- "routers_loss": 0.011913667432963848,
+ "routers_loss": 0.013780162669718266,
"skip_count": 1.0,
"step": 3864,
"text_loss": 0.38958442211151123
@@ -36725,13 +36725,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.0007524530792392977,
- "loss": 0.0111,
+ "loss": 0.011,
"macro_f1": 0.6666666865348816,
"num_tokens": 6233371.0,
"repeat_count": 0.0,
- "routers_loss": 0.0050127157010138035,
+ "routers_loss": 0.004849869292229414,
"skip_count": 3.0,
"step": 3866,
"text_loss": 0.3826720714569092
@@ -36744,13 +36744,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0228271484375,
+ "grad_norm": 0.0191650390625,
"learning_rate": 0.0007521858642008163,
- "loss": 0.0073,
+ "loss": 0.0072,
"macro_f1": 0.6666666865348816,
"num_tokens": 6236770.0,
"repeat_count": 0.0,
- "routers_loss": 0.008781078271567822,
+ "routers_loss": 0.008618295192718506,
"skip_count": 1.0,
"step": 3868,
"text_loss": 0.3596078157424927
@@ -36763,13 +36763,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03076171875,
+ "grad_norm": 0.029052734375,
"learning_rate": 0.0007519185525242363,
"loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 6239661.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014061459805816412,
+ "routers_loss": 0.0013421972980722785,
"skip_count": 0.0,
"step": 3870,
"text_loss": 0.5585550665855408
@@ -36782,13 +36782,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.027099609375,
+ "grad_norm": 0.026611328125,
"learning_rate": 0.0007516511443119916,
- "loss": 0.0056,
+ "loss": 0.0057,
"macro_f1": 0.6666666865348816,
"num_tokens": 6242459.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031452353578060865,
+ "routers_loss": 0.0038009448908269405,
"skip_count": 1.0,
"step": 3872,
"text_loss": 0.4418395757675171
@@ -36801,13 +36801,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.030517578125,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0007513836396665534,
"loss": 0.0061,
"macro_f1": 1.0,
"num_tokens": 6245489.0,
"repeat_count": 1.0,
- "routers_loss": 0.0028979210183024406,
+ "routers_loss": 0.002785376040264964,
"skip_count": 2.0,
"step": 3874,
"text_loss": 0.551510751247406
@@ -36820,13 +36820,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02294921875,
+ "grad_norm": 0.0234375,
"learning_rate": 0.0007511160386904305,
- "loss": 0.0051,
+ "loss": 0.005,
"macro_f1": 0.6666666865348816,
"num_tokens": 6249014.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021069799549877644,
+ "routers_loss": 0.0021424589212983847,
"skip_count": 1.0,
"step": 3876,
"text_loss": 1.0502676963806152
@@ -36839,13 +36839,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.034423828125,
"learning_rate": 0.0007508483414861679,
- "loss": 0.0083,
+ "loss": 0.0084,
"macro_f1": 0.6666666865348816,
"num_tokens": 6252357.0,
"repeat_count": 0.0,
- "routers_loss": 0.0073753902688622475,
+ "routers_loss": 0.0085759861394763,
"skip_count": 1.0,
"step": 3878,
"text_loss": 0.49212515354156494
@@ -36858,13 +36858,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0390625,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0007505805481563477,
- "loss": 0.0094,
+ "loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 6254975.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010532810119912028,
+ "routers_loss": 0.0010723904706537724,
"skip_count": 0.0,
"step": 3880,
"text_loss": 0.7022985816001892
@@ -36877,13 +36877,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.05078125,
"learning_rate": 0.0007503126588035887,
- "loss": 0.0086,
+ "loss": 0.0081,
"macro_f1": 1.0,
"num_tokens": 6258001.0,
"repeat_count": 1.0,
- "routers_loss": 0.012617395259439945,
+ "routers_loss": 0.012809890322387218,
"skip_count": 2.0,
"step": 3882,
"text_loss": 0.1829151213169098
@@ -36896,13 +36896,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0007500446735305466,
"loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 6261795.0,
"repeat_count": 0.0,
- "routers_loss": 0.002872605575248599,
+ "routers_loss": 0.0026790346018970013,
"skip_count": 1.0,
"step": 3884,
"text_loss": 0.20436066389083862
@@ -36915,13 +36915,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.02978515625,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.000749776592439914,
"loss": 0.007,
"macro_f1": 1.0,
"num_tokens": 6265585.0,
"repeat_count": 1.0,
- "routers_loss": 0.0047233253717422485,
+ "routers_loss": 0.005243788007646799,
"skip_count": 2.0,
"step": 3886,
"text_loss": 0.4479229748249054
@@ -36934,13 +36934,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02294921875,
+ "grad_norm": 0.024658203125,
"learning_rate": 0.00074950841563442,
- "loss": 0.0052,
+ "loss": 0.0051,
"macro_f1": 0.6666666865348816,
"num_tokens": 6269039.0,
"repeat_count": 0.0,
- "routers_loss": 0.007303252816200256,
+ "routers_loss": 0.007998534478247166,
"skip_count": 1.0,
"step": 3888,
"text_loss": 0.2154676914215088
@@ -36953,13 +36953,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0242919921875,
+ "grad_norm": 0.0238037109375,
"learning_rate": 0.0007492401432168303,
"loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 6272315.0,
"repeat_count": 0.0,
- "routers_loss": 0.005679785739630461,
+ "routers_loss": 0.004648822825402021,
"skip_count": 1.0,
"step": 3890,
"text_loss": 0.3375042676925659
@@ -36972,13 +36972,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0007489717752899477,
- "loss": 0.0097,
+ "loss": 0.0094,
"macro_f1": 0.3272727429866791,
"num_tokens": 6275342.0,
"repeat_count": 0.0,
- "routers_loss": 0.013875136151909828,
+ "routers_loss": 0.012154200114309788,
"skip_count": 1.0,
"step": 3892,
"text_loss": 0.1964082419872284
@@ -36991,13 +36991,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0247802734375,
+ "grad_norm": 0.0267333984375,
"learning_rate": 0.000748703311956611,
"loss": 0.0058,
"macro_f1": 1.0,
"num_tokens": 6278700.0,
"repeat_count": 1.0,
- "routers_loss": 0.004874289035797119,
+ "routers_loss": 0.004610476549714804,
"skip_count": 2.0,
"step": 3894,
"text_loss": 0.26545581221580505
@@ -37010,13 +37010,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06494140625,
+ "grad_norm": 0.06201171875,
"learning_rate": 0.0007484347533196961,
"loss": 0.0105,
"macro_f1": 0.6666666865348816,
"num_tokens": 6281864.0,
"repeat_count": 0.0,
- "routers_loss": 0.008282547816634178,
+ "routers_loss": 0.0075586591847240925,
"skip_count": 2.0,
"step": 3896,
"text_loss": 0.3106999397277832
@@ -37029,13 +37029,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0208740234375,
+ "grad_norm": 0.02099609375,
"learning_rate": 0.0007481660994821151,
- "loss": 0.007,
+ "loss": 0.0068,
"macro_f1": 0.6666666865348816,
"num_tokens": 6284676.0,
"repeat_count": 0.0,
- "routers_loss": 0.00792533066123724,
+ "routers_loss": 0.007845268584787846,
"skip_count": 1.0,
"step": 3898,
"text_loss": 0.4094304144382477
@@ -37048,13 +37048,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0007478973505468165,
- "loss": 0.0086,
+ "loss": 0.0081,
"macro_f1": 1.0,
"num_tokens": 6287470.0,
"repeat_count": 1.0,
- "routers_loss": 0.012142898514866829,
+ "routers_loss": 0.011116391979157925,
"skip_count": 2.0,
"step": 3900,
"text_loss": 0.1838909536600113
@@ -37067,13 +37067,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.03515625,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0007476285066167857,
- "loss": 0.0062,
+ "loss": 0.0059,
"macro_f1": 0.6666666865348816,
"num_tokens": 6290432.0,
"repeat_count": 1.0,
- "routers_loss": 0.004634121898561716,
+ "routers_loss": 0.004599364474415779,
"skip_count": 0.0,
"step": 3902,
"text_loss": 0.25872838497161865
@@ -37086,13 +37086,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.046142578125,
"learning_rate": 0.0007473595677950439,
"loss": 0.0109,
"macro_f1": 0.6666666865348816,
"num_tokens": 6293557.0,
"repeat_count": 0.0,
- "routers_loss": 0.001632143510505557,
+ "routers_loss": 0.0016367282951250672,
"skip_count": 1.0,
"step": 3904,
"text_loss": 0.5272360444068909
@@ -37105,13 +37105,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.0007470905341846492,
- "loss": 0.0053,
+ "loss": 0.0052,
"macro_f1": 0.3333333432674408,
"num_tokens": 6295979.0,
"repeat_count": 0.0,
- "routers_loss": 0.0004961033118888736,
+ "routers_loss": 0.0004760588926728815,
"skip_count": 0.0,
"step": 3906,
"text_loss": 0.666959822177887
@@ -37124,13 +37124,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0007468214058886956,
- "loss": 0.0074,
+ "loss": 0.0072,
"macro_f1": 0.3333333432674408,
"num_tokens": 6299215.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007425977964885533,
+ "routers_loss": 0.000524883100297302,
"skip_count": 0.0,
"step": 3908,
"text_loss": 0.5144801139831543
@@ -37143,13 +37143,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0007465521830103137,
- "loss": 0.0081,
+ "loss": 0.0077,
"macro_f1": 0.3333333432674408,
"num_tokens": 6302320.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015668199630454183,
+ "routers_loss": 0.0016085522947832942,
"skip_count": 0.0,
"step": 3910,
"text_loss": 0.14342890679836273
@@ -37162,13 +37162,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0007462828656526702,
- "loss": 0.0065,
+ "loss": 0.0064,
"macro_f1": 0.6666666865348816,
"num_tokens": 6305212.0,
"repeat_count": 0.0,
- "routers_loss": 0.003138904692605138,
+ "routers_loss": 0.002720315707847476,
"skip_count": 2.0,
"step": 3912,
"text_loss": 0.31109121441841125
@@ -37181,13 +37181,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.052001953125,
+ "grad_norm": 0.06884765625,
"learning_rate": 0.0007460134539189681,
- "loss": 0.0117,
+ "loss": 0.0114,
"macro_f1": 0.6666666865348816,
"num_tokens": 6308964.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012123063206672668,
+ "routers_loss": 0.0010418406454846263,
"skip_count": 1.0,
"step": 3914,
"text_loss": 0.5662030577659607
@@ -37200,13 +37200,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.047119140625,
+ "grad_norm": 0.052001953125,
"learning_rate": 0.0007457439479124459,
"loss": 0.0134,
"macro_f1": 0.3333333432674408,
"num_tokens": 6313195.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017939694225788116,
+ "routers_loss": 0.0020303844939917326,
"skip_count": 0.0,
"step": 3916,
"text_loss": 0.6358339190483093
@@ -37219,13 +37219,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.0289306640625,
"learning_rate": 0.0007454743477363797,
"loss": 0.0054,
"macro_f1": 0.3333333432674408,
"num_tokens": 6315949.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006735047209076583,
+ "routers_loss": 0.0006592223653569818,
"skip_count": 0.0,
"step": 3918,
"text_loss": 0.35648423433303833
@@ -37238,13 +37238,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.027099609375,
+ "grad_norm": 0.0262451171875,
"learning_rate": 0.0007452046534940803,
- "loss": 0.0078,
+ "loss": 0.0075,
"macro_f1": 0.6603773832321167,
"num_tokens": 6319024.0,
"repeat_count": 1.0,
- "routers_loss": 0.025279851630330086,
+ "routers_loss": 0.024555351585149765,
"skip_count": 1.0,
"step": 3920,
"text_loss": 0.21955153346061707
@@ -37257,13 +37257,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033203125,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.0007449348652888952,
- "loss": 0.007,
+ "loss": 0.0068,
"macro_f1": 0.6666666865348816,
"num_tokens": 6321633.0,
"repeat_count": 0.0,
- "routers_loss": 0.002887458074837923,
+ "routers_loss": 0.003606822807341814,
"skip_count": 1.0,
"step": 3922,
"text_loss": 0.6079489588737488
@@ -37276,13 +37276,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0007446649832242075,
"loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 6325209.0,
"repeat_count": 0.0,
- "routers_loss": 0.0034941197372972965,
+ "routers_loss": 0.0035831446293741465,
"skip_count": 1.0,
"step": 3924,
"text_loss": 0.2774808406829834
@@ -37295,13 +37295,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03173828125,
+ "grad_norm": 0.0311279296875,
"learning_rate": 0.0007443950074034368,
- "loss": 0.0067,
+ "loss": 0.0064,
"macro_f1": 0.6666666865348816,
"num_tokens": 6327822.0,
"repeat_count": 0.0,
- "routers_loss": 0.006862608715891838,
+ "routers_loss": 0.006809544749557972,
"skip_count": 2.0,
"step": 3926,
"text_loss": 0.48236769437789917
@@ -37314,13 +37314,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0007441249379300381,
- "loss": 0.0072,
+ "loss": 0.007,
"macro_f1": 0.6601307392120361,
"num_tokens": 6331662.0,
"repeat_count": 1.0,
- "routers_loss": 0.02176409214735031,
+ "routers_loss": 0.023832591250538826,
"skip_count": 2.0,
"step": 3928,
"text_loss": 0.7287537455558777
@@ -37333,13 +37333,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.04296875,
"learning_rate": 0.0007438547749075028,
- "loss": 0.0064,
+ "loss": 0.0061,
"macro_f1": 1.0,
"num_tokens": 6335801.0,
"repeat_count": 1.0,
- "routers_loss": 0.013603253290057182,
+ "routers_loss": 0.011755098588764668,
"skip_count": 3.0,
"step": 3930,
"text_loss": 0.17253030836582184
@@ -37352,13 +37352,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0267333984375,
+ "grad_norm": 0.02685546875,
"learning_rate": 0.0007435845184393577,
- "loss": 0.0052,
+ "loss": 0.005,
"macro_f1": 0.6666666865348816,
"num_tokens": 6338747.0,
"repeat_count": 1.0,
- "routers_loss": 0.006635789293795824,
+ "routers_loss": 0.005972472485154867,
"skip_count": 0.0,
"step": 3932,
"text_loss": 0.6400216817855835
@@ -37371,13 +37371,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0361328125,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0007433141686291657,
- "loss": 0.0077,
+ "loss": 0.0075,
"macro_f1": 0.6666666865348816,
"num_tokens": 6342772.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032724342308938503,
+ "routers_loss": 0.0030393085908144712,
"skip_count": 1.0,
"step": 3934,
"text_loss": 0.6865074038505554
@@ -37390,13 +37390,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0213623046875,
+ "grad_norm": 0.020263671875,
"learning_rate": 0.0007430437255805252,
- "loss": 0.007,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 6345957.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007380369352176785,
+ "routers_loss": 0.0006984061910770833,
"skip_count": 0.0,
"step": 3936,
"text_loss": 0.40398702025413513
@@ -37409,13 +37409,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.078125,
+ "grad_norm": 0.07275390625,
"learning_rate": 0.0007427731893970706,
"loss": 0.007,
"macro_f1": 0.6666666865348816,
"num_tokens": 6349162.0,
"repeat_count": 1.0,
- "routers_loss": 0.004635625518858433,
+ "routers_loss": 0.005219762213528156,
"skip_count": 0.0,
"step": 3938,
"text_loss": 0.5951031446456909
@@ -37428,13 +37428,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.043701171875,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.0007425025601824717,
- "loss": 0.0085,
+ "loss": 0.0083,
"macro_f1": 0.6666666865348816,
"num_tokens": 6352655.0,
"repeat_count": 0.0,
- "routers_loss": 0.014994140714406967,
+ "routers_loss": 0.015575960278511047,
"skip_count": 3.0,
"step": 3940,
"text_loss": 0.26689088344573975
@@ -37447,13 +37447,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0007422318380404346,
- "loss": 0.0067,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 6355890.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011694672284647822,
+ "routers_loss": 0.0012208883417770267,
"skip_count": 0.0,
"step": 3942,
"text_loss": 0.570725679397583
@@ -37466,13 +37466,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.02587890625,
+ "grad_norm": 0.0235595703125,
"learning_rate": 0.0007419610230746999,
"loss": 0.0056,
"macro_f1": 0.6666666865348816,
"num_tokens": 6358891.0,
"repeat_count": 1.0,
- "routers_loss": 0.003442608518525958,
+ "routers_loss": 0.0029412026051431894,
"skip_count": 0.0,
"step": 3944,
"text_loss": 0.5521301031112671
@@ -37485,13 +37485,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0007416901153890448,
"loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 6361586.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009970148093998432,
+ "routers_loss": 0.0010283910669386387,
"skip_count": 0.0,
"step": 3946,
"text_loss": 0.4046417772769928
@@ -37504,13 +37504,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.043212890625,
+ "grad_norm": 0.03955078125,
"learning_rate": 0.0007414191150872818,
- "loss": 0.0078,
+ "loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 6364954.0,
"repeat_count": 0.0,
- "routers_loss": 0.009517154656350613,
+ "routers_loss": 0.008222512900829315,
"skip_count": 2.0,
"step": 3948,
"text_loss": 0.2803446352481842
@@ -37523,13 +37523,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.03564453125,
"learning_rate": 0.0007411480222732583,
- "loss": 0.0091,
+ "loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 6367660.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012908667558804154,
+ "routers_loss": 0.001304348581470549,
"skip_count": 0.0,
"step": 3950,
"text_loss": 0.45553359389305115
@@ -37542,13 +37542,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.03759765625,
"learning_rate": 0.0007408768370508576,
- "loss": 0.0076,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 6371585.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015499353175982833,
+ "routers_loss": 0.0016345062758773565,
"skip_count": 0.0,
"step": 3952,
"text_loss": 0.25424402952194214
@@ -37561,13 +37561,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.0007406055595239986,
- "loss": 0.007,
+ "loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 6374365.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005612325621768832,
+ "routers_loss": 0.0005097290268167853,
"skip_count": 0.0,
"step": 3954,
"text_loss": 0.5856026411056519
@@ -37580,13 +37580,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07373046875,
+ "grad_norm": 0.060546875,
"learning_rate": 0.0007403341897966356,
- "loss": 0.0063,
+ "loss": 0.0061,
"macro_f1": 0.6666666865348816,
"num_tokens": 6377335.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024961072485893965,
+ "routers_loss": 0.002482263371348381,
"skip_count": 1.0,
"step": 3956,
"text_loss": 0.5145615339279175
@@ -37599,32 +37599,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0230712890625,
+ "grad_norm": 0.0245361328125,
"learning_rate": 0.0007400627279727574,
"loss": 0.0041,
"macro_f1": 0.3333333432674408,
"num_tokens": 6380799.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013171056052669883,
+ "routers_loss": 0.0011743451468646526,
"skip_count": 0.0,
"step": 3958,
"text_loss": 0.31868961453437805
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 18.591722923393014,
- "f1_execute": 0.9818181991577148,
- "f1_repeat": 0.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.027099609375,
+ "grad_norm": 0.0286865234375,
"learning_rate": 0.0007397911741563892,
- "loss": 0.0054,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0052,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 6383963.0,
"repeat_count": 1.0,
- "routers_loss": 0.012845510616898537,
+ "routers_loss": 0.009861881844699383,
"skip_count": 0.0,
"step": 3960,
"text_loss": 0.21192194521427155
@@ -37637,13 +37637,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0390625,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0007395195284515905,
- "loss": 0.0099,
+ "loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 6387410.0,
"repeat_count": 1.0,
- "routers_loss": 0.003112874459475279,
+ "routers_loss": 0.004189098719507456,
"skip_count": 0.0,
"step": 3962,
"text_loss": 0.5809708833694458
@@ -37656,13 +37656,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.0007392477909624567,
- "loss": 0.0058,
+ "loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 6390670.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019742189906537533,
+ "routers_loss": 0.001853612600825727,
"skip_count": 0.0,
"step": 3964,
"text_loss": 0.48985618352890015
@@ -37675,13 +37675,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.028076171875,
+ "grad_norm": 0.0308837890625,
"learning_rate": 0.0007389759617931182,
- "loss": 0.0066,
+ "loss": 0.0067,
"macro_f1": 0.6666666865348816,
"num_tokens": 6393609.0,
"repeat_count": 1.0,
- "routers_loss": 0.003850853070616722,
+ "routers_loss": 0.003303771372884512,
"skip_count": 0.0,
"step": 3966,
"text_loss": 0.28729453682899475
@@ -37694,13 +37694,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.0634765625,
+ "grad_norm": 0.10595703125,
"learning_rate": 0.0007387040410477404,
- "loss": 0.0057,
+ "loss": 0.0058,
"macro_f1": 0.9452888369560242,
"num_tokens": 6396608.0,
"repeat_count": 1.0,
- "routers_loss": 0.020281648263335228,
+ "routers_loss": 0.01791577786207199,
"skip_count": 4.0,
"step": 3968,
"text_loss": 0.30386820435523987
@@ -37713,13 +37713,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0284423828125,
+ "grad_norm": 0.029541015625,
"learning_rate": 0.0007384320288305235,
- "loss": 0.0093,
+ "loss": 0.0091,
"macro_f1": 0.3333333432674408,
"num_tokens": 6399793.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005419629742391407,
+ "routers_loss": 0.0005771282012574375,
"skip_count": 0.0,
"step": 3970,
"text_loss": 0.47285011410713196
@@ -37732,13 +37732,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0291748046875,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.0007381599252457037,
- "loss": 0.0061,
+ "loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 6403365.0,
"repeat_count": 0.0,
- "routers_loss": 0.003040255280211568,
+ "routers_loss": 0.003010645741596818,
"skip_count": 0.0,
"step": 3972,
"text_loss": 0.5313063859939575
@@ -37751,32 +37751,32 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.000737887730397551,
"loss": 0.0061,
"macro_f1": 0.6666666865348816,
"num_tokens": 6406205.0,
"repeat_count": 1.0,
- "routers_loss": 0.006762589327991009,
+ "routers_loss": 0.006457438692450523,
"skip_count": 0.0,
"step": 3974,
"text_loss": 0.2323843240737915
},
{
- "acc_repeat": 0.0,
+ "acc_repeat": 1.0,
"acc_skip": 0.0,
- "avg_layers": 28.0,
+ "avg_layers": 29.0,
"epoch": 18.666862342236573,
- "f1_execute": 0.9818181991577148,
- "f1_repeat": 0.0,
+ "f1_execute": 1.0,
+ "f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.036865234375,
"learning_rate": 0.0007376154443903713,
- "loss": 0.0086,
- "macro_f1": 0.3272727429866791,
+ "loss": 0.0084,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 6409552.0,
"repeat_count": 1.0,
- "routers_loss": 0.01173968706279993,
+ "routers_loss": 0.010693981312215328,
"skip_count": 0.0,
"step": 3976,
"text_loss": 0.6304101943969727
@@ -37789,13 +37789,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0007373430673285051,
"loss": 0.008,
"macro_f1": 0.3272727429866791,
"num_tokens": 6412386.0,
"repeat_count": 1.0,
- "routers_loss": 0.028297962620854378,
+ "routers_loss": 0.03116440214216709,
"skip_count": 0.0,
"step": 3978,
"text_loss": 0.23448467254638672
@@ -37808,13 +37808,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.08447265625,
+ "grad_norm": 0.10009765625,
"learning_rate": 0.0007370705993163278,
- "loss": 0.011,
+ "loss": 0.0111,
"macro_f1": 0.3272727429866791,
"num_tokens": 6416054.0,
"repeat_count": 1.0,
- "routers_loss": 0.010761309415102005,
+ "routers_loss": 0.011973714455962181,
"skip_count": 0.0,
"step": 3980,
"text_loss": 0.6371755599975586
@@ -37827,13 +37827,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0458984375,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0007367980404582497,
"loss": 0.0105,
"macro_f1": 1.0,
"num_tokens": 6419238.0,
"repeat_count": 1.0,
- "routers_loss": 0.0057355971075594425,
+ "routers_loss": 0.005117347463965416,
"skip_count": 2.0,
"step": 3982,
"text_loss": 0.19822923839092255
@@ -37846,13 +37846,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0267333984375,
+ "grad_norm": 0.0296630859375,
"learning_rate": 0.0007365253908587158,
- "loss": 0.005,
+ "loss": 0.0049,
"macro_f1": 0.3333333432674408,
"num_tokens": 6422122.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011142889270558953,
+ "routers_loss": 0.0010648667812347412,
"skip_count": 0.0,
"step": 3984,
"text_loss": 0.566700279712677
@@ -37865,13 +37865,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0263671875,
+ "grad_norm": 0.025146484375,
"learning_rate": 0.0007362526506222058,
- "loss": 0.0045,
+ "loss": 0.0044,
"macro_f1": 0.3333333432674408,
"num_tokens": 6425313.0,
"repeat_count": 0.0,
- "routers_loss": 0.005405326373875141,
+ "routers_loss": 0.005726494826376438,
"skip_count": 0.0,
"step": 3986,
"text_loss": 0.6568437814712524
@@ -37884,13 +37884,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0289306640625,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0007359798198532343,
- "loss": 0.0043,
+ "loss": 0.0042,
"macro_f1": 0.6666666865348816,
"num_tokens": 6428422.0,
"repeat_count": 1.0,
- "routers_loss": 0.005449058022350073,
+ "routers_loss": 0.004504100419580936,
"skip_count": 0.0,
"step": 3988,
"text_loss": 0.598754346370697
@@ -37903,13 +37903,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.0306396484375,
"learning_rate": 0.0007357068986563509,
- "loss": 0.0083,
+ "loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 6431512.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020256424322724342,
+ "routers_loss": 0.0019837068393826485,
"skip_count": 1.0,
"step": 3990,
"text_loss": 0.7152895927429199
@@ -37922,13 +37922,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.0007354338871361393,
- "loss": 0.0084,
+ "loss": 0.0079,
"macro_f1": 0.6666666865348816,
"num_tokens": 6434358.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027240889612585306,
+ "routers_loss": 0.0026031541638076305,
"skip_count": 1.0,
"step": 3992,
"text_loss": 0.4986513555049896
@@ -37941,13 +37941,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.000735160785397218,
- "loss": 0.0061,
+ "loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 6438175.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026689881924539804,
+ "routers_loss": 0.0024831905029714108,
"skip_count": 2.0,
"step": 3994,
"text_loss": 0.4406205713748932
@@ -37960,13 +37960,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0007348875935442401,
- "loss": 0.0067,
+ "loss": 0.0065,
"macro_f1": 0.3333333432674408,
"num_tokens": 6441228.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010014307918027043,
+ "routers_loss": 0.0008635876583866775,
"skip_count": 0.0,
"step": 3996,
"text_loss": 0.48884135484695435
@@ -37979,13 +37979,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040283203125,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0007346143116818932,
- "loss": 0.0046,
+ "loss": 0.0044,
"macro_f1": 0.3333333432674408,
"num_tokens": 6444318.0,
"repeat_count": 0.0,
- "routers_loss": 0.004282998852431774,
+ "routers_loss": 0.004007008858025074,
"skip_count": 0.0,
"step": 3998,
"text_loss": 0.6669428944587708
@@ -37998,13 +37998,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.08203125,
"learning_rate": 0.0007343409399148994,
- "loss": 0.0092,
+ "loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 6448317.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031171543523669243,
+ "routers_loss": 0.0031380734872072935,
"skip_count": 0.0,
"step": 4000,
"text_loss": 0.6468493938446045
@@ -38017,13 +38017,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025634765625,
+ "grad_norm": 0.02392578125,
"learning_rate": 0.0007340674783480154,
- "loss": 0.0077,
+ "loss": 0.0078,
"macro_f1": 0.3333333432674408,
"num_tokens": 6451673.0,
"repeat_count": 0.0,
- "routers_loss": 0.005329967010766268,
+ "routers_loss": 0.004996029660105705,
"skip_count": 0.0,
"step": 4002,
"text_loss": 0.28135430812835693
@@ -38036,13 +38036,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03857421875,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0007337939270860323,
- "loss": 0.0091,
+ "loss": 0.009,
"macro_f1": 0.3272727429866791,
"num_tokens": 6456372.0,
"repeat_count": 1.0,
- "routers_loss": 0.038046106696128845,
+ "routers_loss": 0.03784399852156639,
"skip_count": 0.0,
"step": 4004,
"text_loss": 0.41668644547462463
@@ -38055,32 +38055,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.0007335202862337753,
"loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 6459047.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013881187187507749,
+ "routers_loss": 0.0011750755365937948,
"skip_count": 0.0,
"step": 4006,
"text_loss": 0.6853910684585571
},
{
"acc_repeat": 1.0,
- "acc_skip": 1.0,
- "avg_layers": 25.0,
+ "acc_skip": 0.75,
+ "avg_layers": 26.0,
"epoch": 18.817141179923688,
- "f1_execute": 1.0,
+ "f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
- "f1_skip": 1.0,
- "grad_norm": 0.044189453125,
+ "f1_skip": 0.8571428656578064,
+ "grad_norm": 0.05908203125,
"learning_rate": 0.000733246555896104,
- "loss": 0.0059,
- "macro_f1": 1.0,
+ "loss": 0.0062,
+ "macro_f1": 0.9452888369560242,
"num_tokens": 6462390.0,
"repeat_count": 1.0,
- "routers_loss": 0.01348043605685234,
+ "routers_loss": 0.01630394533276558,
"skip_count": 4.0,
"step": 4008,
"text_loss": 0.7110592126846313
@@ -38093,13 +38093,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.049560546875,
"learning_rate": 0.0007329727361779124,
- "loss": 0.0073,
+ "loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 6466057.0,
"repeat_count": 0.0,
- "routers_loss": 0.0051529803313314915,
+ "routers_loss": 0.0052404399029910564,
"skip_count": 2.0,
"step": 4010,
"text_loss": 0.13856995105743408
@@ -38112,13 +38112,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.03759765625,
"learning_rate": 0.000732698827184129,
- "loss": 0.0058,
+ "loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 6468878.0,
"repeat_count": 0.0,
- "routers_loss": 0.002958883298560977,
+ "routers_loss": 0.002138581359758973,
"skip_count": 0.0,
"step": 4012,
"text_loss": 0.3999565839767456
@@ -38131,13 +38131,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.000732424829019716,
"loss": 0.0075,
"macro_f1": 0.3333333432674408,
"num_tokens": 6472364.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038471813313663006,
+ "routers_loss": 0.0037466560024768114,
"skip_count": 0.0,
"step": 4014,
"text_loss": 0.28161346912384033
@@ -38150,13 +38150,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0286865234375,
+ "grad_norm": 0.0306396484375,
"learning_rate": 0.0007321507417896699,
- "loss": 0.0087,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 6475379.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010916640749201179,
+ "routers_loss": 0.0010469373082742095,
"skip_count": 0.0,
"step": 4016,
"text_loss": 1.0490952730178833
@@ -38169,13 +38169,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.042236328125,
+ "grad_norm": 0.06591796875,
"learning_rate": 0.0007318765655990218,
"loss": 0.0054,
"macro_f1": 0.6666666865348816,
"num_tokens": 6478585.0,
"repeat_count": 0.0,
- "routers_loss": 0.00946822389960289,
+ "routers_loss": 0.009968385100364685,
"skip_count": 2.0,
"step": 4018,
"text_loss": 0.31696680188179016
@@ -38188,13 +38188,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.024169921875,
+ "grad_norm": 0.0240478515625,
"learning_rate": 0.0007316023005528362,
"loss": 0.0062,
"macro_f1": 0.6666666865348816,
"num_tokens": 6484153.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027165759820491076,
+ "routers_loss": 0.002349073765799403,
"skip_count": 1.0,
"step": 4020,
"text_loss": 0.30981555581092834
@@ -38207,13 +38207,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 1.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.02880859375,
+ "grad_norm": 0.0299072265625,
"learning_rate": 0.0007313279467562124,
- "loss": 0.0051,
+ "loss": 0.0053,
"macro_f1": 0.9452888369560242,
"num_tokens": 6487029.0,
"repeat_count": 1.0,
- "routers_loss": 0.012701411731541157,
+ "routers_loss": 0.011854278855025768,
"skip_count": 4.0,
"step": 4022,
"text_loss": 0.9689550399780273
@@ -38226,13 +38226,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.050537109375,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.0007310535043142829,
- "loss": 0.0079,
+ "loss": 0.0077,
"macro_f1": 1.0,
"num_tokens": 6490315.0,
"repeat_count": 1.0,
- "routers_loss": 0.010197490453720093,
+ "routers_loss": 0.00908346101641655,
"skip_count": 3.0,
"step": 4024,
"text_loss": 0.1705625057220459
@@ -38245,13 +38245,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0400390625,
+ "grad_norm": 0.039306640625,
"learning_rate": 0.0007307789733322146,
- "loss": 0.0097,
+ "loss": 0.0094,
"macro_f1": 0.3333333432674408,
"num_tokens": 6493921.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008188873762264848,
+ "routers_loss": 0.0007360641611739993,
"skip_count": 0.0,
"step": 4026,
"text_loss": 0.6252996325492859
@@ -38264,13 +38264,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06689453125,
+ "grad_norm": 0.087890625,
"learning_rate": 0.0007305043539152083,
- "loss": 0.007,
+ "loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 6496689.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018946458585560322,
+ "routers_loss": 0.0017757206223905087,
"skip_count": 0.0,
"step": 4028,
"text_loss": 0.40533265471458435
@@ -38283,13 +38283,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.000730229646168499,
- "loss": 0.0078,
+ "loss": 0.0075,
"macro_f1": 0.3333333432674408,
"num_tokens": 6500090.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023306645452976227,
+ "routers_loss": 0.0022657213266938925,
"skip_count": 0.0,
"step": 4030,
"text_loss": 0.25954708456993103
@@ -38302,13 +38302,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.039794921875,
"learning_rate": 0.0007299548501973548,
"loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 6503023.0,
"repeat_count": 0.0,
- "routers_loss": 0.002005136338993907,
+ "routers_loss": 0.0021747269202023745,
"skip_count": 0.0,
"step": 4032,
"text_loss": 0.6223418712615967
@@ -38321,13 +38321,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.0390625,
"learning_rate": 0.0007296799661070782,
"loss": 0.0067,
"macro_f1": 0.6666666865348816,
"num_tokens": 6506382.0,
"repeat_count": 0.0,
- "routers_loss": 0.00668578315526247,
+ "routers_loss": 0.006400502752512693,
"skip_count": 4.0,
"step": 4034,
"text_loss": 0.6873653531074524
@@ -38340,13 +38340,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0281982421875,
+ "grad_norm": 0.028076171875,
"learning_rate": 0.0007294049940030055,
- "loss": 0.0066,
+ "loss": 0.0065,
"macro_f1": 0.3272727429866791,
"num_tokens": 6509194.0,
"repeat_count": 0.0,
- "routers_loss": 0.021298008039593697,
+ "routers_loss": 0.0197185929864645,
"skip_count": 1.0,
"step": 4036,
"text_loss": 0.16156800091266632
@@ -38359,13 +38359,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0458984375,
+ "grad_norm": 0.04345703125,
"learning_rate": 0.0007291299339905059,
- "loss": 0.0075,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 6512271.0,
"repeat_count": 0.0,
- "routers_loss": 0.001004312071017921,
+ "routers_loss": 0.0009541353792883456,
"skip_count": 0.0,
"step": 4038,
"text_loss": 0.5038442015647888
@@ -38378,13 +38378,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.0007288547861749838,
- "loss": 0.0065,
+ "loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 6516403.0,
"repeat_count": 0.0,
- "routers_loss": 0.007993367500603199,
+ "routers_loss": 0.008226391859352589,
"skip_count": 2.0,
"step": 4040,
"text_loss": 0.3706657588481903
@@ -38397,13 +38397,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03515625,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.0007285795506618758,
- "loss": 0.0062,
+ "loss": 0.0063,
"macro_f1": 0.3272727429866791,
"num_tokens": 6519310.0,
"repeat_count": 0.0,
- "routers_loss": 0.015058980323374271,
+ "routers_loss": 0.017001887783408165,
"skip_count": 1.0,
"step": 4042,
"text_loss": 0.24296723306179047
@@ -38416,13 +38416,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.054443359375,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0007283042275566528,
"loss": 0.0125,
"macro_f1": 0.6666666865348816,
"num_tokens": 6521979.0,
"repeat_count": 0.0,
- "routers_loss": 0.016352638602256775,
+ "routers_loss": 0.01666323095560074,
"skip_count": 2.0,
"step": 4044,
"text_loss": 0.36904850602149963
@@ -38435,13 +38435,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0201416015625,
+ "grad_norm": 0.019775390625,
"learning_rate": 0.0007280288169648192,
- "loss": 0.0044,
+ "loss": 0.0043,
"macro_f1": 0.3333333432674408,
"num_tokens": 6524976.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008094423683360219,
+ "routers_loss": 0.0007593175978399813,
"skip_count": 0.0,
"step": 4046,
"text_loss": 0.7312731146812439
@@ -38454,13 +38454,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.033203125,
+ "grad_norm": 0.0390625,
"learning_rate": 0.0007277533189919127,
- "loss": 0.0061,
+ "loss": 0.0063,
"macro_f1": 1.0,
"num_tokens": 6528638.0,
"repeat_count": 1.0,
- "routers_loss": 0.005490938201546669,
+ "routers_loss": 0.005652119871228933,
"skip_count": 1.0,
"step": 4048,
"text_loss": 0.23326151072978973
@@ -38473,13 +38473,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.026123046875,
+ "grad_norm": 0.0286865234375,
"learning_rate": 0.0007274777337435046,
- "loss": 0.0055,
+ "loss": 0.0056,
"macro_f1": 0.6666666865348816,
"num_tokens": 6532193.0,
"repeat_count": 0.0,
- "routers_loss": 0.009560001082718372,
+ "routers_loss": 0.010509157553315163,
"skip_count": 2.0,
"step": 4050,
"text_loss": 0.23918013274669647
@@ -38492,13 +38492,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0007272020613251999,
"loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 6534994.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023573292419314384,
+ "routers_loss": 0.002153293928131461,
"skip_count": 0.0,
"step": 4052,
"text_loss": 0.5890526175498962
@@ -38511,13 +38511,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.029296875,
+ "grad_norm": 0.04150390625,
"learning_rate": 0.0007269263018426367,
- "loss": 0.0048,
+ "loss": 0.0049,
"macro_f1": 1.0,
"num_tokens": 6537469.0,
"repeat_count": 1.0,
- "routers_loss": 0.0012750910827890038,
+ "routers_loss": 0.0018494052346795797,
"skip_count": 2.0,
"step": 4054,
"text_loss": 0.36058738827705383
@@ -38530,13 +38530,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0546875,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0007266504554014866,
- "loss": 0.006,
+ "loss": 0.0061,
"macro_f1": 0.3333333432674408,
"num_tokens": 6541271.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006701929378323257,
+ "routers_loss": 0.0007579320226795971,
"skip_count": 0.0,
"step": 4056,
"text_loss": 0.4089007079601288
@@ -38549,13 +38549,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.040283203125,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0007263745221074545,
- "loss": 0.0085,
+ "loss": 0.0086,
"macro_f1": 0.6601307392120361,
"num_tokens": 6544293.0,
"repeat_count": 1.0,
- "routers_loss": 0.061707694083452225,
+ "routers_loss": 0.06202420964837074,
"skip_count": 2.0,
"step": 4058,
"text_loss": 0.2226305454969406
@@ -38568,13 +38568,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.027587890625,
+ "grad_norm": 0.0286865234375,
"learning_rate": 0.0007260985020662784,
- "loss": 0.005,
+ "loss": 0.0049,
"macro_f1": 0.5934640765190125,
"num_tokens": 6547640.0,
"repeat_count": 0.0,
- "routers_loss": 0.04534700885415077,
+ "routers_loss": 0.044639844447374344,
"skip_count": 3.0,
"step": 4060,
"text_loss": 0.23004353046417236
@@ -38587,13 +38587,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.017822265625,
+ "grad_norm": 0.0206298828125,
"learning_rate": 0.0007258223953837298,
- "loss": 0.0052,
+ "loss": 0.0053,
"macro_f1": 0.6666666865348816,
"num_tokens": 6550840.0,
"repeat_count": 1.0,
- "routers_loss": 0.004326729103922844,
+ "routers_loss": 0.004215611144900322,
"skip_count": 0.0,
"step": 4062,
"text_loss": 0.2891770601272583
@@ -38606,13 +38606,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044189453125,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.0007255462021656132,
- "loss": 0.0068,
+ "loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 6554122.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009951743995770812,
+ "routers_loss": 0.0011056234361603856,
"skip_count": 0.0,
"step": 4064,
"text_loss": 0.7485370635986328
@@ -38625,13 +38625,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0007252699225177666,
- "loss": 0.0082,
+ "loss": 0.0079,
"macro_f1": 0.6666666865348816,
"num_tokens": 6557138.0,
"repeat_count": 0.0,
- "routers_loss": 0.008738798089325428,
+ "routers_loss": 0.008258933201432228,
"skip_count": 2.0,
"step": 4066,
"text_loss": 0.25219282507896423
@@ -38644,13 +38644,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.03759765625,
"learning_rate": 0.0007249935565460606,
- "loss": 0.0044,
+ "loss": 0.0045,
"macro_f1": 0.3333333432674408,
"num_tokens": 6560654.0,
"repeat_count": 0.0,
- "routers_loss": 0.004576306790113449,
+ "routers_loss": 0.005102175287902355,
"skip_count": 0.0,
"step": 4068,
"text_loss": 0.5553314089775085
@@ -38663,13 +38663,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0007247171043563994,
- "loss": 0.0059,
+ "loss": 0.0061,
"macro_f1": 0.6666666865348816,
"num_tokens": 6563814.0,
"repeat_count": 0.0,
- "routers_loss": 0.013026291504502296,
+ "routers_loss": 0.01283820066601038,
"skip_count": 2.0,
"step": 4070,
"text_loss": 0.15729956328868866
@@ -38682,13 +38682,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0206298828125,
+ "grad_norm": 0.0211181640625,
"learning_rate": 0.0007244405660547199,
"loss": 0.0044,
"macro_f1": 0.3333333432674408,
"num_tokens": 6567060.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010598953813314438,
+ "routers_loss": 0.0009684927063062787,
"skip_count": 0.0,
"step": 4072,
"text_loss": 0.3725031912326813
@@ -38701,13 +38701,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.021484375,
+ "grad_norm": 0.01953125,
"learning_rate": 0.000724163941746992,
- "loss": 0.0061,
+ "loss": 0.0058,
"macro_f1": 0.3333333432674408,
"num_tokens": 6571608.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008197802817448974,
+ "routers_loss": 0.0007890827837400138,
"skip_count": 0.0,
"step": 4074,
"text_loss": 0.8438301682472229
@@ -38722,11 +38722,11 @@
"f1_skip": 1.0,
"grad_norm": 0.02734375,
"learning_rate": 0.0007238872315392189,
- "loss": 0.0067,
+ "loss": 0.0066,
"macro_f1": 1.0,
"num_tokens": 6575214.0,
"repeat_count": 1.0,
- "routers_loss": 0.004072689451277256,
+ "routers_loss": 0.0040600355714559555,
"skip_count": 1.0,
"step": 4076,
"text_loss": 0.5923112034797668
@@ -38739,13 +38739,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0252685546875,
+ "grad_norm": 0.024169921875,
"learning_rate": 0.0007236104355374363,
- "loss": 0.004,
+ "loss": 0.0039,
"macro_f1": 0.6666666865348816,
"num_tokens": 6578383.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024594077840447426,
+ "routers_loss": 0.0024899677373468876,
"skip_count": 2.0,
"step": 4078,
"text_loss": 0.20302526652812958
@@ -38758,13 +38758,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.057373046875,
+ "grad_norm": 0.05517578125,
"learning_rate": 0.000723333553847713,
- "loss": 0.0058,
+ "loss": 0.0056,
"macro_f1": 0.6666666865348816,
"num_tokens": 6582175.0,
"repeat_count": 0.0,
- "routers_loss": 0.0060209049843251705,
+ "routers_loss": 0.006120906211435795,
"skip_count": 2.0,
"step": 4080,
"text_loss": 0.5400223731994629
@@ -38777,13 +38777,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07958984375,
+ "grad_norm": 0.06787109375,
"learning_rate": 0.0007230565865761504,
- "loss": 0.0052,
+ "loss": 0.005,
"macro_f1": 0.3333333432674408,
"num_tokens": 6585516.0,
"repeat_count": 0.0,
- "routers_loss": 0.002700155135244131,
+ "routers_loss": 0.0029941233806312084,
"skip_count": 0.0,
"step": 4082,
"text_loss": 0.19460804760456085
@@ -38796,13 +38796,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.08056640625,
+ "grad_norm": 0.07373046875,
"learning_rate": 0.0007227795338288831,
- "loss": 0.0072,
+ "loss": 0.0071,
"macro_f1": 0.6666666865348816,
"num_tokens": 6588266.0,
"repeat_count": 0.0,
- "routers_loss": 0.009378589689731598,
+ "routers_loss": 0.009357884526252747,
"skip_count": 2.0,
"step": 4084,
"text_loss": 0.35237613320350647
@@ -38815,13 +38815,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0361328125,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0007225023957120782,
- "loss": 0.0085,
+ "loss": 0.0086,
"macro_f1": 0.6666666865348816,
"num_tokens": 6591009.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025940060149878263,
+ "routers_loss": 0.0023083325941115618,
"skip_count": 2.0,
"step": 4086,
"text_loss": 0.4336731433868408
@@ -38834,13 +38834,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0218505859375,
+ "grad_norm": 0.0211181640625,
"learning_rate": 0.0007222251723319356,
- "loss": 0.0035,
+ "loss": 0.0034,
"macro_f1": 0.3333333432674408,
"num_tokens": 6594472.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009030649089254439,
+ "routers_loss": 0.0008416616474278271,
"skip_count": 0.0,
"step": 4088,
"text_loss": 0.6390535831451416
@@ -38853,13 +38853,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.045166015625,
"learning_rate": 0.0007219478637946877,
- "loss": 0.0085,
+ "loss": 0.0084,
"macro_f1": 0.6666666865348816,
"num_tokens": 6597477.0,
"repeat_count": 0.0,
- "routers_loss": 0.005229895934462547,
+ "routers_loss": 0.004390760324895382,
"skip_count": 1.0,
"step": 4090,
"text_loss": 0.525839626789093
@@ -38872,13 +38872,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0260009765625,
+ "grad_norm": 0.0272216796875,
"learning_rate": 0.0007216704702065997,
- "loss": 0.0055,
+ "loss": 0.0054,
"macro_f1": 0.3333333432674408,
"num_tokens": 6600431.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010594666237011552,
+ "routers_loss": 0.0010311100631952286,
"skip_count": 0.0,
"step": 4092,
"text_loss": 0.5310423374176025
@@ -38891,13 +38891,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02197265625,
+ "grad_norm": 0.0228271484375,
"learning_rate": 0.0007213929916739695,
- "loss": 0.0064,
+ "loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 6603899.0,
"repeat_count": 0.0,
- "routers_loss": 0.004303699359297752,
+ "routers_loss": 0.0032497600186616182,
"skip_count": 1.0,
"step": 4094,
"text_loss": 0.2775326073169708
@@ -38910,13 +38910,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.000721115428303127,
- "loss": 0.0083,
+ "loss": 0.0077,
"macro_f1": 1.0,
"num_tokens": 6606544.0,
"repeat_count": 1.0,
- "routers_loss": 0.004739399533718824,
+ "routers_loss": 0.004692315589636564,
"skip_count": 3.0,
"step": 4096,
"text_loss": 0.6667124032974243
@@ -38929,13 +38929,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0250244140625,
+ "grad_norm": 0.0274658203125,
"learning_rate": 0.0007208377802004353,
- "loss": 0.0058,
+ "loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 6610097.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007414906867779791,
+ "routers_loss": 0.0007263485458679497,
"skip_count": 0.0,
"step": 4098,
"text_loss": 0.6916406750679016
@@ -38948,13 +38948,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.026611328125,
+ "grad_norm": 0.0274658203125,
"learning_rate": 0.0007205600474722897,
- "loss": 0.0059,
+ "loss": 0.0058,
"macro_f1": 0.3333333432674408,
"num_tokens": 6613836.0,
"repeat_count": 0.0,
- "routers_loss": 0.001866258797235787,
+ "routers_loss": 0.0017989488551393151,
"skip_count": 0.0,
"step": 4100,
"text_loss": 0.5257929563522339
@@ -38967,13 +38967,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02880859375,
+ "grad_norm": 0.0284423828125,
"learning_rate": 0.000720282230225118,
"loss": 0.0068,
"macro_f1": 0.6666666865348816,
"num_tokens": 6616780.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013150086160749197,
+ "routers_loss": 0.0011308686807751656,
"skip_count": 1.0,
"step": 4102,
"text_loss": 0.4410906732082367
@@ -38986,13 +38986,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0007200043285653799,
- "loss": 0.0064,
+ "loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 6620110.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021148507948964834,
+ "routers_loss": 0.002058265497907996,
"skip_count": 2.0,
"step": 4104,
"text_loss": 0.8581191897392273
@@ -39005,13 +39005,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.0007197263425995681,
- "loss": 0.0067,
+ "loss": 0.0066,
"macro_f1": 0.6666666865348816,
"num_tokens": 6622585.0,
"repeat_count": 1.0,
- "routers_loss": 0.0015671581495553255,
+ "routers_loss": 0.0017528717871755362,
"skip_count": 0.0,
"step": 4106,
"text_loss": 0.5000449419021606
@@ -39024,13 +39024,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0245361328125,
+ "grad_norm": 0.02587890625,
"learning_rate": 0.0007194482724342075,
- "loss": 0.0078,
+ "loss": 0.0077,
"macro_f1": 0.3333333432674408,
"num_tokens": 6626356.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020829052664339542,
+ "routers_loss": 0.0021995846182107925,
"skip_count": 0.0,
"step": 4108,
"text_loss": 0.401346892118454
@@ -39043,13 +39043,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0299072265625,
+ "grad_norm": 0.0289306640625,
"learning_rate": 0.0007191701181758547,
- "loss": 0.0073,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 6629738.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013877892633900046,
+ "routers_loss": 0.0014869922306388617,
"skip_count": 0.0,
"step": 4110,
"text_loss": 0.9598422050476074
@@ -39062,13 +39062,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025146484375,
+ "grad_norm": 0.0242919921875,
"learning_rate": 0.0007188918799310993,
- "loss": 0.0078,
+ "loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 6632807.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012319361558184028,
+ "routers_loss": 0.0012853415682911873,
"skip_count": 0.0,
"step": 4112,
"text_loss": 0.3996548354625702
@@ -39081,13 +39081,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.029296875,
"learning_rate": 0.0007186135578065627,
- "loss": 0.0075,
+ "loss": 0.0077,
"macro_f1": 0.3333333432674408,
"num_tokens": 6636227.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009015969699248672,
+ "routers_loss": 0.0009887361666187644,
"skip_count": 0.0,
"step": 4114,
"text_loss": 0.4127283990383148
@@ -39100,13 +39100,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.0007183351519088982,
- "loss": 0.0066,
+ "loss": 0.0068,
"macro_f1": 0.6666666865348816,
"num_tokens": 6639443.0,
"repeat_count": 0.0,
- "routers_loss": 0.006493544206023216,
+ "routers_loss": 0.006282114889472723,
"skip_count": 1.0,
"step": 4116,
"text_loss": 0.20028606057167053
@@ -39119,13 +39119,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.054931640625,
+ "grad_norm": 0.061767578125,
"learning_rate": 0.0007180566623447917,
- "loss": 0.0115,
+ "loss": 0.0114,
"macro_f1": 0.6603773832321167,
"num_tokens": 6642127.0,
"repeat_count": 1.0,
- "routers_loss": 0.008949270471930504,
+ "routers_loss": 0.008101986721158028,
"skip_count": 0.0,
"step": 4118,
"text_loss": 0.763931155204773
@@ -39138,13 +39138,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0277099609375,
+ "grad_norm": 0.0291748046875,
"learning_rate": 0.0007177780892209607,
- "loss": 0.006,
+ "loss": 0.0061,
"macro_f1": 0.3333333432674408,
"num_tokens": 6645376.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019743547309190035,
+ "routers_loss": 0.001953610684722662,
"skip_count": 0.0,
"step": 4120,
"text_loss": 0.42317715287208557
@@ -39157,13 +39157,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.034912109375,
"learning_rate": 0.0007174994326441551,
- "loss": 0.0066,
+ "loss": 0.0065,
"macro_f1": 0.3333333432674408,
"num_tokens": 6648150.0,
"repeat_count": 0.0,
- "routers_loss": 0.003454099874943495,
+ "routers_loss": 0.003279355587437749,
"skip_count": 0.0,
"step": 4122,
"text_loss": 0.19656142592430115
@@ -39176,13 +39176,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.031005859375,
"learning_rate": 0.0007172206927211567,
- "loss": 0.0055,
+ "loss": 0.0053,
"macro_f1": 0.3333333432674408,
"num_tokens": 6650935.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032328376546502113,
+ "routers_loss": 0.0032076311763375998,
"skip_count": 0.0,
"step": 4124,
"text_loss": 0.13608409464359283
@@ -39195,13 +39195,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.026611328125,
+ "grad_norm": 0.0303955078125,
"learning_rate": 0.0007169418695587791,
"loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 6654464.0,
"repeat_count": 0.0,
- "routers_loss": 0.0041675688698887825,
+ "routers_loss": 0.004065621178597212,
"skip_count": 2.0,
"step": 4126,
"text_loss": 0.4882086217403412
@@ -39214,13 +39214,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02783203125,
+ "grad_norm": 0.031005859375,
"learning_rate": 0.0007166629632638678,
"loss": 0.0065,
"macro_f1": 0.3333333432674408,
"num_tokens": 6657749.0,
"repeat_count": 0.0,
- "routers_loss": 0.000975916744209826,
+ "routers_loss": 0.0009243001695722342,
"skip_count": 0.0,
"step": 4128,
"text_loss": 0.31632331013679504
@@ -39233,13 +39233,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0264892578125,
+ "grad_norm": 0.02783203125,
"learning_rate": 0.0007163839739433003,
- "loss": 0.008,
+ "loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 6660997.0,
"repeat_count": 0.0,
- "routers_loss": 0.002182615688070655,
+ "routers_loss": 0.0018459554994478822,
"skip_count": 0.0,
"step": 4130,
"text_loss": 0.6123947501182556
@@ -39252,13 +39252,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.033935546875,
"learning_rate": 0.0007161049017039857,
- "loss": 0.0074,
+ "loss": 0.0073,
"macro_f1": 0.8820862174034119,
"num_tokens": 6663542.0,
"repeat_count": 2.0,
- "routers_loss": 0.03051452897489071,
+ "routers_loss": 0.030032536014914513,
"skip_count": 2.0,
"step": 4132,
"text_loss": 0.6985659003257751
@@ -39271,13 +39271,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0206298828125,
+ "grad_norm": 0.019775390625,
"learning_rate": 0.0007158257466528652,
"loss": 0.0053,
"macro_f1": 0.3333333432674408,
"num_tokens": 6666178.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013405663194134831,
+ "routers_loss": 0.0013813833938911557,
"skip_count": 0.0,
"step": 4134,
"text_loss": 0.38380664587020874
@@ -39290,13 +39290,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.01953125,
+ "grad_norm": 0.021484375,
"learning_rate": 0.0007155465088969114,
- "loss": 0.0079,
+ "loss": 0.008,
"macro_f1": 0.6666666865348816,
"num_tokens": 6668852.0,
"repeat_count": 0.0,
- "routers_loss": 0.00536607438698411,
+ "routers_loss": 0.00513424864038825,
"skip_count": 3.0,
"step": 4136,
"text_loss": 0.49724283814430237
@@ -39309,13 +39309,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.023193359375,
+ "grad_norm": 0.0228271484375,
"learning_rate": 0.0007152671885431288,
"loss": 0.0048,
"macro_f1": 0.3333333432674408,
"num_tokens": 6671430.0,
"repeat_count": 0.0,
- "routers_loss": 0.0004998469958081841,
+ "routers_loss": 0.0005165594047866762,
"skip_count": 0.0,
"step": 4138,
"text_loss": 0.666959822177887
@@ -39328,13 +39328,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044189453125,
+ "grad_norm": 0.047119140625,
"learning_rate": 0.0007149877856985535,
- "loss": 0.0082,
+ "loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 6675215.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017356832977384329,
+ "routers_loss": 0.001685218419879675,
"skip_count": 0.0,
"step": 4140,
"text_loss": 0.3127259612083435
@@ -39347,13 +39347,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0245361328125,
+ "grad_norm": 0.0277099609375,
"learning_rate": 0.000714708300470253,
- "loss": 0.0058,
+ "loss": 0.0061,
"macro_f1": 0.3333333432674408,
"num_tokens": 6678505.0,
"repeat_count": 0.0,
- "routers_loss": 0.003699234686791897,
+ "routers_loss": 0.004025314934551716,
"skip_count": 0.0,
"step": 4142,
"text_loss": 0.3179470896720886
@@ -39366,13 +39366,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0007144287329653269,
"loss": 0.0077,
"macro_f1": 0.6666666865348816,
"num_tokens": 6681127.0,
"repeat_count": 1.0,
- "routers_loss": 0.005084970500320196,
+ "routers_loss": 0.005965690594166517,
"skip_count": 0.0,
"step": 4144,
"text_loss": 0.3862907886505127
@@ -39385,13 +39385,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.040771484375,
+ "grad_norm": 0.039794921875,
"learning_rate": 0.0007141490832909058,
- "loss": 0.0074,
+ "loss": 0.0071,
"macro_f1": 0.3272727429866791,
"num_tokens": 6683968.0,
"repeat_count": 0.0,
- "routers_loss": 0.013118764385581017,
+ "routers_loss": 0.012896374799311161,
"skip_count": 1.0,
"step": 4146,
"text_loss": 0.48156118392944336
@@ -39404,13 +39404,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.034912109375,
"learning_rate": 0.0007138693515541519,
- "loss": 0.005,
+ "loss": 0.0049,
"macro_f1": 0.6666666865348816,
"num_tokens": 6687196.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006807957543060184,
+ "routers_loss": 0.0006367767928168178,
"skip_count": 1.0,
"step": 4148,
"text_loss": 0.676702082157135
@@ -39423,13 +39423,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03125,
+ "grad_norm": 0.030029296875,
"learning_rate": 0.0007135895378622592,
- "loss": 0.0076,
+ "loss": 0.0075,
"macro_f1": 0.6666666865348816,
"num_tokens": 6689972.0,
"repeat_count": 0.0,
- "routers_loss": 0.004619150888174772,
+ "routers_loss": 0.004532640799880028,
"skip_count": 3.0,
"step": 4150,
"text_loss": 0.5865558981895447
@@ -39442,13 +39442,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.0007133096423224526,
- "loss": 0.0082,
+ "loss": 0.0081,
"macro_f1": 0.3272727429866791,
"num_tokens": 6693568.0,
"repeat_count": 1.0,
- "routers_loss": 0.0404328815639019,
+ "routers_loss": 0.0377078577876091,
"skip_count": 0.0,
"step": 4152,
"text_loss": 0.2790502607822418
@@ -39461,13 +39461,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.053466796875,
+ "grad_norm": 0.056640625,
"learning_rate": 0.0007130296650419885,
- "loss": 0.0071,
+ "loss": 0.0068,
"macro_f1": 0.6666666865348816,
"num_tokens": 6696468.0,
"repeat_count": 0.0,
- "routers_loss": 0.0048319315537810326,
+ "routers_loss": 0.004455826710909605,
"skip_count": 1.0,
"step": 4154,
"text_loss": 0.5869500041007996
@@ -39480,13 +39480,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.050048828125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0007127496061281551,
"loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 6699307.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022895359434187412,
+ "routers_loss": 0.001998464809730649,
"skip_count": 0.0,
"step": 4156,
"text_loss": 0.6931945085525513
@@ -39499,13 +39499,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0007124694656882713,
- "loss": 0.0071,
+ "loss": 0.007,
"macro_f1": 0.6666666865348816,
"num_tokens": 6702647.0,
"repeat_count": 3.0,
- "routers_loss": 0.004655756987631321,
+ "routers_loss": 0.004117495380342007,
"skip_count": 0.0,
"step": 4158,
"text_loss": 0.4325876832008362
@@ -39518,13 +39518,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0189208984375,
+ "grad_norm": 0.0205078125,
"learning_rate": 0.0007121892438296874,
- "loss": 0.0066,
+ "loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 6705964.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014499713433906436,
+ "routers_loss": 0.0014713290147483349,
"skip_count": 0.0,
"step": 4160,
"text_loss": 0.3672060966491699
@@ -39537,13 +39537,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04296875,
+ "grad_norm": 0.04345703125,
"learning_rate": 0.0007119089406597849,
- "loss": 0.0075,
+ "loss": 0.0074,
"macro_f1": 0.6666666865348816,
"num_tokens": 6710182.0,
"repeat_count": 0.0,
- "routers_loss": 0.0039377836510539055,
+ "routers_loss": 0.0037311650812625885,
"skip_count": 1.0,
"step": 4162,
"text_loss": 0.6643805503845215
@@ -39556,13 +39556,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0007116285562859767,
- "loss": 0.0059,
+ "loss": 0.0058,
"macro_f1": 0.3333333432674408,
"num_tokens": 6713410.0,
"repeat_count": 0.0,
- "routers_loss": 0.006864873692393303,
+ "routers_loss": 0.006017287727445364,
"skip_count": 0.0,
"step": 4164,
"text_loss": 0.4606415927410126
@@ -39575,13 +39575,13 @@
"f1_execute": 0.9545454382896423,
"f1_repeat": 0.5,
"f1_skip": 1.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.05419921875,
"learning_rate": 0.0007113480908157065,
- "loss": 0.0109,
+ "loss": 0.0108,
"macro_f1": 0.8181818723678589,
"num_tokens": 6716056.0,
"repeat_count": 3.0,
- "routers_loss": 0.08587442338466644,
+ "routers_loss": 0.08640352636575699,
"skip_count": 4.0,
"step": 4166,
"text_loss": 0.3139408528804779
@@ -39594,13 +39594,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0283203125,
+ "grad_norm": 0.0311279296875,
"learning_rate": 0.0007110675443564491,
"loss": 0.0065,
"macro_f1": 0.3333333432674408,
"num_tokens": 6719497.0,
"repeat_count": 0.0,
- "routers_loss": 0.001434682053513825,
+ "routers_loss": 0.0012731150491163135,
"skip_count": 0.0,
"step": 4168,
"text_loss": 0.7283861637115479
@@ -39613,13 +39613,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0247802734375,
+ "grad_norm": 0.0262451171875,
"learning_rate": 0.0007107869170157108,
- "loss": 0.0056,
+ "loss": 0.0053,
"macro_f1": 0.6666666865348816,
"num_tokens": 6722297.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018422538414597511,
+ "routers_loss": 0.0021509863436222076,
"skip_count": 2.0,
"step": 4170,
"text_loss": 0.5767703056335449
@@ -39632,13 +39632,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.000710506208901028,
- "loss": 0.0083,
+ "loss": 0.0079,
"macro_f1": 0.6666666865348816,
"num_tokens": 6725762.0,
"repeat_count": 0.0,
- "routers_loss": 0.002943754428997636,
+ "routers_loss": 0.00257494836114347,
"skip_count": 1.0,
"step": 4172,
"text_loss": 0.33571913838386536
@@ -39651,13 +39651,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.000710225420119969,
"loss": 0.0052,
"macro_f1": 1.0,
"num_tokens": 6728436.0,
"repeat_count": 1.0,
- "routers_loss": 0.00920829363167286,
+ "routers_loss": 0.00943201594054699,
"skip_count": 3.0,
"step": 4174,
"text_loss": 0.6849368810653687
@@ -39670,13 +39670,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0289306640625,
+ "grad_norm": 0.030517578125,
"learning_rate": 0.0007099445507801323,
- "loss": 0.0062,
+ "loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 6731427.0,
"repeat_count": 0.0,
- "routers_loss": 0.010877607390284538,
+ "routers_loss": 0.01046718005090952,
"skip_count": 2.0,
"step": 4176,
"text_loss": 0.3346157670021057
@@ -39689,13 +39689,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05517578125,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0007096636009891477,
- "loss": 0.0095,
+ "loss": 0.0091,
"macro_f1": 0.3333333432674408,
"num_tokens": 6734800.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007745221955701709,
+ "routers_loss": 0.0007813365664333105,
"skip_count": 0.0,
"step": 4178,
"text_loss": 0.49989959597587585
@@ -39708,13 +39708,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03125,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.000709382570854676,
"loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 6738244.0,
"repeat_count": 0.0,
- "routers_loss": 0.002755505731329322,
+ "routers_loss": 0.002825600327923894,
"skip_count": 0.0,
"step": 4180,
"text_loss": 0.15744923055171967
@@ -39727,13 +39727,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0007091014604844078,
- "loss": 0.0078,
+ "loss": 0.0075,
"macro_f1": 0.3333333432674408,
"num_tokens": 6741695.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018092440441250801,
+ "routers_loss": 0.0017124463338404894,
"skip_count": 0.0,
"step": 4182,
"text_loss": 0.3752405643463135
@@ -39746,13 +39746,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.02587890625,
+ "grad_norm": 0.0230712890625,
"learning_rate": 0.0007088202699860655,
- "loss": 0.0052,
+ "loss": 0.0053,
"macro_f1": 1.0,
"num_tokens": 6744882.0,
"repeat_count": 1.0,
- "routers_loss": 0.005326499231159687,
+ "routers_loss": 0.005134924780577421,
"skip_count": 3.0,
"step": 4184,
"text_loss": 0.18534569442272186
@@ -39765,13 +39765,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02294921875,
+ "grad_norm": 0.01904296875,
"learning_rate": 0.000708538999467402,
"loss": 0.0049,
"macro_f1": 0.6666666865348816,
"num_tokens": 6747811.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022658067755401134,
+ "routers_loss": 0.002371585462242365,
"skip_count": 1.0,
"step": 4186,
"text_loss": 0.6251029968261719
@@ -39784,13 +39784,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.080078125,
+ "grad_norm": 0.064453125,
"learning_rate": 0.0007082576490362004,
- "loss": 0.0055,
+ "loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 6750765.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022298030089586973,
+ "routers_loss": 0.002088436856865883,
"skip_count": 0.0,
"step": 4188,
"text_loss": 0.35471436381340027
@@ -39803,13 +39803,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0238037109375,
+ "grad_norm": 0.0255126953125,
"learning_rate": 0.000707976218800275,
"loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 6754021.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013450054684653878,
+ "routers_loss": 0.0012272283202037215,
"skip_count": 0.0,
"step": 4190,
"text_loss": 0.5737302899360657
@@ -39822,13 +39822,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051025390625,
+ "grad_norm": 0.07763671875,
"learning_rate": 0.0007076947088674701,
- "loss": 0.0064,
+ "loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 6756793.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026260579470545053,
+ "routers_loss": 0.0026050808373838663,
"skip_count": 0.0,
"step": 4192,
"text_loss": 0.526336669921875
@@ -39841,13 +39841,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.054931640625,
"learning_rate": 0.000707413119345661,
- "loss": 0.0084,
+ "loss": 0.0086,
"macro_f1": 0.3333333432674408,
"num_tokens": 6760221.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014576761750504375,
+ "routers_loss": 0.0013151296880096197,
"skip_count": 0.0,
"step": 4194,
"text_loss": 0.5678895711898804
@@ -39860,13 +39860,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.037353515625,
"learning_rate": 0.0007071314503427532,
- "loss": 0.0058,
+ "loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 6763721.0,
"repeat_count": 0.0,
- "routers_loss": 0.00165031966753304,
+ "routers_loss": 0.001528652966953814,
"skip_count": 0.0,
"step": 4196,
"text_loss": 0.7640175223350525
@@ -39879,13 +39879,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0252685546875,
+ "grad_norm": 0.0240478515625,
"learning_rate": 0.0007068497019666829,
- "loss": 0.0047,
+ "loss": 0.0046,
"macro_f1": 0.3333333432674408,
"num_tokens": 6768581.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017519505927339196,
+ "routers_loss": 0.0019202446565032005,
"skip_count": 0.0,
"step": 4198,
"text_loss": 0.41878414154052734
@@ -39904,7 +39904,7 @@
"macro_f1": 0.6666666865348816,
"num_tokens": 6772758.0,
"repeat_count": 0.0,
- "routers_loss": 0.005213241558521986,
+ "routers_loss": 0.004667408298701048,
"skip_count": 1.0,
"step": 4200,
"text_loss": 0.3550313413143158
@@ -39917,13 +39917,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.048828125,
+ "grad_norm": 0.050537109375,
"learning_rate": 0.0007062859675269513,
- "loss": 0.0063,
+ "loss": 0.0064,
"macro_f1": 0.6666666865348816,
"num_tokens": 6776671.0,
"repeat_count": 3.0,
- "routers_loss": 0.004372407682240009,
+ "routers_loss": 0.00568761583417654,
"skip_count": 0.0,
"step": 4202,
"text_loss": 0.1707649976015091
@@ -39936,13 +39936,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033203125,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0007060039816793141,
- "loss": 0.0073,
+ "loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 6780284.0,
"repeat_count": 0.0,
- "routers_loss": 0.003470032475888729,
+ "routers_loss": 0.0030401297844946384,
"skip_count": 0.0,
"step": 4204,
"text_loss": 0.2686377167701721
@@ -39955,13 +39955,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.0007057219168905625,
- "loss": 0.0067,
+ "loss": 0.0068,
"macro_f1": 1.0,
"num_tokens": 6783525.0,
"repeat_count": 1.0,
- "routers_loss": 0.003391953418031335,
+ "routers_loss": 0.003353122156113386,
"skip_count": 5.0,
"step": 4206,
"text_loss": 0.5235374569892883
@@ -39974,13 +39974,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.026123046875,
"learning_rate": 0.000705439773268784,
- "loss": 0.0052,
+ "loss": 0.005,
"macro_f1": 0.6666666865348816,
"num_tokens": 6787691.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013759827706962824,
+ "routers_loss": 0.0016532237641513348,
"skip_count": 1.0,
"step": 4208,
"text_loss": 0.5002681612968445
@@ -39993,13 +39993,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0007051575509220972,
"loss": 0.0086,
"macro_f1": 0.3333333432674408,
"num_tokens": 6790833.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011075466172769666,
+ "routers_loss": 0.0011808308772742748,
"skip_count": 0.0,
"step": 4210,
"text_loss": 0.7251001596450806
@@ -40012,13 +40012,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0380859375,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0007048752499586497,
"loss": 0.0057,
"macro_f1": 0.6666666865348816,
"num_tokens": 6794260.0,
"repeat_count": 0.0,
- "routers_loss": 0.0063498299568891525,
+ "routers_loss": 0.006246297620236874,
"skip_count": 2.0,
"step": 4212,
"text_loss": 0.2430499643087387
@@ -40031,13 +40031,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.00070459287048662,
- "loss": 0.0074,
+ "loss": 0.0075,
"macro_f1": 0.3333333432674408,
"num_tokens": 6797413.0,
"repeat_count": 0.0,
- "routers_loss": 0.001165185822173953,
+ "routers_loss": 0.0012964420020580292,
"skip_count": 0.0,
"step": 4214,
"text_loss": 0.48889362812042236
@@ -40050,13 +40050,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0283203125,
+ "grad_norm": 0.031494140625,
"learning_rate": 0.0007043104126142163,
- "loss": 0.0073,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 6800815.0,
"repeat_count": 0.0,
- "routers_loss": 0.002119335113093257,
+ "routers_loss": 0.0018109704833477736,
"skip_count": 0.0,
"step": 4216,
"text_loss": 0.5617026686668396
@@ -40069,13 +40069,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0225830078125,
+ "grad_norm": 0.0250244140625,
"learning_rate": 0.0007040278764496771,
- "loss": 0.0061,
+ "loss": 0.0062,
"macro_f1": 1.0,
"num_tokens": 6803937.0,
"repeat_count": 2.0,
- "routers_loss": 0.002939696190878749,
+ "routers_loss": 0.0028699536342173815,
"skip_count": 1.0,
"step": 4218,
"text_loss": 0.548405647277832
@@ -40088,13 +40088,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0007037452621012708,
"loss": 0.0072,
"macro_f1": 0.3333333432674408,
"num_tokens": 6806946.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008340062922798097,
+ "routers_loss": 0.0007951617590151727,
"skip_count": 0.0,
"step": 4220,
"text_loss": 0.5702725648880005
@@ -40107,13 +40107,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.024169921875,
+ "grad_norm": 0.030517578125,
"learning_rate": 0.0007034625696772958,
- "loss": 0.0053,
+ "loss": 0.0056,
"macro_f1": 0.6666666865348816,
"num_tokens": 6810083.0,
"repeat_count": 0.0,
- "routers_loss": 0.003032320411875844,
+ "routers_loss": 0.003436052706092596,
"skip_count": 2.0,
"step": 4222,
"text_loss": 0.3898725211620331
@@ -40126,13 +40126,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03857421875,
+ "grad_norm": 0.03955078125,
"learning_rate": 0.00070317979928608,
- "loss": 0.007,
+ "loss": 0.0065,
"macro_f1": 0.3333333432674408,
"num_tokens": 6812845.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005332283908501267,
+ "routers_loss": 0.0005070401239208877,
"skip_count": 0.0,
"step": 4224,
"text_loss": 0.5244157910346985
@@ -40145,13 +40145,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.0390625,
"learning_rate": 0.000702896951035982,
- "loss": 0.0103,
+ "loss": 0.0101,
"macro_f1": 0.3272727429866791,
"num_tokens": 6815801.0,
"repeat_count": 0.0,
- "routers_loss": 0.015828115865588188,
+ "routers_loss": 0.01560303382575512,
"skip_count": 1.0,
"step": 4226,
"text_loss": 0.26503118872642517
@@ -40164,13 +40164,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.03564453125,
"learning_rate": 0.0007026140250353896,
- "loss": 0.0088,
+ "loss": 0.0086,
"macro_f1": 0.6666666865348816,
"num_tokens": 6819464.0,
"repeat_count": 0.0,
- "routers_loss": 0.010141439735889435,
+ "routers_loss": 0.009310240857303143,
"skip_count": 2.0,
"step": 4228,
"text_loss": 0.15597499907016754
@@ -40189,7 +40189,7 @@
"macro_f1": 0.3333333432674408,
"num_tokens": 6822657.0,
"repeat_count": 0.0,
- "routers_loss": 0.004937903955578804,
+ "routers_loss": 0.005309136584401131,
"skip_count": 0.0,
"step": 4230,
"text_loss": 0.5271651148796082
@@ -40202,13 +40202,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.046875,
"learning_rate": 0.0007020479402164226,
- "loss": 0.009,
+ "loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 6825661.0,
"repeat_count": 0.0,
- "routers_loss": 0.005930901039391756,
+ "routers_loss": 0.005936166271567345,
"skip_count": 2.0,
"step": 4232,
"text_loss": 0.6105108857154846
@@ -40221,13 +40221,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0361328125,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.0007017647816149727,
- "loss": 0.0065,
+ "loss": 0.0064,
"macro_f1": 0.3333333432674408,
"num_tokens": 6828688.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015492573147639632,
+ "routers_loss": 0.001653556595556438,
"skip_count": 0.0,
"step": 4234,
"text_loss": 0.6966437101364136
@@ -40240,13 +40240,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.000701481545696878,
- "loss": 0.0093,
+ "loss": 0.009,
"macro_f1": 0.3333333432674408,
"num_tokens": 6831850.0,
"repeat_count": 0.0,
- "routers_loss": 0.001357862027361989,
+ "routers_loss": 0.0013501866487786174,
"skip_count": 0.0,
"step": 4236,
"text_loss": 1.259678840637207
@@ -40259,13 +40259,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.059326171875,
+ "grad_norm": 0.059814453125,
"learning_rate": 0.0007011982325706747,
- "loss": 0.006,
+ "loss": 0.0058,
"macro_f1": 0.6666666865348816,
"num_tokens": 6834862.0,
"repeat_count": 0.0,
- "routers_loss": 0.00899078231304884,
+ "routers_loss": 0.008970130234956741,
"skip_count": 1.0,
"step": 4238,
"text_loss": 0.24906545877456665
@@ -40278,13 +40278,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.052490234375,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0007009148423449292,
- "loss": 0.0067,
+ "loss": 0.0063,
"macro_f1": 0.3333333432674408,
"num_tokens": 6838148.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027724208775907755,
+ "routers_loss": 0.0026013399474322796,
"skip_count": 0.0,
"step": 4240,
"text_loss": 0.291467547416687
@@ -40297,13 +40297,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.037109375,
"learning_rate": 0.0007006313751282371,
- "loss": 0.0095,
+ "loss": 0.0094,
"macro_f1": 0.3272727429866791,
"num_tokens": 6841142.0,
"repeat_count": 0.0,
- "routers_loss": 0.0202134158462286,
+ "routers_loss": 0.021415632218122482,
"skip_count": 1.0,
"step": 4242,
"text_loss": 0.507606029510498
@@ -40316,13 +40316,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0301513671875,
+ "grad_norm": 0.0289306640625,
"learning_rate": 0.0007003478310292236,
- "loss": 0.0061,
+ "loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 6844042.0,
"repeat_count": 0.0,
- "routers_loss": 0.00252551375888288,
+ "routers_loss": 0.0023636550176888704,
"skip_count": 0.0,
"step": 4244,
"text_loss": 0.11626995354890823
@@ -40335,13 +40335,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0303955078125,
+ "grad_norm": 0.03466796875,
"learning_rate": 0.0007000642101565433,
- "loss": 0.0081,
+ "loss": 0.008,
"macro_f1": 0.3272727429866791,
"num_tokens": 6847359.0,
"repeat_count": 1.0,
- "routers_loss": 0.022849632427096367,
+ "routers_loss": 0.025154776871204376,
"skip_count": 0.0,
"step": 4246,
"text_loss": 0.42898693680763245
@@ -40354,13 +40354,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.038330078125,
"learning_rate": 0.0006997805126188803,
- "loss": 0.0055,
+ "loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 6850443.0,
"repeat_count": 0.0,
- "routers_loss": 0.005312036257237196,
+ "routers_loss": 0.00540317315608263,
"skip_count": 0.0,
"step": 4248,
"text_loss": 0.18085283041000366
@@ -40373,13 +40373,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.000699496738524948,
- "loss": 0.0072,
+ "loss": 0.007,
"macro_f1": 0.3333333432674408,
"num_tokens": 6853495.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015959764132276177,
+ "routers_loss": 0.0014433214673772454,
"skip_count": 0.0,
"step": 4250,
"text_loss": 0.5524004697799683
@@ -40392,13 +40392,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.0006992128879834891,
- "loss": 0.0056,
+ "loss": 0.0054,
"macro_f1": 1.0,
"num_tokens": 6856774.0,
"repeat_count": 1.0,
- "routers_loss": 0.01500304602086544,
+ "routers_loss": 0.013381492346525192,
"skip_count": 3.0,
"step": 4252,
"text_loss": 0.19605717062950134
@@ -40411,13 +40411,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.0006989289611032758,
- "loss": 0.0096,
+ "loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 6860313.0,
"repeat_count": 0.0,
- "routers_loss": 0.006884181406348944,
+ "routers_loss": 0.007140172645449638,
"skip_count": 1.0,
"step": 4254,
"text_loss": 0.3182447552680969
@@ -40430,13 +40430,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0006986449579931091,
- "loss": 0.0066,
+ "loss": 0.0064,
"macro_f1": 0.6666666865348816,
"num_tokens": 6863683.0,
"repeat_count": 0.0,
- "routers_loss": 0.007357228547334671,
+ "routers_loss": 0.006486213766038418,
"skip_count": 1.0,
"step": 4256,
"text_loss": 0.19250160455703735
@@ -40449,13 +40449,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.0006983608787618201,
- "loss": 0.0073,
+ "loss": 0.0072,
"macro_f1": 0.3333333432674408,
"num_tokens": 6867609.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016412866534665227,
+ "routers_loss": 0.001465818495489657,
"skip_count": 0.0,
"step": 4258,
"text_loss": 0.5912898182868958
@@ -40468,13 +40468,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04443359375,
+ "grad_norm": 0.04248046875,
"learning_rate": 0.000698076723518268,
"loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 6870040.0,
"repeat_count": 0.0,
- "routers_loss": 0.003204819979146123,
+ "routers_loss": 0.0031106441747397184,
"skip_count": 0.0,
"step": 4260,
"text_loss": 0.13542121648788452
@@ -40487,13 +40487,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0306396484375,
+ "grad_norm": 0.02978515625,
"learning_rate": 0.0006977924923713418,
- "loss": 0.0076,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 6873441.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005213851109147072,
+ "routers_loss": 0.0005377951893024147,
"skip_count": 0.0,
"step": 4262,
"text_loss": 0.352464497089386
@@ -40506,13 +40506,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0006975081854299594,
- "loss": 0.0093,
+ "loss": 0.0092,
"macro_f1": 0.3333333432674408,
"num_tokens": 6876637.0,
"repeat_count": 0.0,
- "routers_loss": 0.0067594959400594234,
+ "routers_loss": 0.007052485831081867,
"skip_count": 0.0,
"step": 4264,
"text_loss": 0.5023844242095947
@@ -40525,13 +40525,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02197265625,
+ "grad_norm": 0.02294921875,
"learning_rate": 0.0006972238028030678,
- "loss": 0.0076,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 6879928.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013809602241963148,
+ "routers_loss": 0.0013608322478830814,
"skip_count": 0.0,
"step": 4266,
"text_loss": 0.8664718270301819
@@ -40544,13 +40544,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0281982421875,
+ "grad_norm": 0.0247802734375,
"learning_rate": 0.0006969393445996429,
- "loss": 0.0064,
+ "loss": 0.0063,
"macro_f1": 0.3333333432674408,
"num_tokens": 6883425.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009357557282783091,
+ "routers_loss": 0.0007607188890688121,
"skip_count": 0.0,
"step": 4268,
"text_loss": 0.5131992101669312
@@ -40563,13 +40563,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03857421875,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0006966548109286897,
- "loss": 0.0079,
+ "loss": 0.0075,
"macro_f1": 0.3333333432674408,
"num_tokens": 6886790.0,
"repeat_count": 0.0,
- "routers_loss": 0.00034129369305446744,
+ "routers_loss": 0.00035804163780994713,
"skip_count": 0.0,
"step": 4270,
"text_loss": 0.5352054834365845
@@ -40582,13 +40582,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.000696370201899242,
"loss": 0.0064,
"macro_f1": 0.6666666865348816,
"num_tokens": 6889747.0,
"repeat_count": 0.0,
- "routers_loss": 0.004583079367876053,
+ "routers_loss": 0.004451376851648092,
"skip_count": 1.0,
"step": 4272,
"text_loss": 0.47865036129951477
@@ -40601,13 +40601,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0006960855176203623,
- "loss": 0.007,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 6892604.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015929298242554069,
+ "routers_loss": 0.0015342880506068468,
"skip_count": 0.0,
"step": 4274,
"text_loss": 0.36278650164604187
@@ -40620,13 +40620,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0260009765625,
+ "grad_norm": 0.024169921875,
"learning_rate": 0.0006958007582011425,
- "loss": 0.0052,
+ "loss": 0.005,
"macro_f1": 0.6666666865348816,
"num_tokens": 6895563.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021544951014220715,
+ "routers_loss": 0.0022974940948188305,
"skip_count": 2.0,
"step": 4276,
"text_loss": 0.6695618629455566
@@ -40639,13 +40639,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.030517578125,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0006955159237507027,
"loss": 0.0064,
"macro_f1": 0.6666666865348816,
"num_tokens": 6898591.0,
"repeat_count": 0.0,
- "routers_loss": 0.008612595498561859,
+ "routers_loss": 0.00859096460044384,
"skip_count": 1.0,
"step": 4278,
"text_loss": 0.44284722208976746
@@ -40658,13 +40658,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0306396484375,
+ "grad_norm": 0.033935546875,
"learning_rate": 0.0006952310143781921,
- "loss": 0.0056,
+ "loss": 0.0058,
"macro_f1": 1.0,
"num_tokens": 6903119.0,
"repeat_count": 1.0,
- "routers_loss": 0.00829319842159748,
+ "routers_loss": 0.007919861935079098,
"skip_count": 3.0,
"step": 4280,
"text_loss": 0.5006136298179626
@@ -40677,13 +40677,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0238037109375,
+ "grad_norm": 0.0277099609375,
"learning_rate": 0.0006949460301927886,
- "loss": 0.0046,
+ "loss": 0.0045,
"macro_f1": 0.3333333432674408,
"num_tokens": 6906394.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009446305921301246,
+ "routers_loss": 0.0008476210059598088,
"skip_count": 0.0,
"step": 4282,
"text_loss": 0.8153555989265442
@@ -40696,13 +40696,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.048095703125,
"learning_rate": 0.0006946609713036985,
- "loss": 0.0082,
+ "loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 6909136.0,
"repeat_count": 0.0,
- "routers_loss": 0.007239636033773422,
+ "routers_loss": 0.006711610127240419,
"skip_count": 2.0,
"step": 4284,
"text_loss": 0.43136683106422424
@@ -40715,13 +40715,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0206298828125,
+ "grad_norm": 0.0185546875,
"learning_rate": 0.0006943758378201571,
- "loss": 0.0063,
+ "loss": 0.0059,
"macro_f1": 0.3333333432674408,
"num_tokens": 6912734.0,
"repeat_count": 0.0,
- "routers_loss": 0.003926573321223259,
+ "routers_loss": 0.0038677838165313005,
"skip_count": 0.0,
"step": 4286,
"text_loss": 0.2693749964237213
@@ -40734,13 +40734,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0245361328125,
+ "grad_norm": 0.02783203125,
"learning_rate": 0.0006940906298514278,
- "loss": 0.0044,
+ "loss": 0.0045,
"macro_f1": 0.3333333432674408,
"num_tokens": 6915838.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012871087528765202,
+ "routers_loss": 0.0012188015971332788,
"skip_count": 0.0,
"step": 4288,
"text_loss": 0.5809219479560852
@@ -40753,13 +40753,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025390625,
+ "grad_norm": 0.026123046875,
"learning_rate": 0.0006938053475068031,
- "loss": 0.0057,
+ "loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 6919225.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018292219610884786,
+ "routers_loss": 0.001955829095095396,
"skip_count": 0.0,
"step": 4290,
"text_loss": 0.5116089582443237
@@ -40772,13 +40772,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.07275390625,
+ "grad_norm": 0.11279296875,
"learning_rate": 0.0006935199908956037,
- "loss": 0.0074,
+ "loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 6922495.0,
"repeat_count": 1.0,
- "routers_loss": 0.0036494603846222162,
+ "routers_loss": 0.0035709093790501356,
"skip_count": 0.0,
"step": 4292,
"text_loss": 0.2745901644229889
@@ -40791,13 +40791,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025390625,
+ "grad_norm": 0.02587890625,
"learning_rate": 0.0006932345601271786,
- "loss": 0.0051,
+ "loss": 0.005,
"macro_f1": 0.3333333432674408,
"num_tokens": 6925317.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005956419045105577,
+ "routers_loss": 0.0005745319649577141,
"skip_count": 0.0,
"step": 4294,
"text_loss": 0.6039219498634338
@@ -40810,13 +40810,13 @@
"f1_execute": 0.9743589162826538,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
- "grad_norm": 0.052734375,
+ "grad_norm": 0.0693359375,
"learning_rate": 0.0006929490553109056,
- "loss": 0.0105,
+ "loss": 0.0107,
"macro_f1": 0.9247862696647644,
"num_tokens": 6928054.0,
"repeat_count": 3.0,
- "routers_loss": 0.05667201802134514,
+ "routers_loss": 0.061689916998147964,
"skip_count": 6.0,
"step": 4296,
"text_loss": 0.3904837667942047
@@ -40829,13 +40829,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0234375,
+ "grad_norm": 0.0240478515625,
"learning_rate": 0.0006926634765561907,
- "loss": 0.0036,
+ "loss": 0.0033,
"macro_f1": 0.3333333432674408,
"num_tokens": 6931348.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017167082987725735,
+ "routers_loss": 0.002007248578593135,
"skip_count": 0.0,
"step": 4298,
"text_loss": 0.5170742273330688
@@ -40848,13 +40848,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0299072265625,
+ "grad_norm": 0.0302734375,
"learning_rate": 0.000692377823972468,
- "loss": 0.0063,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 6934411.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005654593114741147,
+ "routers_loss": 0.0005786226247437298,
"skip_count": 0.0,
"step": 4300,
"text_loss": 0.8032443523406982
@@ -40867,13 +40867,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.029541015625,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0006920920976692004,
- "loss": 0.0072,
+ "loss": 0.0071,
"macro_f1": 0.3272727429866791,
"num_tokens": 6938153.0,
"repeat_count": 1.0,
- "routers_loss": 0.022815195843577385,
+ "routers_loss": 0.024602646008133888,
"skip_count": 0.0,
"step": 4302,
"text_loss": 0.446534663438797
@@ -40892,7 +40892,7 @@
"macro_f1": 0.6666666865348816,
"num_tokens": 6940731.0,
"repeat_count": 0.0,
- "routers_loss": 0.005607374478131533,
+ "routers_loss": 0.005759815219789743,
"skip_count": 2.0,
"step": 4304,
"text_loss": 0.15479247272014618
@@ -40905,13 +40905,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.040283203125,
+ "grad_norm": 0.04150390625,
"learning_rate": 0.0006915204243420214,
"loss": 0.0056,
"macro_f1": 0.6666666865348816,
"num_tokens": 6943246.0,
"repeat_count": 0.0,
- "routers_loss": 0.005993676837533712,
+ "routers_loss": 0.005315347574651241,
"skip_count": 1.0,
"step": 4306,
"text_loss": 0.22127842903137207
@@ -40924,13 +40924,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0225830078125,
+ "grad_norm": 0.0240478515625,
"learning_rate": 0.0006912344775371765,
- "loss": 0.0064,
+ "loss": 0.0063,
"macro_f1": 0.3333333432674408,
"num_tokens": 6947197.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010728619527071714,
+ "routers_loss": 0.0012061651796102524,
"skip_count": 0.0,
"step": 4308,
"text_loss": 0.7058854103088379
@@ -40943,13 +40943,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0303955078125,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0006909484574509191,
- "loss": 0.0068,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 6951817.0,
"repeat_count": 0.0,
- "routers_loss": 0.0027683766093105078,
+ "routers_loss": 0.0029203309677541256,
"skip_count": 0.0,
"step": 4310,
"text_loss": 0.6014000773429871
@@ -40962,13 +40962,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02587890625,
+ "grad_norm": 0.0245361328125,
"learning_rate": 0.0006906623641928525,
"loss": 0.0067,
"macro_f1": 0.6666666865348816,
"num_tokens": 6955094.0,
"repeat_count": 0.0,
- "routers_loss": 0.006130238063633442,
+ "routers_loss": 0.005703397560864687,
"skip_count": 2.0,
"step": 4312,
"text_loss": 0.5923848152160645
@@ -40981,13 +40981,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.053466796875,
+ "grad_norm": 0.08154296875,
"learning_rate": 0.0006903761978726084,
- "loss": 0.0074,
+ "loss": 0.0073,
"macro_f1": 1.0,
"num_tokens": 6958127.0,
"repeat_count": 1.0,
- "routers_loss": 0.005145471077412367,
+ "routers_loss": 0.004489895887672901,
"skip_count": 2.0,
"step": 4314,
"text_loss": 0.36911651492118835
@@ -41000,13 +41000,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.023681640625,
+ "grad_norm": 0.0223388671875,
"learning_rate": 0.000690089958599846,
- "loss": 0.0047,
+ "loss": 0.0046,
"macro_f1": 0.6666666865348816,
"num_tokens": 6960871.0,
"repeat_count": 0.0,
- "routers_loss": 0.004196064081043005,
+ "routers_loss": 0.003871412482112646,
"skip_count": 2.0,
"step": 4316,
"text_loss": 0.442545086145401
@@ -41019,13 +41019,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0294189453125,
+ "grad_norm": 0.0301513671875,
"learning_rate": 0.000689803646484253,
- "loss": 0.0059,
+ "loss": 0.0058,
"macro_f1": 1.0,
"num_tokens": 6963980.0,
"repeat_count": 1.0,
- "routers_loss": 0.007919433526694775,
+ "routers_loss": 0.008667866699397564,
"skip_count": 2.0,
"step": 4318,
"text_loss": 0.1987489014863968
@@ -41038,13 +41038,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0289306640625,
+ "grad_norm": 0.030517578125,
"learning_rate": 0.0006895172616355446,
- "loss": 0.0067,
+ "loss": 0.0069,
"macro_f1": 0.6666666865348816,
"num_tokens": 6967132.0,
"repeat_count": 1.0,
- "routers_loss": 0.008535753935575485,
+ "routers_loss": 0.00843339879065752,
"skip_count": 0.0,
"step": 4320,
"text_loss": 0.48267918825149536
@@ -41057,13 +41057,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.045654296875,
+ "grad_norm": 0.042236328125,
"learning_rate": 0.0006892308041634639,
- "loss": 0.0063,
+ "loss": 0.0064,
"macro_f1": 0.3333333432674408,
"num_tokens": 6969971.0,
"repeat_count": 0.0,
- "routers_loss": 0.00036565042682923377,
+ "routers_loss": 0.0004312851815484464,
"skip_count": 0.0,
"step": 4322,
"text_loss": 0.3662732243537903
@@ -41076,13 +41076,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.034912109375,
"learning_rate": 0.0006889442741777822,
- "loss": 0.006,
+ "loss": 0.0058,
"macro_f1": 0.6666666865348816,
"num_tokens": 6973114.0,
"repeat_count": 0.0,
- "routers_loss": 0.004728913307189941,
+ "routers_loss": 0.004588035400956869,
"skip_count": 3.0,
"step": 4324,
"text_loss": 0.6707104444503784
@@ -41095,13 +41095,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.047607421875,
"learning_rate": 0.0006886576717882982,
"loss": 0.0057,
"macro_f1": 0.8817967176437378,
"num_tokens": 6976013.0,
"repeat_count": 2.0,
- "routers_loss": 0.06778892129659653,
+ "routers_loss": 0.0687296912074089,
"skip_count": 3.0,
"step": 4326,
"text_loss": 0.1662217676639557
@@ -41114,13 +41114,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.034912109375,
"learning_rate": 0.0006883709971048384,
- "loss": 0.0093,
+ "loss": 0.0091,
"macro_f1": 0.3333333432674408,
"num_tokens": 6979200.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030250558629631996,
+ "routers_loss": 0.002950174268335104,
"skip_count": 0.0,
"step": 4328,
"text_loss": 0.21168152987957
@@ -41133,13 +41133,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02294921875,
+ "grad_norm": 0.031494140625,
"learning_rate": 0.0006880842502372572,
"loss": 0.0065,
"macro_f1": 0.3333333432674408,
"num_tokens": 6982640.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033437241800129414,
+ "routers_loss": 0.0032158740796148777,
"skip_count": 0.0,
"step": 4330,
"text_loss": 0.26790961623191833
@@ -41152,13 +41152,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.027587890625,
+ "grad_norm": 0.026611328125,
"learning_rate": 0.0006877974312954365,
- "loss": 0.0079,
+ "loss": 0.0077,
"macro_f1": 0.3333333432674408,
"num_tokens": 6985917.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005326211685314775,
+ "routers_loss": 0.0005083635332994163,
"skip_count": 0.0,
"step": 4332,
"text_loss": 0.9736502170562744
@@ -41177,7 +41177,7 @@
"macro_f1": 0.32098764181137085,
"num_tokens": 6988388.0,
"repeat_count": 0.0,
- "routers_loss": 0.034170545637607574,
+ "routers_loss": 0.03473830223083496,
"skip_count": 2.0,
"step": 4334,
"text_loss": 0.21662230789661407
@@ -41190,13 +41190,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0006872235776287425,
- "loss": 0.0092,
+ "loss": 0.0091,
"macro_f1": 0.3333333432674408,
"num_tokens": 6991360.0,
"repeat_count": 0.0,
- "routers_loss": 0.002001045737415552,
+ "routers_loss": 0.002206524135544896,
"skip_count": 0.0,
"step": 4336,
"text_loss": 0.6026972532272339
@@ -41209,13 +41209,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.030517578125,
+ "grad_norm": 0.038330078125,
"learning_rate": 0.0006869365431237711,
"loss": 0.0049,
"macro_f1": 0.6666666865348816,
"num_tokens": 6995080.0,
"repeat_count": 1.0,
- "routers_loss": 0.0009856362594291568,
+ "routers_loss": 0.000969731598161161,
"skip_count": 0.0,
"step": 4338,
"text_loss": 0.5833017230033875
@@ -41228,13 +41228,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.0283203125,
+ "grad_norm": 0.0281982421875,
"learning_rate": 0.0006866494369843635,
"loss": 0.0054,
"macro_f1": 0.8820862174034119,
"num_tokens": 6998526.0,
"repeat_count": 2.0,
- "routers_loss": 0.013545103371143341,
+ "routers_loss": 0.013962293043732643,
"skip_count": 2.0,
"step": 4340,
"text_loss": 0.41465985774993896
@@ -41247,13 +41247,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0302734375,
+ "grad_norm": 0.0294189453125,
"learning_rate": 0.0006863622593205397,
- "loss": 0.0049,
+ "loss": 0.005,
"macro_f1": 0.6666666865348816,
"num_tokens": 7001494.0,
"repeat_count": 0.0,
- "routers_loss": 0.006991719361394644,
+ "routers_loss": 0.0064964210614562035,
"skip_count": 3.0,
"step": 4342,
"text_loss": 0.3774271011352539
@@ -41266,13 +41266,13 @@
"f1_execute": 0.9767441749572754,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.029052734375,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0006860750102423464,
- "loss": 0.0063,
+ "loss": 0.0062,
"macro_f1": 0.6589147448539734,
"num_tokens": 7005544.0,
"repeat_count": 1.0,
- "routers_loss": 0.02598598413169384,
+ "routers_loss": 0.023250726982951164,
"skip_count": 6.0,
"step": 4344,
"text_loss": 0.2732464373111725
@@ -41285,13 +41285,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0238037109375,
+ "grad_norm": 0.0250244140625,
"learning_rate": 0.0006857876898598582,
"loss": 0.0051,
"macro_f1": 0.6666666865348816,
"num_tokens": 7008847.0,
"repeat_count": 0.0,
- "routers_loss": 0.0039848871529102325,
+ "routers_loss": 0.0038170060142874718,
"skip_count": 2.0,
"step": 4346,
"text_loss": 0.29610875248908997
@@ -41306,11 +41306,11 @@
"f1_skip": 0.0,
"grad_norm": 0.0303955078125,
"learning_rate": 0.0006855002982831769,
- "loss": 0.0074,
+ "loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 7012577.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012735783820971847,
+ "routers_loss": 0.0012856025714427233,
"skip_count": 0.0,
"step": 4348,
"text_loss": 0.6098502278327942
@@ -41323,13 +41323,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03857421875,
+ "grad_norm": 0.061767578125,
"learning_rate": 0.0006852128356224314,
"loss": 0.0066,
"macro_f1": 0.6666666865348816,
"num_tokens": 7015650.0,
"repeat_count": 0.0,
- "routers_loss": 0.00863664597272873,
+ "routers_loss": 0.008162742480635643,
"skip_count": 1.0,
"step": 4350,
"text_loss": 0.20868146419525146
@@ -41342,13 +41342,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.023681640625,
+ "grad_norm": 0.023193359375,
"learning_rate": 0.0006849253019877778,
- "loss": 0.0075,
+ "loss": 0.0074,
"macro_f1": 0.8817967176437378,
"num_tokens": 7019925.0,
"repeat_count": 2.0,
- "routers_loss": 0.023779816925525665,
+ "routers_loss": 0.023544032126665115,
"skip_count": 3.0,
"step": 4352,
"text_loss": 0.628226101398468
@@ -41361,13 +41361,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0517578125,
+ "grad_norm": 0.06298828125,
"learning_rate": 0.0006846376974893996,
"loss": 0.008,
"macro_f1": 0.6666666865348816,
"num_tokens": 7023130.0,
"repeat_count": 0.0,
- "routers_loss": 0.004940718412399292,
+ "routers_loss": 0.004982319660484791,
"skip_count": 2.0,
"step": 4354,
"text_loss": 0.7037544250488281
@@ -41380,13 +41380,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0673828125,
+ "grad_norm": 0.0654296875,
"learning_rate": 0.0006843500222375074,
"loss": 0.0066,
"macro_f1": 0.6666666865348816,
"num_tokens": 7026422.0,
"repeat_count": 1.0,
- "routers_loss": 0.004191596060991287,
+ "routers_loss": 0.004015266429632902,
"skip_count": 0.0,
"step": 4356,
"text_loss": 0.22352729737758636
@@ -41399,13 +41399,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 1.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.042724609375,
"learning_rate": 0.0006840622763423391,
"loss": 0.0071,
"macro_f1": 0.9449735879898071,
"num_tokens": 7029077.0,
"repeat_count": 2.0,
- "routers_loss": 0.019883066415786743,
+ "routers_loss": 0.021162014454603195,
"skip_count": 4.0,
"step": 4358,
"text_loss": 0.2431403249502182
@@ -41418,13 +41418,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0361328125,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0006837744599141591,
"loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 7032582.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007547057466581464,
+ "routers_loss": 0.0007044129306450486,
"skip_count": 0.0,
"step": 4360,
"text_loss": 0.26667487621307373
@@ -41437,13 +41437,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.041748046875,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0006834865730632594,
- "loss": 0.0067,
+ "loss": 0.0066,
"macro_f1": 0.6666666865348816,
"num_tokens": 7035642.0,
"repeat_count": 0.0,
- "routers_loss": 0.0069348798133432865,
+ "routers_loss": 0.0067853196524083614,
"skip_count": 1.0,
"step": 4362,
"text_loss": 0.20965275168418884
@@ -41456,13 +41456,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0267333984375,
+ "grad_norm": 0.0281982421875,
"learning_rate": 0.0006831986158999588,
- "loss": 0.0064,
+ "loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 7038601.0,
"repeat_count": 0.0,
- "routers_loss": 0.008647902868688107,
+ "routers_loss": 0.00899333506822586,
"skip_count": 2.0,
"step": 4364,
"text_loss": 0.26860126852989197
@@ -41475,13 +41475,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041748046875,
+ "grad_norm": 0.039794921875,
"learning_rate": 0.000682910588534603,
- "loss": 0.0089,
+ "loss": 0.0087,
"macro_f1": 0.3333333432674408,
"num_tokens": 7042274.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019517095061019063,
+ "routers_loss": 0.0019194348715245724,
"skip_count": 0.0,
"step": 4366,
"text_loss": 0.14046810567378998
@@ -41494,13 +41494,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.03125,
+ "grad_norm": 0.034423828125,
"learning_rate": 0.0006826224910775647,
- "loss": 0.006,
+ "loss": 0.0062,
"macro_f1": 1.0,
"num_tokens": 7045268.0,
"repeat_count": 1.0,
- "routers_loss": 0.007441094610840082,
+ "routers_loss": 0.006915684789419174,
"skip_count": 3.0,
"step": 4368,
"text_loss": 0.5900366306304932
@@ -41513,13 +41513,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0006823343236392432,
- "loss": 0.0072,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 7049407.0,
"repeat_count": 0.0,
- "routers_loss": 0.00144639378413558,
+ "routers_loss": 0.001678116386756301,
"skip_count": 0.0,
"step": 4370,
"text_loss": 0.7868026494979858
@@ -41532,13 +41532,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.024169921875,
+ "grad_norm": 0.0274658203125,
"learning_rate": 0.000682046086330065,
"loss": 0.0075,
"macro_f1": 0.3333333432674408,
"num_tokens": 7052783.0,
"repeat_count": 0.0,
- "routers_loss": 0.0003659129433799535,
+ "routers_loss": 0.0003459530707914382,
"skip_count": 0.0,
"step": 4372,
"text_loss": 0.6349637508392334
@@ -41551,13 +41551,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0281982421875,
+ "grad_norm": 0.0279541015625,
"learning_rate": 0.0006817577792604831,
- "loss": 0.0052,
+ "loss": 0.0051,
"macro_f1": 0.3333333432674408,
"num_tokens": 7055757.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012188151013106108,
+ "routers_loss": 0.0011729507241398096,
"skip_count": 0.0,
"step": 4374,
"text_loss": 0.43258991837501526
@@ -41570,13 +41570,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.03564453125,
"learning_rate": 0.0006814694025409773,
"loss": 0.0088,
"macro_f1": 0.3333333432674408,
"num_tokens": 7058684.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006639147759415209,
+ "routers_loss": 0.0006664610700681806,
"skip_count": 0.0,
"step": 4376,
"text_loss": 0.5307940244674683
@@ -41589,13 +41589,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.07763671875,
+ "grad_norm": 0.091796875,
"learning_rate": 0.0006811809562820542,
- "loss": 0.0081,
+ "loss": 0.0083,
"macro_f1": 0.6666666865348816,
"num_tokens": 7061902.0,
"repeat_count": 0.0,
- "routers_loss": 0.004041146486997604,
+ "routers_loss": 0.004595907870680094,
"skip_count": 2.0,
"step": 4378,
"text_loss": 0.5830042362213135
@@ -41608,13 +41608,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.027587890625,
+ "grad_norm": 0.0274658203125,
"learning_rate": 0.0006808924405942467,
"loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 7065100.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028930313419550657,
+ "routers_loss": 0.0032026609405875206,
"skip_count": 0.0,
"step": 4380,
"text_loss": 0.20797798037528992
@@ -41627,13 +41627,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0186767578125,
+ "grad_norm": 0.0184326171875,
"learning_rate": 0.0006806038555881148,
- "loss": 0.0041,
+ "loss": 0.004,
"macro_f1": 0.6666666865348816,
"num_tokens": 7068556.0,
"repeat_count": 1.0,
- "routers_loss": 0.0027319532819092274,
+ "routers_loss": 0.0024626904632896185,
"skip_count": 0.0,
"step": 4382,
"text_loss": 0.5791074633598328
@@ -41646,13 +41646,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.041015625,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.0006803152013742448,
- "loss": 0.0077,
+ "loss": 0.0075,
"macro_f1": 1.0,
"num_tokens": 7071284.0,
"repeat_count": 1.0,
- "routers_loss": 0.011207868345081806,
+ "routers_loss": 0.010723610408604145,
"skip_count": 2.0,
"step": 4384,
"text_loss": 0.13227243721485138
@@ -41665,13 +41665,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0299072265625,
+ "grad_norm": 0.029052734375,
"learning_rate": 0.0006800264780632495,
- "loss": 0.0054,
+ "loss": 0.0053,
"macro_f1": 0.6666666865348816,
"num_tokens": 7074428.0,
"repeat_count": 1.0,
- "routers_loss": 0.001005658763460815,
+ "routers_loss": 0.0011231007520109415,
"skip_count": 0.0,
"step": 4386,
"text_loss": 0.4360627233982086
@@ -41684,13 +41684,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0296630859375,
+ "grad_norm": 0.0291748046875,
"learning_rate": 0.0006797376857657681,
- "loss": 0.0083,
+ "loss": 0.0081,
"macro_f1": 1.0,
"num_tokens": 7078313.0,
"repeat_count": 2.0,
- "routers_loss": 0.00910002738237381,
+ "routers_loss": 0.008419238030910492,
"skip_count": 1.0,
"step": 4388,
"text_loss": 0.5183924436569214
@@ -41703,13 +41703,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04931640625,
+ "grad_norm": 0.046142578125,
"learning_rate": 0.0006794488245924664,
- "loss": 0.0085,
+ "loss": 0.0084,
"macro_f1": 1.0,
"num_tokens": 7081258.0,
"repeat_count": 1.0,
- "routers_loss": 0.0076475366950035095,
+ "routers_loss": 0.006582668516784906,
"skip_count": 3.0,
"step": 4390,
"text_loss": 0.2797473669052124
@@ -41722,13 +41722,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.046630859375,
"learning_rate": 0.0006791598946540368,
"loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 7084527.0,
"repeat_count": 0.0,
- "routers_loss": 0.005813235882669687,
+ "routers_loss": 0.00557357631623745,
"skip_count": 2.0,
"step": 4392,
"text_loss": 0.39495575428009033
@@ -41741,13 +41741,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.06005859375,
"learning_rate": 0.0006788708960611975,
"loss": 0.0054,
"macro_f1": 0.3333333432674408,
"num_tokens": 7087675.0,
"repeat_count": 0.0,
- "routers_loss": 0.007134446874260902,
+ "routers_loss": 0.007155992556363344,
"skip_count": 0.0,
"step": 4394,
"text_loss": 0.3785299062728882
@@ -41760,13 +41760,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0172119140625,
+ "grad_norm": 0.01806640625,
"learning_rate": 0.0006785818289246934,
"loss": 0.0045,
"macro_f1": 0.3333333432674408,
"num_tokens": 7090171.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008882717229425907,
+ "routers_loss": 0.0009265039698220789,
"skip_count": 0.0,
"step": 4396,
"text_loss": 0.42634522914886475
@@ -41779,13 +41779,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.042236328125,
+ "grad_norm": 0.046142578125,
"learning_rate": 0.0006782926933552955,
"loss": 0.0059,
"macro_f1": 1.0,
"num_tokens": 7092529.0,
"repeat_count": 1.0,
- "routers_loss": 0.008333612233400345,
+ "routers_loss": 0.008679097518324852,
"skip_count": 7.0,
"step": 4398,
"text_loss": 0.4283660054206848
@@ -41798,13 +41798,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042236328125,
+ "grad_norm": 0.042724609375,
"learning_rate": 0.0006780034894638014,
- "loss": 0.006,
+ "loss": 0.0058,
"macro_f1": 0.3333333432674408,
"num_tokens": 7095141.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026251052040606737,
+ "routers_loss": 0.002363949315622449,
"skip_count": 0.0,
"step": 4400,
"text_loss": 0.481539249420166
@@ -41817,13 +41817,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.000677714217361034,
- "loss": 0.0055,
+ "loss": 0.0057,
"macro_f1": 0.6666666865348816,
"num_tokens": 7098208.0,
"repeat_count": 0.0,
- "routers_loss": 0.003755744779482484,
+ "routers_loss": 0.004005146212875843,
"skip_count": 3.0,
"step": 4402,
"text_loss": 0.6443291902542114
@@ -41836,13 +41836,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0252685546875,
+ "grad_norm": 0.0306396484375,
"learning_rate": 0.0006774248771578435,
"loss": 0.0051,
"macro_f1": 0.3333333432674408,
"num_tokens": 7101681.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028396346606314182,
+ "routers_loss": 0.0026864963583648205,
"skip_count": 0.0,
"step": 4404,
"text_loss": 0.16315312683582306
@@ -41855,13 +41855,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 1.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.0006771354689651054,
- "loss": 0.0052,
+ "loss": 0.005,
"macro_f1": 0.9449735879898071,
"num_tokens": 7104719.0,
"repeat_count": 2.0,
- "routers_loss": 0.02745615690946579,
+ "routers_loss": 0.02719845622777939,
"skip_count": 4.0,
"step": 4406,
"text_loss": 0.37855592370033264
@@ -41874,13 +41874,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0289306640625,
+ "grad_norm": 0.0284423828125,
"learning_rate": 0.0006768459928937213,
- "loss": 0.005,
+ "loss": 0.0048,
"macro_f1": 0.3333333432674408,
"num_tokens": 7108697.0,
"repeat_count": 0.0,
- "routers_loss": 0.010080067440867424,
+ "routers_loss": 0.010488593950867653,
"skip_count": 0.0,
"step": 4408,
"text_loss": 0.23133711516857147
@@ -41893,13 +41893,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0247802734375,
+ "grad_norm": 0.02392578125,
"learning_rate": 0.0006765564490546193,
- "loss": 0.0052,
+ "loss": 0.0053,
"macro_f1": 0.6666666865348816,
"num_tokens": 7111426.0,
"repeat_count": 1.0,
- "routers_loss": 0.001801682054065168,
+ "routers_loss": 0.0013637891970574856,
"skip_count": 0.0,
"step": 4410,
"text_loss": 0.41399383544921875
@@ -41912,13 +41912,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.0732421875,
"learning_rate": 0.0006762668375587528,
- "loss": 0.0068,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 7114241.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009764294954948127,
+ "routers_loss": 0.000900395680218935,
"skip_count": 0.0,
"step": 4412,
"text_loss": 0.6460412740707397
@@ -41931,13 +41931,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.0498046875,
"learning_rate": 0.0006759771585171016,
"loss": 0.0043,
"macro_f1": 0.3333333432674408,
"num_tokens": 7117031.0,
"repeat_count": 0.0,
- "routers_loss": 0.002657619072124362,
+ "routers_loss": 0.0024001260753721,
"skip_count": 0.0,
"step": 4414,
"text_loss": 0.7645824551582336
@@ -41950,13 +41950,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0303955078125,
+ "grad_norm": 0.0306396484375,
"learning_rate": 0.0006756874120406714,
"loss": 0.0058,
"macro_f1": 1.0,
"num_tokens": 7120766.0,
"repeat_count": 3.0,
- "routers_loss": 0.005801939871162176,
+ "routers_loss": 0.005034091416746378,
"skip_count": 4.0,
"step": 4416,
"text_loss": 0.31753066182136536
@@ -41969,13 +41969,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0006753975982404934,
"loss": 0.0072,
"macro_f1": 0.3333333432674408,
"num_tokens": 7125243.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026111488696187735,
+ "routers_loss": 0.002483269665390253,
"skip_count": 0.0,
"step": 4418,
"text_loss": 0.5304268002510071
@@ -41988,13 +41988,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0281982421875,
+ "grad_norm": 0.027099609375,
"learning_rate": 0.0006751077172276249,
- "loss": 0.0055,
+ "loss": 0.0052,
"macro_f1": 0.3272727429866791,
"num_tokens": 7127795.0,
"repeat_count": 0.0,
- "routers_loss": 0.028494317084550858,
+ "routers_loss": 0.02676006779074669,
"skip_count": 1.0,
"step": 4420,
"text_loss": 0.22011354565620422
@@ -42007,13 +42007,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.06396484375,
+ "grad_norm": 0.06201171875,
"learning_rate": 0.000674817769113149,
- "loss": 0.0061,
+ "loss": 0.0059,
"macro_f1": 0.6666666865348816,
"num_tokens": 7130837.0,
"repeat_count": 0.0,
- "routers_loss": 0.003031681990250945,
+ "routers_loss": 0.003267093561589718,
"skip_count": 2.0,
"step": 4422,
"text_loss": 0.2906076908111572
@@ -42026,13 +42026,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.0284423828125,
+ "grad_norm": 0.027099609375,
"learning_rate": 0.000674527754008174,
- "loss": 0.0047,
+ "loss": 0.0045,
"macro_f1": 0.5934640765190125,
"num_tokens": 7135090.0,
"repeat_count": 0.0,
- "routers_loss": 0.023750508204102516,
+ "routers_loss": 0.022510390728712082,
"skip_count": 3.0,
"step": 4424,
"text_loss": 0.2544902563095093
@@ -42045,13 +42045,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0006742376720238345,
"loss": 0.0034,
"macro_f1": 0.3333333432674408,
"num_tokens": 7138751.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012101450702175498,
+ "routers_loss": 0.0011178571730852127,
"skip_count": 0.0,
"step": 4426,
"text_loss": 0.6811438798904419
@@ -42064,13 +42064,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0216064453125,
+ "grad_norm": 0.0238037109375,
"learning_rate": 0.0006739475232712904,
- "loss": 0.0035,
+ "loss": 0.0036,
"macro_f1": 1.0,
"num_tokens": 7141762.0,
"repeat_count": 2.0,
- "routers_loss": 0.005393387749791145,
+ "routers_loss": 0.005595206283032894,
"skip_count": 1.0,
"step": 4428,
"text_loss": 0.38743990659713745
@@ -42083,13 +42083,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03173828125,
+ "grad_norm": 0.033203125,
"learning_rate": 0.0006736573078617272,
- "loss": 0.0066,
+ "loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 7145235.0,
"repeat_count": 0.0,
- "routers_loss": 0.0029694747645407915,
+ "routers_loss": 0.002793942578136921,
"skip_count": 2.0,
"step": 4430,
"text_loss": 0.21894219517707825
@@ -42102,13 +42102,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.048828125,
"learning_rate": 0.0006733670259063561,
"loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 7149042.0,
"repeat_count": 0.0,
- "routers_loss": 0.006469822954386473,
+ "routers_loss": 0.006146818865090609,
"skip_count": 3.0,
"step": 4432,
"text_loss": 0.17822015285491943
@@ -42121,13 +42121,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.042236328125,
"learning_rate": 0.0006730766775164136,
- "loss": 0.0059,
+ "loss": 0.0061,
"macro_f1": 0.5492662787437439,
"num_tokens": 7152166.0,
"repeat_count": 0.0,
- "routers_loss": 0.026202494278550148,
+ "routers_loss": 0.026045087724924088,
"skip_count": 2.0,
"step": 4434,
"text_loss": 0.2910420000553131
@@ -42140,13 +42140,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0308837890625,
+ "grad_norm": 0.03466796875,
"learning_rate": 0.0006727862628031618,
- "loss": 0.0074,
+ "loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 7155506.0,
"repeat_count": 2.0,
- "routers_loss": 0.002748608123511076,
+ "routers_loss": 0.0022973387967795134,
"skip_count": 0.0,
"step": 4436,
"text_loss": 0.3502544164657593
@@ -42159,13 +42159,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.021484375,
+ "grad_norm": 0.022705078125,
"learning_rate": 0.0006724957818778882,
"loss": 0.0061,
"macro_f1": 0.6666666865348816,
"num_tokens": 7158739.0,
"repeat_count": 0.0,
- "routers_loss": 0.002528413198888302,
+ "routers_loss": 0.002357073128223419,
"skip_count": 1.0,
"step": 4438,
"text_loss": 0.26200664043426514
@@ -42178,13 +42178,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0279541015625,
+ "grad_norm": 0.0277099609375,
"learning_rate": 0.0006722052348519054,
- "loss": 0.0095,
+ "loss": 0.0093,
"macro_f1": 0.3333333432674408,
"num_tokens": 7161776.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005445044371299446,
+ "routers_loss": 0.0005521026905626059,
"skip_count": 0.0,
"step": 4440,
"text_loss": 0.3922915458679199
@@ -42197,13 +42197,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04296875,
+ "grad_norm": 0.044189453125,
"learning_rate": 0.000671914621836552,
- "loss": 0.0108,
+ "loss": 0.0106,
"macro_f1": 0.6666666865348816,
"num_tokens": 7164763.0,
"repeat_count": 0.0,
- "routers_loss": 0.008194289170205593,
+ "routers_loss": 0.007691344246268272,
"skip_count": 2.0,
"step": 4442,
"text_loss": 0.6021351218223572
@@ -42216,13 +42216,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0308837890625,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.000671623942943191,
- "loss": 0.0075,
+ "loss": 0.0073,
"macro_f1": 0.3333333432674408,
"num_tokens": 7167924.0,
"repeat_count": 0.0,
- "routers_loss": 0.0033410112373530865,
+ "routers_loss": 0.0032181134447455406,
"skip_count": 0.0,
"step": 4444,
"text_loss": 0.23639555275440216
@@ -42235,13 +42235,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0286865234375,
+ "grad_norm": 0.030029296875,
"learning_rate": 0.0006713331982832113,
- "loss": 0.0073,
+ "loss": 0.0071,
"macro_f1": 0.3272727429866791,
"num_tokens": 7170743.0,
"repeat_count": 1.0,
- "routers_loss": 0.024495115503668785,
+ "routers_loss": 0.024979131296277046,
"skip_count": 0.0,
"step": 4446,
"text_loss": 0.4957772493362427
@@ -42254,13 +42254,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041748046875,
+ "grad_norm": 0.043212890625,
"learning_rate": 0.0006710423879680271,
- "loss": 0.0069,
+ "loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 7174660.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026295294519513845,
+ "routers_loss": 0.002571308286860585,
"skip_count": 0.0,
"step": 4448,
"text_loss": 0.47968071699142456
@@ -42273,13 +42273,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0308837890625,
+ "grad_norm": 0.031494140625,
"learning_rate": 0.000670751512109077,
- "loss": 0.0063,
+ "loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 7177965.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024814927019178867,
+ "routers_loss": 0.00212799571454525,
"skip_count": 0.0,
"step": 4450,
"text_loss": 0.6550716161727905
@@ -42292,13 +42292,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.041748046875,
"learning_rate": 0.0006704605708178252,
- "loss": 0.0105,
+ "loss": 0.0107,
"macro_f1": 0.6666666865348816,
"num_tokens": 7181512.0,
"repeat_count": 0.0,
- "routers_loss": 0.004174043424427509,
+ "routers_loss": 0.004176430404186249,
"skip_count": 1.0,
"step": 4452,
"text_loss": 0.36959558725357056
@@ -42311,13 +42311,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0267333984375,
+ "grad_norm": 0.024658203125,
"learning_rate": 0.0006701695642057613,
- "loss": 0.005,
+ "loss": 0.0048,
"macro_f1": 0.3333333432674408,
"num_tokens": 7184555.0,
"repeat_count": 0.0,
- "routers_loss": 0.001206343644298613,
+ "routers_loss": 0.0010968588758260012,
"skip_count": 0.0,
"step": 4454,
"text_loss": 0.6686749458312988
@@ -42330,13 +42330,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.037353515625,
"learning_rate": 0.0006698784923843993,
- "loss": 0.0077,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 7187474.0,
"repeat_count": 0.0,
- "routers_loss": 0.001408674637787044,
+ "routers_loss": 0.0014241471653804183,
"skip_count": 0.0,
"step": 4456,
"text_loss": 0.6147221922874451
@@ -42349,13 +42349,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.026611328125,
+ "grad_norm": 0.0306396484375,
"learning_rate": 0.0006695873554652784,
- "loss": 0.0071,
+ "loss": 0.0073,
"macro_f1": 0.3333333432674408,
"num_tokens": 7190649.0,
"repeat_count": 0.0,
- "routers_loss": 0.008512571454048157,
+ "routers_loss": 0.008801907300949097,
"skip_count": 0.0,
"step": 4458,
"text_loss": 0.26381927728652954
@@ -42368,13 +42368,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.048095703125,
+ "grad_norm": 0.04638671875,
"learning_rate": 0.0006692961535599634,
"loss": 0.0079,
"macro_f1": 0.6666666865348816,
"num_tokens": 7193961.0,
"repeat_count": 0.0,
- "routers_loss": 0.009439903311431408,
+ "routers_loss": 0.009027508087456226,
"skip_count": 1.0,
"step": 4460,
"text_loss": 0.1926470547914505
@@ -42387,13 +42387,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0006690048867800427,
- "loss": 0.0088,
+ "loss": 0.0087,
"macro_f1": 0.3333333432674408,
"num_tokens": 7197456.0,
"repeat_count": 0.0,
- "routers_loss": 0.002294899197295308,
+ "routers_loss": 0.0022697453387081623,
"skip_count": 0.0,
"step": 4462,
"text_loss": 0.6736721992492676
@@ -42406,13 +42406,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.024658203125,
+ "grad_norm": 0.0238037109375,
"learning_rate": 0.0006687135552371305,
"loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 7200290.0,
"repeat_count": 0.0,
- "routers_loss": 0.006510137114673853,
+ "routers_loss": 0.006747903767973185,
"skip_count": 1.0,
"step": 4464,
"text_loss": 0.2026437371969223
@@ -42427,11 +42427,11 @@
"f1_skip": 0.0,
"grad_norm": 0.032470703125,
"learning_rate": 0.0006684221590428657,
- "loss": 0.0067,
+ "loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 7203320.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010735326213762164,
+ "routers_loss": 0.0011565096210688353,
"skip_count": 0.0,
"step": 4466,
"text_loss": 0.7587730288505554
@@ -42444,13 +42444,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.052734375,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0006681306983089121,
- "loss": 0.0084,
+ "loss": 0.0083,
"macro_f1": 0.8820862174034119,
"num_tokens": 7206411.0,
"repeat_count": 2.0,
- "routers_loss": 0.02467990666627884,
+ "routers_loss": 0.023645581677556038,
"skip_count": 2.0,
"step": 4468,
"text_loss": 0.8981561660766602
@@ -42463,13 +42463,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0380859375,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.0006678391731469575,
- "loss": 0.0071,
+ "loss": 0.007,
"macro_f1": 0.3333333432674408,
"num_tokens": 7209421.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035951859317719936,
+ "routers_loss": 0.0035848666448146105,
"skip_count": 0.0,
"step": 4470,
"text_loss": 0.1522839516401291
@@ -42482,13 +42482,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0311279296875,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0006675475836687152,
- "loss": 0.007,
+ "loss": 0.0069,
"macro_f1": 1.0,
"num_tokens": 7212267.0,
"repeat_count": 1.0,
- "routers_loss": 0.004971543326973915,
+ "routers_loss": 0.005046425387263298,
"skip_count": 1.0,
"step": 4472,
"text_loss": 0.46007999777793884
@@ -42501,13 +42501,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.028076171875,
+ "grad_norm": 0.02685546875,
"learning_rate": 0.0006672559299859228,
- "loss": 0.0062,
+ "loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 7215195.0,
"repeat_count": 0.0,
- "routers_loss": 0.002104618586599827,
+ "routers_loss": 0.0019333874806761742,
"skip_count": 0.0,
"step": 4474,
"text_loss": 1.0859547853469849
@@ -42520,13 +42520,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.040283203125,
"learning_rate": 0.0006669642122103423,
- "loss": 0.0045,
+ "loss": 0.0044,
"macro_f1": 0.3333333432674408,
"num_tokens": 7217941.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005096147069707513,
+ "routers_loss": 0.0005401032394729555,
"skip_count": 0.0,
"step": 4476,
"text_loss": 0.9754356145858765
@@ -42545,7 +42545,7 @@
"macro_f1": 0.3272727429866791,
"num_tokens": 7222494.0,
"repeat_count": 1.0,
- "routers_loss": 0.016167305409908295,
+ "routers_loss": 0.015569722279906273,
"skip_count": 0.0,
"step": 4478,
"text_loss": 0.2896423637866974
@@ -42558,13 +42558,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03271484375,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.0006663805848279898,
- "loss": 0.0057,
+ "loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 7225292.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021911219228059053,
+ "routers_loss": 0.0020135147497057915,
"skip_count": 0.0,
"step": 4480,
"text_loss": 0.8492724299430847
@@ -42577,13 +42577,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0191650390625,
+ "grad_norm": 0.0194091796875,
"learning_rate": 0.0006660886754448648,
- "loss": 0.006,
+ "loss": 0.0058,
"macro_f1": 0.6666666865348816,
"num_tokens": 7229184.0,
"repeat_count": 1.0,
- "routers_loss": 0.002788309706375003,
+ "routers_loss": 0.002355351345613599,
"skip_count": 0.0,
"step": 4482,
"text_loss": 0.189764603972435
@@ -42596,13 +42596,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0201416015625,
+ "grad_norm": 0.02099609375,
"learning_rate": 0.0006657967024162459,
"loss": 0.0045,
"macro_f1": 0.3333333432674408,
"num_tokens": 7232906.0,
"repeat_count": 0.0,
- "routers_loss": 0.003091001184657216,
+ "routers_loss": 0.003044391982257366,
"skip_count": 0.0,
"step": 4484,
"text_loss": 0.4239847660064697
@@ -42615,13 +42615,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0260009765625,
+ "grad_norm": 0.0269775390625,
"learning_rate": 0.0006655046658540179,
- "loss": 0.0048,
+ "loss": 0.0047,
"macro_f1": 0.6666666865348816,
"num_tokens": 7235996.0,
"repeat_count": 0.0,
- "routers_loss": 0.006288980133831501,
+ "routers_loss": 0.00602696230635047,
"skip_count": 2.0,
"step": 4486,
"text_loss": 0.217103973031044
@@ -42634,13 +42634,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0179443359375,
+ "grad_norm": 0.0169677734375,
"learning_rate": 0.0006652125658700896,
- "loss": 0.0032,
+ "loss": 0.0031,
"macro_f1": 0.6666666865348816,
"num_tokens": 7238882.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017161039868369699,
+ "routers_loss": 0.001470155781134963,
"skip_count": 1.0,
"step": 4488,
"text_loss": 0.6090770363807678
@@ -42653,13 +42653,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.03759765625,
"learning_rate": 0.0006649204025763945,
- "loss": 0.0057,
+ "loss": 0.0055,
"macro_f1": 1.0,
"num_tokens": 7241815.0,
"repeat_count": 1.0,
- "routers_loss": 0.008624191395938396,
+ "routers_loss": 0.008737480267882347,
"skip_count": 2.0,
"step": 4490,
"text_loss": 0.48314425349235535
@@ -42672,13 +42672,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.017333984375,
+ "grad_norm": 0.0177001953125,
"learning_rate": 0.0006646281760848902,
- "loss": 0.004,
+ "loss": 0.0038,
"macro_f1": 0.3333333432674408,
"num_tokens": 7244848.0,
"repeat_count": 0.0,
- "routers_loss": 0.00083601736696437,
+ "routers_loss": 0.0008257135050371289,
"skip_count": 0.0,
"step": 4492,
"text_loss": 0.5884748101234436
@@ -42693,11 +42693,11 @@
"f1_skip": 0.0,
"grad_norm": 0.0228271484375,
"learning_rate": 0.0006643358865075581,
- "loss": 0.0057,
+ "loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 7247930.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016617088112980127,
+ "routers_loss": 0.0016262239078059793,
"skip_count": 0.0,
"step": 4494,
"text_loss": 0.21444730460643768
@@ -42710,13 +42710,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.0299072265625,
"learning_rate": 0.0006640435339564042,
- "loss": 0.0075,
+ "loss": 0.0073,
"macro_f1": 0.3333333432674408,
"num_tokens": 7251776.0,
"repeat_count": 0.0,
- "routers_loss": 0.001377894077450037,
+ "routers_loss": 0.001315156347118318,
"skip_count": 0.0,
"step": 4496,
"text_loss": 0.6890370845794678
@@ -42729,13 +42729,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.03662109375,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0006637511185434588,
"loss": 0.0091,
"macro_f1": 1.0,
"num_tokens": 7255070.0,
"repeat_count": 1.0,
- "routers_loss": 0.007681882940232754,
+ "routers_loss": 0.007614497095346451,
"skip_count": 3.0,
"step": 4498,
"text_loss": 0.516417920589447
@@ -42748,13 +42748,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0245361328125,
+ "grad_norm": 0.0238037109375,
"learning_rate": 0.0006634586403807758,
"loss": 0.0041,
"macro_f1": 1.0,
"num_tokens": 7258115.0,
"repeat_count": 3.0,
- "routers_loss": 0.0049721370451152325,
+ "routers_loss": 0.004906686954200268,
"skip_count": 2.0,
"step": 4500,
"text_loss": 0.577463686466217
@@ -42767,13 +42767,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05859375,
+ "grad_norm": 0.0927734375,
"learning_rate": 0.0006631660995804334,
"loss": 0.0067,
"macro_f1": 0.6601307392120361,
"num_tokens": 7260769.0,
"repeat_count": 1.0,
- "routers_loss": 0.01382436417043209,
+ "routers_loss": 0.013337121345102787,
"skip_count": 2.0,
"step": 4502,
"text_loss": 0.37124839425086975
@@ -42786,13 +42786,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0419921875,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0006628734962545339,
- "loss": 0.0083,
+ "loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 7263908.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024676774628460407,
+ "routers_loss": 0.0023418180644512177,
"skip_count": 0.0,
"step": 4504,
"text_loss": 0.17937727272510529
@@ -42805,13 +42805,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03466796875,
+ "grad_norm": 0.040771484375,
"learning_rate": 0.0006625808305152033,
- "loss": 0.0067,
+ "loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 7267391.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006373177748173475,
+ "routers_loss": 0.0006556165171787143,
"skip_count": 0.0,
"step": 4506,
"text_loss": 0.45344987511634827
@@ -42824,13 +42824,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025146484375,
+ "grad_norm": 0.02734375,
"learning_rate": 0.0006622881024745919,
- "loss": 0.0044,
+ "loss": 0.0042,
"macro_f1": 0.3333333432674408,
"num_tokens": 7271402.0,
"repeat_count": 0.0,
- "routers_loss": 0.002280580811202526,
+ "routers_loss": 0.0021988123189657927,
"skip_count": 0.0,
"step": 4508,
"text_loss": 0.5842905640602112
@@ -42843,13 +42843,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02783203125,
+ "grad_norm": 0.029052734375,
"learning_rate": 0.0006619953122448734,
"loss": 0.0075,
"macro_f1": 0.6666666865348816,
"num_tokens": 7274354.0,
"repeat_count": 0.0,
- "routers_loss": 0.007782169617712498,
+ "routers_loss": 0.00774174090474844,
"skip_count": 2.0,
"step": 4510,
"text_loss": 0.27159228920936584
@@ -42862,13 +42862,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03955078125,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.0006617024599382456,
"loss": 0.0058,
"macro_f1": 0.3333333432674408,
"num_tokens": 7277378.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007145124254748225,
+ "routers_loss": 0.0006942499312572181,
"skip_count": 0.0,
"step": 4512,
"text_loss": 0.4464176297187805
@@ -42881,13 +42881,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.039794921875,
"learning_rate": 0.0006614095456669302,
- "loss": 0.0049,
+ "loss": 0.0048,
"macro_f1": 0.3333333432674408,
"num_tokens": 7280526.0,
"repeat_count": 0.0,
- "routers_loss": 0.0031456330325454473,
+ "routers_loss": 0.003003394464030862,
"skip_count": 0.0,
"step": 4514,
"text_loss": 0.31188079714775085
@@ -42900,13 +42900,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.024658203125,
+ "grad_norm": 0.028076171875,
"learning_rate": 0.0006611165695431725,
- "loss": 0.0067,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 7283916.0,
"repeat_count": 0.0,
- "routers_loss": 0.000815888459328562,
+ "routers_loss": 0.0006948060472495854,
"skip_count": 0.0,
"step": 4516,
"text_loss": 0.5266574025154114
@@ -42919,13 +42919,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.042236328125,
"learning_rate": 0.0006608235316792413,
- "loss": 0.0064,
+ "loss": 0.0063,
"macro_f1": 0.3333333432674408,
"num_tokens": 7286843.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015030937502160668,
+ "routers_loss": 0.0014080886030569673,
"skip_count": 0.0,
"step": 4518,
"text_loss": 0.5880120396614075
@@ -42938,13 +42938,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0283203125,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0006605304321874295,
- "loss": 0.007,
+ "loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 7289940.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017453476320952177,
+ "routers_loss": 0.0016894340515136719,
"skip_count": 0.0,
"step": 4520,
"text_loss": 0.6623797416687012
@@ -42957,13 +42957,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.024169921875,
+ "grad_norm": 0.0228271484375,
"learning_rate": 0.0006602372711800531,
- "loss": 0.0045,
+ "loss": 0.0044,
"macro_f1": 0.3333333432674408,
"num_tokens": 7292869.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035615740343928337,
+ "routers_loss": 0.003522444050759077,
"skip_count": 0.0,
"step": 4522,
"text_loss": 0.5488807559013367
@@ -42976,13 +42976,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0234375,
+ "grad_norm": 0.0240478515625,
"learning_rate": 0.0006599440487694521,
- "loss": 0.0068,
+ "loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 7296618.0,
"repeat_count": 0.0,
- "routers_loss": 0.001281693228520453,
+ "routers_loss": 0.0011981099378317595,
"skip_count": 0.0,
"step": 4524,
"text_loss": 0.4128517210483551
@@ -42995,13 +42995,13 @@
"f1_execute": 0.978723406791687,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
- "grad_norm": 0.02587890625,
+ "grad_norm": 0.0269775390625,
"learning_rate": 0.00065965076506799,
- "loss": 0.0048,
+ "loss": 0.0047,
"macro_f1": 0.9262410998344421,
"num_tokens": 7300481.0,
"repeat_count": 3.0,
- "routers_loss": 0.011079956777393818,
+ "routers_loss": 0.010548194870352745,
"skip_count": 2.0,
"step": 4526,
"text_loss": 0.26450902223587036
@@ -43014,13 +43014,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02685546875,
+ "grad_norm": 0.028076171875,
"learning_rate": 0.0006593574201880536,
- "loss": 0.0062,
+ "loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 7303272.0,
"repeat_count": 0.0,
- "routers_loss": 0.005837739445269108,
+ "routers_loss": 0.005642973352223635,
"skip_count": 1.0,
"step": 4528,
"text_loss": 0.35269856452941895
@@ -43033,13 +43033,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032958984375,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.000659064014242053,
- "loss": 0.0046,
+ "loss": 0.0043,
"macro_f1": 0.6666666865348816,
"num_tokens": 7306615.0,
"repeat_count": 0.0,
- "routers_loss": 0.004657972138375044,
+ "routers_loss": 0.004171932581812143,
"skip_count": 1.0,
"step": 4530,
"text_loss": 0.18814080953598022
@@ -43052,13 +43052,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0286865234375,
+ "grad_norm": 0.02734375,
"learning_rate": 0.0006587705473424223,
- "loss": 0.0072,
+ "loss": 0.0071,
"macro_f1": 0.6666666865348816,
"num_tokens": 7310368.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025366253685206175,
+ "routers_loss": 0.002289367141202092,
"skip_count": 2.0,
"step": 4532,
"text_loss": 0.7363705635070801
@@ -43071,13 +43071,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.000658477019601618,
- "loss": 0.0072,
+ "loss": 0.0071,
"macro_f1": 0.6666666865348816,
"num_tokens": 7313788.0,
"repeat_count": 0.0,
- "routers_loss": 0.005018982570618391,
+ "routers_loss": 0.004440625663846731,
"skip_count": 1.0,
"step": 4534,
"text_loss": 0.8126176595687866
@@ -43090,13 +43090,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.029296875,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0006581834311321211,
- "loss": 0.0085,
+ "loss": 0.0086,
"macro_f1": 0.6666666865348816,
"num_tokens": 7317864.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013490618439391255,
+ "routers_loss": 0.0013160990783944726,
"skip_count": 2.0,
"step": 4536,
"text_loss": 0.7015916109085083
@@ -43109,32 +43109,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0390625,
+ "grad_norm": 0.04736328125,
"learning_rate": 0.000657889782046435,
- "loss": 0.0062,
+ "loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 7320693.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035190414637327194,
+ "routers_loss": 0.0032275544945150614,
"skip_count": 2.0,
"step": 4538,
"text_loss": 0.6481677293777466
},
{
"acc_repeat": 0.0,
- "acc_skip": 1.0,
- "avg_layers": 27.0,
+ "acc_skip": 0.0,
+ "avg_layers": 28.0,
"epoch": 21.314646316407398,
- "f1_execute": 1.0,
+ "f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
- "f1_skip": 1.0,
- "grad_norm": 0.0223388671875,
+ "f1_skip": 0.0,
+ "grad_norm": 0.0230712890625,
"learning_rate": 0.0006575960724570865,
- "loss": 0.0055,
- "macro_f1": 0.6666666865348816,
+ "loss": 0.0054,
+ "macro_f1": 0.3272727429866791,
"num_tokens": 7324335.0,
"repeat_count": 0.0,
- "routers_loss": 0.007447404786944389,
+ "routers_loss": 0.009769129566848278,
"skip_count": 1.0,
"step": 4540,
"text_loss": 0.22194676101207733
@@ -43147,13 +43147,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04248046875,
+ "grad_norm": 0.042724609375,
"learning_rate": 0.0006573023024766258,
- "loss": 0.0062,
+ "loss": 0.0061,
"macro_f1": 1.0,
"num_tokens": 7327431.0,
"repeat_count": 2.0,
- "routers_loss": 0.0030924465972930193,
+ "routers_loss": 0.0036973082460463047,
"skip_count": 4.0,
"step": 4542,
"text_loss": 0.475127637386322
@@ -43166,13 +43166,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.0361328125,
"learning_rate": 0.000657008472217626,
- "loss": 0.0061,
+ "loss": 0.0059,
"macro_f1": 0.3333333432674408,
"num_tokens": 7330262.0,
"repeat_count": 0.0,
- "routers_loss": 0.000717726768925786,
+ "routers_loss": 0.0007046440150588751,
"skip_count": 0.0,
"step": 4544,
"text_loss": 0.2649917006492615
@@ -43185,13 +43185,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044921875,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0006567145817926836,
- "loss": 0.0088,
+ "loss": 0.0087,
"macro_f1": 0.3333333432674408,
"num_tokens": 7333110.0,
"repeat_count": 0.0,
- "routers_loss": 0.0029236951377242804,
+ "routers_loss": 0.0026714997366070747,
"skip_count": 0.0,
"step": 4546,
"text_loss": 0.5490524768829346
@@ -43204,13 +43204,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.043701171875,
+ "grad_norm": 0.062255859375,
"learning_rate": 0.0006564206313144175,
"loss": 0.0075,
"macro_f1": 0.3333333432674408,
"num_tokens": 7336101.0,
"repeat_count": 0.0,
- "routers_loss": 0.006141145247966051,
+ "routers_loss": 0.006552211008965969,
"skip_count": 0.0,
"step": 4548,
"text_loss": 0.14098678529262543
@@ -43223,13 +43223,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0301513671875,
+ "grad_norm": 0.02978515625,
"learning_rate": 0.0006561266208954707,
- "loss": 0.0069,
+ "loss": 0.0068,
"macro_f1": 0.6666666865348816,
"num_tokens": 7339435.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035991708282381296,
+ "routers_loss": 0.0035560601390898228,
"skip_count": 2.0,
"step": 4550,
"text_loss": 0.20412275195121765
@@ -43242,13 +43242,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0238037109375,
+ "grad_norm": 0.0269775390625,
"learning_rate": 0.0006558325506485081,
"loss": 0.0057,
"macro_f1": 0.6666666865348816,
"num_tokens": 7342609.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024792153853923082,
+ "routers_loss": 0.0020106974989175797,
"skip_count": 1.0,
"step": 4552,
"text_loss": 0.6184256076812744
@@ -43263,11 +43263,11 @@
"f1_skip": 0.0,
"grad_norm": 0.050537109375,
"learning_rate": 0.0006555384206862183,
- "loss": 0.0091,
+ "loss": 0.009,
"macro_f1": 0.3333333432674408,
"num_tokens": 7345614.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014447715366259217,
+ "routers_loss": 0.0014235252747312188,
"skip_count": 0.0,
"step": 4554,
"text_loss": 1.0108838081359863
@@ -43280,13 +43280,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031005859375,
+ "grad_norm": 0.0302734375,
"learning_rate": 0.0006552442311213121,
- "loss": 0.0043,
+ "loss": 0.0041,
"macro_f1": 0.3272727429866791,
"num_tokens": 7348957.0,
"repeat_count": 1.0,
- "routers_loss": 0.02027573436498642,
+ "routers_loss": 0.01703745685517788,
"skip_count": 0.0,
"step": 4556,
"text_loss": 0.21315747499465942
@@ -43299,13 +43299,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.0263671875,
+ "grad_norm": 0.0269775390625,
"learning_rate": 0.0006549499820665237,
"loss": 0.0077,
"macro_f1": 0.5934640765190125,
"num_tokens": 7352724.0,
"repeat_count": 0.0,
- "routers_loss": 0.012388292700052261,
+ "routers_loss": 0.013315381482243538,
"skip_count": 3.0,
"step": 4558,
"text_loss": 0.34369465708732605
@@ -43318,13 +43318,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0308837890625,
+ "grad_norm": 0.033935546875,
"learning_rate": 0.00065465567363461,
- "loss": 0.0074,
+ "loss": 0.0072,
"macro_f1": 0.3333333432674408,
"num_tokens": 7356592.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011293066199868917,
+ "routers_loss": 0.0017354936571791768,
"skip_count": 0.0,
"step": 4560,
"text_loss": 0.6267461180686951
@@ -43337,13 +43337,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.037841796875,
"learning_rate": 0.0006543613059383503,
"loss": 0.0062,
"macro_f1": 0.6666666865348816,
"num_tokens": 7359774.0,
"repeat_count": 0.0,
- "routers_loss": 0.011833512224256992,
+ "routers_loss": 0.011646085418760777,
"skip_count": 2.0,
"step": 4562,
"text_loss": 0.4400193989276886
@@ -43356,13 +43356,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0277099609375,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0006540668790905471,
"loss": 0.0059,
"macro_f1": 0.3333333432674408,
"num_tokens": 7362765.0,
"repeat_count": 0.0,
- "routers_loss": 0.002059109043329954,
+ "routers_loss": 0.0019345436012372375,
"skip_count": 0.0,
"step": 4564,
"text_loss": 0.49204275012016296
@@ -43375,13 +43375,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.023681640625,
+ "grad_norm": 0.02685546875,
"learning_rate": 0.0006537723932040251,
"loss": 0.0048,
"macro_f1": 0.6666666865348816,
"num_tokens": 7366337.0,
"repeat_count": 0.0,
- "routers_loss": 0.005968277342617512,
+ "routers_loss": 0.00562885170802474,
"skip_count": 1.0,
"step": 4566,
"text_loss": 0.22566382586956024
@@ -43394,13 +43394,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.037841796875,
+ "grad_norm": 0.03515625,
"learning_rate": 0.0006534778483916319,
- "loss": 0.0087,
+ "loss": 0.0084,
"macro_f1": 1.0,
"num_tokens": 7369851.0,
"repeat_count": 2.0,
- "routers_loss": 0.005483719054609537,
+ "routers_loss": 0.005508176051080227,
"skip_count": 2.0,
"step": 4568,
"text_loss": 0.8057850003242493
@@ -43413,13 +43413,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0006531832447662377,
"loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 7373918.0,
"repeat_count": 0.0,
- "routers_loss": 0.006533551495522261,
+ "routers_loss": 0.006460923235863447,
"skip_count": 2.0,
"step": 4570,
"text_loss": 0.5141497254371643
@@ -43432,13 +43432,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04345703125,
+ "grad_norm": 0.042236328125,
"learning_rate": 0.0006528885824407351,
- "loss": 0.0083,
+ "loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 7376674.0,
"repeat_count": 0.0,
- "routers_loss": 0.003182383719831705,
+ "routers_loss": 0.0032120654359459877,
"skip_count": 0.0,
"step": 4572,
"text_loss": 0.1281338930130005
@@ -43451,13 +43451,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.054931640625,
+ "grad_norm": 0.052490234375,
"learning_rate": 0.0006525938615280394,
- "loss": 0.0118,
+ "loss": 0.0116,
"macro_f1": 0.3333333432674408,
"num_tokens": 7379791.0,
"repeat_count": 0.0,
- "routers_loss": 0.00441814586520195,
+ "routers_loss": 0.00443810923025012,
"skip_count": 0.0,
"step": 4574,
"text_loss": 0.268352210521698
@@ -43470,13 +43470,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0296630859375,
+ "grad_norm": 0.027587890625,
"learning_rate": 0.000652299082141088,
"loss": 0.0055,
"macro_f1": 0.6666666865348816,
"num_tokens": 7382886.0,
"repeat_count": 0.0,
- "routers_loss": 0.008390357717871666,
+ "routers_loss": 0.008284369483590126,
"skip_count": 2.0,
"step": 4576,
"text_loss": 0.30193832516670227
@@ -43489,13 +43489,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0006520042443928411,
- "loss": 0.0071,
+ "loss": 0.0068,
"macro_f1": 0.8823530077934265,
"num_tokens": 7386036.0,
"repeat_count": 2.0,
- "routers_loss": 0.03992438316345215,
+ "routers_loss": 0.03383317217230797,
"skip_count": 1.0,
"step": 4578,
"text_loss": 0.23106542229652405
@@ -43508,13 +43508,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.0419921875,
"learning_rate": 0.000651709348396281,
- "loss": 0.0053,
+ "loss": 0.0056,
"macro_f1": 0.6666666865348816,
"num_tokens": 7388908.0,
"repeat_count": 0.0,
- "routers_loss": 0.001781110418960452,
+ "routers_loss": 0.0017075951909646392,
"skip_count": 1.0,
"step": 4580,
"text_loss": 0.386099249124527
@@ -43527,13 +43527,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0296630859375,
+ "grad_norm": 0.031494140625,
"learning_rate": 0.0006514143942644124,
- "loss": 0.0049,
+ "loss": 0.0048,
"macro_f1": 0.6666666865348816,
"num_tokens": 7392004.0,
"repeat_count": 0.0,
- "routers_loss": 0.009884138591587543,
+ "routers_loss": 0.009516917169094086,
"skip_count": 1.0,
"step": 4582,
"text_loss": 0.3162059485912323
@@ -43546,13 +43546,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04736328125,
+ "grad_norm": 0.051513671875,
"learning_rate": 0.0006511193821102623,
- "loss": 0.0078,
+ "loss": 0.0076,
"macro_f1": 0.3333333432674408,
"num_tokens": 7395538.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032415634486824274,
+ "routers_loss": 0.0031392278615385294,
"skip_count": 0.0,
"step": 4584,
"text_loss": 0.5536221861839294
@@ -43565,13 +43565,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.06298828125,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0006508243120468799,
- "loss": 0.0054,
+ "loss": 0.0051,
"macro_f1": 0.3333333432674408,
"num_tokens": 7398461.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014925460563972592,
+ "routers_loss": 0.0014138511614874005,
"skip_count": 0.0,
"step": 4586,
"text_loss": 0.7934318780899048
@@ -43584,13 +43584,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0242919921875,
+ "grad_norm": 0.0224609375,
"learning_rate": 0.0006505291841873367,
- "loss": 0.0053,
+ "loss": 0.0052,
"macro_f1": 0.3333333432674408,
"num_tokens": 7401611.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005577150150202215,
+ "routers_loss": 0.0005265916115604341,
"skip_count": 0.0,
"step": 4588,
"text_loss": 0.4569905698299408
@@ -43609,7 +43609,7 @@
"macro_f1": 0.3333333432674408,
"num_tokens": 7404641.0,
"repeat_count": 0.0,
- "routers_loss": 0.0023784362711012363,
+ "routers_loss": 0.0024988956283777952,
"skip_count": 0.0,
"step": 4590,
"text_loss": 0.49998772144317627
@@ -43622,13 +43622,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0235595703125,
+ "grad_norm": 0.025634765625,
"learning_rate": 0.0006499387555321636,
"loss": 0.0054,
"macro_f1": 0.6666666865348816,
"num_tokens": 7407574.0,
"repeat_count": 0.0,
- "routers_loss": 0.004376447293907404,
+ "routers_loss": 0.004110113717615604,
"skip_count": 1.0,
"step": 4592,
"text_loss": 0.5679413676261902
@@ -43641,13 +43641,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03271484375,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0006496434549627874,
- "loss": 0.0069,
+ "loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 7410806.0,
"repeat_count": 0.0,
- "routers_loss": 0.0032524678390473127,
+ "routers_loss": 0.0032845588866621256,
"skip_count": 0.0,
"step": 4594,
"text_loss": 0.35515281558036804
@@ -43660,13 +43660,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.036865234375,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0006493480970497568,
"loss": 0.0052,
"macro_f1": 0.6666666865348816,
"num_tokens": 7413402.0,
"repeat_count": 0.0,
- "routers_loss": 0.009982835501432419,
+ "routers_loss": 0.010577172972261906,
"skip_count": 1.0,
"step": 4596,
"text_loss": 0.26111698150634766
@@ -43679,13 +43679,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.03662109375,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.0006490526819062537,
- "loss": 0.0093,
+ "loss": 0.0091,
"macro_f1": 1.0,
"num_tokens": 7417236.0,
"repeat_count": 1.0,
- "routers_loss": 0.002379048615694046,
+ "routers_loss": 0.002054794691503048,
"skip_count": 2.0,
"step": 4598,
"text_loss": 0.6480993628501892
@@ -43698,13 +43698,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07763671875,
+ "grad_norm": 0.07958984375,
"learning_rate": 0.0006487572096454818,
"loss": 0.0072,
"macro_f1": 0.3333333432674408,
"num_tokens": 7420278.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017828276613727212,
+ "routers_loss": 0.0017989084590226412,
"skip_count": 0.0,
"step": 4600,
"text_loss": 0.4935401678085327
@@ -43717,13 +43717,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0006484616803806665,
- "loss": 0.0058,
+ "loss": 0.0056,
"macro_f1": 0.6666666865348816,
"num_tokens": 7423866.0,
"repeat_count": 0.0,
- "routers_loss": 0.007584894075989723,
+ "routers_loss": 0.006671485956758261,
"skip_count": 1.0,
"step": 4602,
"text_loss": 0.15030258893966675
@@ -43736,13 +43736,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.0311279296875,
"learning_rate": 0.0006481660942250552,
- "loss": 0.0054,
+ "loss": 0.0052,
"macro_f1": 0.6666666865348816,
"num_tokens": 7426884.0,
"repeat_count": 0.0,
- "routers_loss": 0.008093188516795635,
+ "routers_loss": 0.008334980346262455,
"skip_count": 3.0,
"step": 4604,
"text_loss": 0.29933279752731323
@@ -43755,13 +43755,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.030029296875,
+ "grad_norm": 0.03125,
"learning_rate": 0.0006478704512919173,
"loss": 0.0077,
"macro_f1": 0.6666666865348816,
"num_tokens": 7431017.0,
"repeat_count": 0.0,
- "routers_loss": 0.012283207848668098,
+ "routers_loss": 0.011923984624445438,
"skip_count": 3.0,
"step": 4606,
"text_loss": 0.35141825675964355
@@ -43774,13 +43774,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0267333984375,
+ "grad_norm": 0.0279541015625,
"learning_rate": 0.0006475747516945432,
"loss": 0.0056,
"macro_f1": 0.6666666865348816,
"num_tokens": 7434406.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035103289410471916,
+ "routers_loss": 0.0031092462595552206,
"skip_count": 3.0,
"step": 4608,
"text_loss": 0.21021464467048645
@@ -43793,13 +43793,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0279541015625,
+ "grad_norm": 0.02978515625,
"learning_rate": 0.000647278995546246,
- "loss": 0.0058,
+ "loss": 0.0057,
"macro_f1": 0.6666666865348816,
"num_tokens": 7437204.0,
"repeat_count": 1.0,
- "routers_loss": 0.0006666383123956621,
+ "routers_loss": 0.0006713552866131067,
"skip_count": 0.0,
"step": 4610,
"text_loss": 0.4052635431289673
@@ -43812,13 +43812,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0006469831829603598,
"loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 7439741.0,
"repeat_count": 0.0,
- "routers_loss": 0.0028148891869932413,
+ "routers_loss": 0.0022583482787013054,
"skip_count": 2.0,
"step": 4612,
"text_loss": 0.5443860292434692
@@ -43831,13 +43831,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04296875,
+ "grad_norm": 0.044677734375,
"learning_rate": 0.0006466873140502407,
- "loss": 0.0074,
+ "loss": 0.0073,
"macro_f1": 0.6666666865348816,
"num_tokens": 7443619.0,
"repeat_count": 0.0,
- "routers_loss": 0.0037154473830014467,
+ "routers_loss": 0.004187075886875391,
"skip_count": 2.0,
"step": 4614,
"text_loss": 0.30709847807884216
@@ -43850,13 +43850,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.019775390625,
+ "grad_norm": 0.0194091796875,
"learning_rate": 0.0006463913889292661,
"loss": 0.0075,
"macro_f1": 0.3333333432674408,
"num_tokens": 7446696.0,
"repeat_count": 0.0,
- "routers_loss": 0.007844357751309872,
+ "routers_loss": 0.008314833045005798,
"skip_count": 0.0,
"step": 4616,
"text_loss": 0.22949637472629547
@@ -43869,13 +43869,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0250244140625,
+ "grad_norm": 0.02685546875,
"learning_rate": 0.0006460954077108353,
- "loss": 0.0047,
+ "loss": 0.0048,
"macro_f1": 0.3333333432674408,
"num_tokens": 7450377.0,
"repeat_count": 0.0,
- "routers_loss": 0.001379768829792738,
+ "routers_loss": 0.001277514616958797,
"skip_count": 0.0,
"step": 4618,
"text_loss": 0.37715134024620056
@@ -43888,13 +43888,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0289306640625,
+ "grad_norm": 0.02734375,
"learning_rate": 0.0006457993705083684,
- "loss": 0.0051,
+ "loss": 0.005,
"macro_f1": 0.6666666865348816,
"num_tokens": 7453271.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019801959861069918,
+ "routers_loss": 0.0022756033577024937,
"skip_count": 2.0,
"step": 4620,
"text_loss": 0.7373883128166199
@@ -43907,13 +43907,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.019775390625,
+ "grad_norm": 0.02099609375,
"learning_rate": 0.0006455032774353078,
"loss": 0.0062,
"macro_f1": 0.6666666865348816,
"num_tokens": 7456492.0,
"repeat_count": 0.0,
- "routers_loss": 0.0038891383446753025,
+ "routers_loss": 0.0039057908579707146,
"skip_count": 2.0,
"step": 4622,
"text_loss": 0.5058769583702087
@@ -43926,13 +43926,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0196533203125,
+ "grad_norm": 0.0203857421875,
"learning_rate": 0.0006452071286051169,
"loss": 0.0039,
"macro_f1": 0.3333333432674408,
"num_tokens": 7459619.0,
"repeat_count": 0.0,
- "routers_loss": 0.001924185431562364,
+ "routers_loss": 0.0019458672031760216,
"skip_count": 0.0,
"step": 4624,
"text_loss": 0.5110082030296326
@@ -43945,13 +43945,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.033447265625,
"learning_rate": 0.0006449109241312802,
- "loss": 0.0059,
+ "loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 7462552.0,
"repeat_count": 0.0,
- "routers_loss": 0.000527520664036274,
+ "routers_loss": 0.0002716891176532954,
"skip_count": 1.0,
"step": 4626,
"text_loss": 0.6197522878646851
@@ -43964,13 +43964,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.05126953125,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0006446146641273042,
- "loss": 0.0063,
+ "loss": 0.0061,
"macro_f1": 0.6666666865348816,
"num_tokens": 7466769.0,
"repeat_count": 0.0,
- "routers_loss": 0.004048905335366726,
+ "routers_loss": 0.0037578947376459837,
"skip_count": 2.0,
"step": 4628,
"text_loss": 0.1653924286365509
@@ -43983,13 +43983,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0228271484375,
+ "grad_norm": 0.022705078125,
"learning_rate": 0.000644318348706716,
- "loss": 0.0074,
+ "loss": 0.0072,
"macro_f1": 0.3333333432674408,
"num_tokens": 7470216.0,
"repeat_count": 0.0,
- "routers_loss": 0.001336342073045671,
+ "routers_loss": 0.0012791058979928493,
"skip_count": 0.0,
"step": 4630,
"text_loss": 0.7114694118499756
@@ -44002,13 +44002,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.0006440219779830643,
- "loss": 0.0076,
+ "loss": 0.0075,
"macro_f1": 0.6666666865348816,
"num_tokens": 7472975.0,
"repeat_count": 0.0,
- "routers_loss": 0.007155329454690218,
+ "routers_loss": 0.00736592011526227,
"skip_count": 2.0,
"step": 4632,
"text_loss": 0.26601463556289673
@@ -44021,13 +44021,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.000643725552069919,
- "loss": 0.0071,
+ "loss": 0.0073,
"macro_f1": 0.3333333432674408,
"num_tokens": 7475672.0,
"repeat_count": 0.0,
- "routers_loss": 0.0004819786408916116,
+ "routers_loss": 0.00045455715735442936,
"skip_count": 0.0,
"step": 4634,
"text_loss": 0.5028402805328369
@@ -44040,13 +44040,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0225830078125,
+ "grad_norm": 0.022705078125,
"learning_rate": 0.0006434290710808711,
- "loss": 0.0055,
+ "loss": 0.0054,
"macro_f1": 0.6666666865348816,
"num_tokens": 7478850.0,
"repeat_count": 0.0,
- "routers_loss": 0.004355283919721842,
+ "routers_loss": 0.004247233271598816,
"skip_count": 2.0,
"step": 4636,
"text_loss": 0.12746070325374603
@@ -44059,13 +44059,13 @@
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.04052734375,
"learning_rate": 0.0006431325351295324,
"loss": 0.0083,
"macro_f1": 0.5427350401878357,
"num_tokens": 7481747.0,
"repeat_count": 1.0,
- "routers_loss": 0.04843593016266823,
+ "routers_loss": 0.047564394772052765,
"skip_count": 2.0,
"step": 4638,
"text_loss": 0.24056802690029144
@@ -44078,13 +44078,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0006428359443295362,
"loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 7484885.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010549267753958702,
+ "routers_loss": 0.0011175100225955248,
"skip_count": 0.0,
"step": 4640,
"text_loss": 0.6265338063240051
@@ -44097,13 +44097,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0361328125,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0006425392987945369,
"loss": 0.0086,
"macro_f1": 0.5492662787437439,
"num_tokens": 7487973.0,
"repeat_count": 0.0,
- "routers_loss": 0.016608718782663345,
+ "routers_loss": 0.016879938542842865,
"skip_count": 2.0,
"step": 4642,
"text_loss": 0.2523447275161743
@@ -44116,13 +44116,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.032958984375,
"learning_rate": 0.0006422425986382093,
"loss": 0.0055,
"macro_f1": 0.5934640765190125,
"num_tokens": 7491024.0,
"repeat_count": 0.0,
- "routers_loss": 0.01848086155951023,
+ "routers_loss": 0.018616504967212677,
"skip_count": 3.0,
"step": 4644,
"text_loss": 0.38890624046325684
@@ -44135,13 +44135,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.038818359375,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0006419458439742496,
"loss": 0.0056,
"macro_f1": 0.3272727429866791,
"num_tokens": 7494199.0,
"repeat_count": 0.0,
- "routers_loss": 0.022435056045651436,
+ "routers_loss": 0.023129139095544815,
"skip_count": 1.0,
"step": 4646,
"text_loss": 0.4060848355293274
@@ -44154,13 +44154,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0006416490349163747,
- "loss": 0.0083,
+ "loss": 0.0086,
"macro_f1": 0.3333333432674408,
"num_tokens": 7497287.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018073184182867408,
+ "routers_loss": 0.0018601802876219153,
"skip_count": 0.0,
"step": 4648,
"text_loss": 0.3387545943260193
@@ -44173,13 +44173,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0006413521715783225,
- "loss": 0.0078,
+ "loss": 0.0079,
"macro_f1": 0.3333333432674408,
"num_tokens": 7500598.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017208937788382173,
+ "routers_loss": 0.0017482215771451592,
"skip_count": 0.0,
"step": 4650,
"text_loss": 0.4290996193885803
@@ -44192,13 +44192,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04052734375,
+ "grad_norm": 0.040771484375,
"learning_rate": 0.0006410552540738514,
- "loss": 0.0071,
+ "loss": 0.007,
"macro_f1": 0.3272727429866791,
"num_tokens": 7503252.0,
"repeat_count": 1.0,
- "routers_loss": 0.04149872064590454,
+ "routers_loss": 0.0420118011534214,
"skip_count": 0.0,
"step": 4652,
"text_loss": 0.439496248960495
@@ -44211,13 +44211,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.028076171875,
+ "grad_norm": 0.026611328125,
"learning_rate": 0.000640758282516741,
- "loss": 0.0057,
+ "loss": 0.0055,
"macro_f1": 1.0,
"num_tokens": 7506382.0,
"repeat_count": 1.0,
- "routers_loss": 0.002120798220857978,
+ "routers_loss": 0.0017782216891646385,
"skip_count": 1.0,
"step": 4654,
"text_loss": 0.8513308167457581
@@ -44230,13 +44230,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0006404612570207911,
- "loss": 0.0103,
+ "loss": 0.0102,
"macro_f1": 0.3272727429866791,
"num_tokens": 7510423.0,
"repeat_count": 0.0,
- "routers_loss": 0.009855805896222591,
+ "routers_loss": 0.010385853238403797,
"skip_count": 0.0,
"step": 4656,
"text_loss": 0.7159742712974548
@@ -44249,13 +44249,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03369140625,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0006401641776998223,
- "loss": 0.0047,
+ "loss": 0.0046,
"macro_f1": 0.3333333432674408,
"num_tokens": 7513394.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011576786637306213,
+ "routers_loss": 0.0011917101219296455,
"skip_count": 0.0,
"step": 4658,
"text_loss": 0.6165401339530945
@@ -44268,13 +44268,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0006398670446676766,
"loss": 0.007,
"macro_f1": 1.0,
"num_tokens": 7516828.0,
"repeat_count": 3.0,
- "routers_loss": 0.008810436353087425,
+ "routers_loss": 0.008860073052346706,
"skip_count": 4.0,
"step": 4660,
"text_loss": 0.923275887966156
@@ -44287,13 +44287,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.042236328125,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0006395698580382153,
- "loss": 0.0065,
+ "loss": 0.0063,
"macro_f1": 0.3333333432674408,
"num_tokens": 7519764.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005982713773846626,
+ "routers_loss": 0.000505418807733804,
"skip_count": 0.0,
"step": 4662,
"text_loss": 0.6143050789833069
@@ -44306,13 +44306,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04150390625,
+ "grad_norm": 0.0439453125,
"learning_rate": 0.0006392726179253212,
- "loss": 0.0047,
+ "loss": 0.0049,
"macro_f1": 0.6666666865348816,
"num_tokens": 7522390.0,
"repeat_count": 0.0,
- "routers_loss": 0.004173434805124998,
+ "routers_loss": 0.004020806401968002,
"skip_count": 1.0,
"step": 4664,
"text_loss": 0.6935067176818848
@@ -44325,13 +44325,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.04052734375,
+ "grad_norm": 0.052001953125,
"learning_rate": 0.0006389753244428972,
- "loss": 0.0078,
+ "loss": 0.0079,
"macro_f1": 1.0,
"num_tokens": 7525821.0,
"repeat_count": 1.0,
- "routers_loss": 0.008930242620408535,
+ "routers_loss": 0.00957963801920414,
"skip_count": 2.0,
"step": 4666,
"text_loss": 0.3350338637828827
@@ -44344,13 +44344,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.04296875,
+ "grad_norm": 0.039794921875,
"learning_rate": 0.0006386779777048666,
- "loss": 0.0066,
+ "loss": 0.0063,
"macro_f1": 0.6601307392120361,
"num_tokens": 7529513.0,
"repeat_count": 1.0,
- "routers_loss": 0.02444119192659855,
+ "routers_loss": 0.020673364400863647,
"skip_count": 2.0,
"step": 4668,
"text_loss": 0.47800472378730774
@@ -44363,13 +44363,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0255126953125,
+ "grad_norm": 0.0257568359375,
"learning_rate": 0.0006383805778251735,
- "loss": 0.005,
+ "loss": 0.0048,
"macro_f1": 0.6666666865348816,
"num_tokens": 7533450.0,
"repeat_count": 0.0,
- "routers_loss": 0.007665765006095171,
+ "routers_loss": 0.007217096630483866,
"skip_count": 1.0,
"step": 4670,
"text_loss": 0.4506106972694397
@@ -44382,13 +44382,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0284423828125,
+ "grad_norm": 0.0257568359375,
"learning_rate": 0.0006380831249177817,
"loss": 0.0039,
"macro_f1": 0.6666666865348816,
"num_tokens": 7536287.0,
"repeat_count": 1.0,
- "routers_loss": 0.008599632419645786,
+ "routers_loss": 0.007001714315265417,
"skip_count": 0.0,
"step": 4672,
"text_loss": 0.4081715941429138
@@ -44401,13 +44401,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0250244140625,
+ "grad_norm": 0.027587890625,
"learning_rate": 0.0006377856190966762,
- "loss": 0.0055,
+ "loss": 0.0054,
"macro_f1": 0.3333333432674408,
"num_tokens": 7539442.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014951099874451756,
+ "routers_loss": 0.0015112817054614425,
"skip_count": 0.0,
"step": 4674,
"text_loss": 0.21451139450073242
@@ -44420,13 +44420,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0006374880604758615,
- "loss": 0.0086,
+ "loss": 0.0083,
"macro_f1": 0.6666666865348816,
"num_tokens": 7542594.0,
"repeat_count": 0.0,
- "routers_loss": 0.00817523431032896,
+ "routers_loss": 0.007311929017305374,
"skip_count": 2.0,
"step": 4676,
"text_loss": 0.14785248041152954
@@ -44439,13 +44439,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.0306396484375,
"learning_rate": 0.0006371904491693626,
- "loss": 0.0052,
+ "loss": 0.0053,
"macro_f1": 0.6666666865348816,
"num_tokens": 7545780.0,
"repeat_count": 0.0,
- "routers_loss": 0.007712447550147772,
+ "routers_loss": 0.007489737123250961,
"skip_count": 1.0,
"step": 4678,
"text_loss": 0.2248108983039856
@@ -44458,13 +44458,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.03466796875,
+ "grad_norm": 0.031494140625,
"learning_rate": 0.0006368927852912247,
- "loss": 0.0059,
+ "loss": 0.0057,
"macro_f1": 1.0,
"num_tokens": 7548287.0,
"repeat_count": 1.0,
- "routers_loss": 0.010472464375197887,
+ "routers_loss": 0.009772555902600288,
"skip_count": 1.0,
"step": 4680,
"text_loss": 0.1566995233297348
@@ -44477,13 +44477,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.0006365950689555133,
- "loss": 0.0065,
+ "loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 7551424.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019706315360963345,
+ "routers_loss": 0.002134992741048336,
"skip_count": 0.0,
"step": 4682,
"text_loss": 0.7322417497634888
@@ -44496,13 +44496,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0257568359375,
+ "grad_norm": 0.0240478515625,
"learning_rate": 0.0006362973002763139,
- "loss": 0.0071,
+ "loss": 0.007,
"macro_f1": 1.0,
"num_tokens": 7554182.0,
"repeat_count": 1.0,
- "routers_loss": 0.0077865333296358585,
+ "routers_loss": 0.008511497639119625,
"skip_count": 4.0,
"step": 4684,
"text_loss": 0.24387991428375244
@@ -44515,13 +44515,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.046875,
+ "grad_norm": 0.04931640625,
"learning_rate": 0.0006359994793677319,
"loss": 0.0095,
"macro_f1": 0.6666666865348816,
"num_tokens": 7557044.0,
"repeat_count": 0.0,
- "routers_loss": 0.004420961253345013,
+ "routers_loss": 0.004151526838541031,
"skip_count": 2.0,
"step": 4686,
"text_loss": 0.6139411330223083
@@ -44534,13 +44534,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0233154296875,
+ "grad_norm": 0.0228271484375,
"learning_rate": 0.0006357016063438928,
- "loss": 0.0047,
+ "loss": 0.0046,
"macro_f1": 0.3333333432674408,
"num_tokens": 7560231.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011308451648801565,
+ "routers_loss": 0.0009724601986818016,
"skip_count": 0.0,
"step": 4688,
"text_loss": 0.7875718474388123
@@ -44553,13 +44553,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03271484375,
+ "grad_norm": 0.0308837890625,
"learning_rate": 0.0006354036813189421,
"loss": 0.0053,
"macro_f1": 0.3333333432674408,
"num_tokens": 7562953.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008846965502016246,
+ "routers_loss": 0.0008926765876822174,
"skip_count": 0.0,
"step": 4690,
"text_loss": 0.5195512771606445
@@ -44572,13 +44572,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.03759765625,
"learning_rate": 0.0006351057044070455,
"loss": 0.0078,
"macro_f1": 0.3333333432674408,
"num_tokens": 7566137.0,
"repeat_count": 0.0,
- "routers_loss": 0.003313175868242979,
+ "routers_loss": 0.0031294538639485836,
"skip_count": 0.0,
"step": 4692,
"text_loss": 0.7288873195648193
@@ -44591,13 +44591,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.026123046875,
"learning_rate": 0.0006348076757223877,
- "loss": 0.004,
+ "loss": 0.0038,
"macro_f1": 0.6666666865348816,
"num_tokens": 7569073.0,
"repeat_count": 0.0,
- "routers_loss": 0.0016258886316791177,
+ "routers_loss": 0.0015065820189192891,
"skip_count": 2.0,
"step": 4694,
"text_loss": 0.7242236137390137
@@ -44610,13 +44610,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0218505859375,
+ "grad_norm": 0.0235595703125,
"learning_rate": 0.0006345095953791746,
- "loss": 0.0075,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 7573025.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005521657876670361,
+ "routers_loss": 0.0005603441968560219,
"skip_count": 0.0,
"step": 4696,
"text_loss": 0.34443899989128113
@@ -44629,13 +44629,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.023681640625,
+ "grad_norm": 0.02490234375,
"learning_rate": 0.0006342114634916307,
"loss": 0.0068,
"macro_f1": 0.3333333432674408,
"num_tokens": 7576546.0,
"repeat_count": 0.0,
- "routers_loss": 0.0011082915589213371,
+ "routers_loss": 0.0011047758162021637,
"skip_count": 0.0,
"step": 4698,
"text_loss": 0.4892682731151581
@@ -44648,13 +44648,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0218505859375,
+ "grad_norm": 0.02490234375,
"learning_rate": 0.0006339132801740008,
"loss": 0.0076,
"macro_f1": 0.6666666865348816,
"num_tokens": 7580711.0,
"repeat_count": 0.0,
- "routers_loss": 0.001985425828024745,
+ "routers_loss": 0.0019803126342594624,
"skip_count": 2.0,
"step": 4700,
"text_loss": 0.4479489028453827
@@ -44667,13 +44667,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.037353515625,
+ "grad_norm": 0.0458984375,
"learning_rate": 0.0006336150455405494,
"loss": 0.0067,
"macro_f1": 0.6666666865348816,
"num_tokens": 7583385.0,
"repeat_count": 1.0,
- "routers_loss": 0.0005365543183870614,
+ "routers_loss": 0.0005326359532773495,
"skip_count": 0.0,
"step": 4702,
"text_loss": 0.627504825592041
@@ -44686,13 +44686,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.021240234375,
+ "grad_norm": 0.0194091796875,
"learning_rate": 0.0006333167597055604,
- "loss": 0.0037,
+ "loss": 0.0035,
"macro_f1": 0.3333333432674408,
"num_tokens": 7586584.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006241816445253789,
+ "routers_loss": 0.0005587987834587693,
"skip_count": 0.0,
"step": 4704,
"text_loss": 0.43891432881355286
@@ -44705,13 +44705,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.0263671875,
"learning_rate": 0.0006330184227833376,
- "loss": 0.0045,
+ "loss": 0.0044,
"macro_f1": 0.6666666865348816,
"num_tokens": 7590408.0,
"repeat_count": 0.0,
- "routers_loss": 0.00726567255333066,
+ "routers_loss": 0.007053783163428307,
"skip_count": 2.0,
"step": 4706,
"text_loss": 0.19946859776973724
@@ -44724,13 +44724,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0224609375,
+ "grad_norm": 0.0228271484375,
"learning_rate": 0.0006327200348882043,
- "loss": 0.0047,
+ "loss": 0.0045,
"macro_f1": 0.6666666865348816,
"num_tokens": 7593857.0,
"repeat_count": 1.0,
- "routers_loss": 0.0011741123162209988,
+ "routers_loss": 0.0009479080326855183,
"skip_count": 0.0,
"step": 4708,
"text_loss": 0.7973214387893677
@@ -44743,13 +44743,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07177734375,
+ "grad_norm": 0.1259765625,
"learning_rate": 0.0006324215961345032,
- "loss": 0.0057,
+ "loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 7596429.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012845906894654036,
+ "routers_loss": 0.0012403312139213085,
"skip_count": 0.0,
"step": 4710,
"text_loss": 0.48477989435195923
@@ -44762,13 +44762,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.032470703125,
+ "grad_norm": 0.03515625,
"learning_rate": 0.0006321231066365966,
- "loss": 0.0069,
+ "loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 7599618.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005659137386828661,
+ "routers_loss": 0.0005520360427908599,
"skip_count": 0.0,
"step": 4712,
"text_loss": 0.44222453236579895
@@ -44781,13 +44781,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.039306640625,
+ "grad_norm": 0.04150390625,
"learning_rate": 0.0006318245665088665,
"loss": 0.0077,
"macro_f1": 0.3333333432674408,
"num_tokens": 7603180.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018121730536222458,
+ "routers_loss": 0.0015553623670712113,
"skip_count": 0.0,
"step": 4714,
"text_loss": 0.5132410526275635
@@ -44800,13 +44800,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02734375,
+ "grad_norm": 0.027587890625,
"learning_rate": 0.0006315259758657138,
- "loss": 0.0049,
+ "loss": 0.0047,
"macro_f1": 0.6666666865348816,
"num_tokens": 7606457.0,
"repeat_count": 0.0,
- "routers_loss": 0.004462256096303463,
+ "routers_loss": 0.004210884217172861,
"skip_count": 1.0,
"step": 4716,
"text_loss": 0.39850690960884094
@@ -44819,13 +44819,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.021728515625,
+ "grad_norm": 0.02294921875,
"learning_rate": 0.0006312273348215589,
- "loss": 0.0069,
+ "loss": 0.0068,
"macro_f1": 0.6666666865348816,
"num_tokens": 7609317.0,
"repeat_count": 1.0,
- "routers_loss": 0.0011878227815032005,
+ "routers_loss": 0.001220117206685245,
"skip_count": 0.0,
"step": 4718,
"text_loss": 0.3509018123149872
@@ -44838,13 +44838,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0006309286434908419,
- "loss": 0.008,
+ "loss": 0.0081,
"macro_f1": 0.6666666865348816,
"num_tokens": 7613076.0,
"repeat_count": 0.0,
- "routers_loss": 0.008010992780327797,
+ "routers_loss": 0.007768960203975439,
"skip_count": 2.0,
"step": 4720,
"text_loss": 0.33361560106277466
@@ -44857,32 +44857,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031494140625,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.0006306299019880217,
"loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 7616242.0,
"repeat_count": 0.0,
- "routers_loss": 0.005931100342422724,
+ "routers_loss": 0.006226699333637953,
"skip_count": 0.0,
"step": 4722,
"text_loss": 0.23661087453365326
},
{
"acc_repeat": 1.0,
- "acc_skip": 1.0,
- "avg_layers": 28.0,
+ "acc_skip": 0.0,
+ "avg_layers": 29.0,
"epoch": 22.17845611975345,
- "f1_execute": 1.0,
+ "f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
- "f1_skip": 1.0,
- "grad_norm": 0.0478515625,
+ "f1_skip": 0.0,
+ "grad_norm": 0.045654296875,
"learning_rate": 0.0006303311104275766,
- "loss": 0.0075,
- "macro_f1": 1.0,
+ "loss": 0.0073,
+ "macro_f1": 0.6603773832321167,
"num_tokens": 7619069.0,
"repeat_count": 1.0,
- "routers_loss": 0.013775430619716644,
+ "routers_loss": 0.015590761788189411,
"skip_count": 1.0,
"step": 4724,
"text_loss": 0.23373056948184967
@@ -44895,13 +44895,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0302734375,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0006300322689240041,
- "loss": 0.0077,
+ "loss": 0.0076,
"macro_f1": 1.0,
"num_tokens": 7622581.0,
"repeat_count": 1.0,
- "routers_loss": 0.0069032334722578526,
+ "routers_loss": 0.006862971931695938,
"skip_count": 2.0,
"step": 4726,
"text_loss": 0.8301828503608704
@@ -44914,13 +44914,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.038818359375,
"learning_rate": 0.0006297333775918209,
- "loss": 0.0089,
+ "loss": 0.0086,
"macro_f1": 1.0,
"num_tokens": 7625566.0,
"repeat_count": 1.0,
- "routers_loss": 0.006230995524674654,
+ "routers_loss": 0.006256614346057177,
"skip_count": 1.0,
"step": 4728,
"text_loss": 0.3756707012653351
@@ -44933,13 +44933,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.030517578125,
+ "grad_norm": 0.0301513671875,
"learning_rate": 0.0006294344365455626,
- "loss": 0.0078,
+ "loss": 0.0079,
"macro_f1": 1.0,
"num_tokens": 7629047.0,
"repeat_count": 1.0,
- "routers_loss": 0.009772522374987602,
+ "routers_loss": 0.009151885285973549,
"skip_count": 2.0,
"step": 4730,
"text_loss": 0.33362850546836853
@@ -44952,13 +44952,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.035888671875,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.0006291354458997841,
- "loss": 0.007,
+ "loss": 0.0071,
"macro_f1": 0.3333333432674408,
"num_tokens": 7631847.0,
"repeat_count": 0.0,
- "routers_loss": 0.000902787665836513,
+ "routers_loss": 0.0009307434665970504,
"skip_count": 0.0,
"step": 4732,
"text_loss": 0.4572524130344391
@@ -44971,13 +44971,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0262451171875,
+ "grad_norm": 0.0272216796875,
"learning_rate": 0.0006288364057690591,
- "loss": 0.0072,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 7635181.0,
"repeat_count": 0.0,
- "routers_loss": 0.0004107247805222869,
+ "routers_loss": 0.00041220212006010115,
"skip_count": 0.0,
"step": 4734,
"text_loss": 0.40211325883865356
@@ -44990,13 +44990,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03515625,
+ "grad_norm": 0.03955078125,
"learning_rate": 0.0006285373162679804,
"loss": 0.0045,
"macro_f1": 0.6666666865348816,
"num_tokens": 7637752.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008339153719134629,
+ "routers_loss": 0.0006696670898236334,
"skip_count": 2.0,
"step": 4736,
"text_loss": 0.7588053345680237
@@ -45009,13 +45009,13 @@
"f1_execute": 0.9777777791023254,
"f1_repeat": 0.8571428656578064,
"f1_skip": 1.0,
- "grad_norm": 0.0390625,
+ "grad_norm": 0.03759765625,
"learning_rate": 0.0006282381775111597,
"loss": 0.0081,
"macro_f1": 0.9449735879898071,
"num_tokens": 7640719.0,
"repeat_count": 4.0,
- "routers_loss": 0.015601541846990585,
+ "routers_loss": 0.016283133998513222,
"skip_count": 2.0,
"step": 4738,
"text_loss": 0.5697863101959229
@@ -45028,13 +45028,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.037841796875,
+ "grad_norm": 0.03955078125,
"learning_rate": 0.0006279389896132274,
- "loss": 0.0062,
+ "loss": 0.0061,
"macro_f1": 0.6666666865348816,
"num_tokens": 7643524.0,
"repeat_count": 0.0,
- "routers_loss": 0.00740925082936883,
+ "routers_loss": 0.00763951288536191,
"skip_count": 3.0,
"step": 4740,
"text_loss": 0.548592209815979
@@ -45049,11 +45049,11 @@
"f1_skip": 1.0,
"grad_norm": 0.03857421875,
"learning_rate": 0.0006276397526888329,
- "loss": 0.0095,
+ "loss": 0.0094,
"macro_f1": 0.925203263759613,
"num_tokens": 7646919.0,
"repeat_count": 3.0,
- "routers_loss": 0.03791050612926483,
+ "routers_loss": 0.038590483367443085,
"skip_count": 5.0,
"step": 4742,
"text_loss": 0.27226054668426514
@@ -45066,13 +45066,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03564453125,
+ "grad_norm": 0.037109375,
"learning_rate": 0.0006273404668526443,
"loss": 0.005,
"macro_f1": 0.3333333432674408,
"num_tokens": 7650404.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013001165352761745,
+ "routers_loss": 0.0012555639259517193,
"skip_count": 0.0,
"step": 4744,
"text_loss": 0.47892290353775024
@@ -45085,13 +45085,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0235595703125,
+ "grad_norm": 0.0233154296875,
"learning_rate": 0.0006270411322193488,
"loss": 0.0049,
"macro_f1": 0.6666666865348816,
"num_tokens": 7652942.0,
"repeat_count": 1.0,
- "routers_loss": 0.001371108810417354,
+ "routers_loss": 0.0015356402145698667,
"skip_count": 0.0,
"step": 4746,
"text_loss": 0.5515767931938171
@@ -45104,13 +45104,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03369140625,
+ "grad_norm": 0.0390625,
"learning_rate": 0.0006267417489036517,
"loss": 0.0087,
"macro_f1": 0.3333333432674408,
"num_tokens": 7656269.0,
"repeat_count": 0.0,
- "routers_loss": 0.00558467349037528,
+ "routers_loss": 0.005182140972465277,
"skip_count": 0.0,
"step": 4748,
"text_loss": 0.3496028184890747
@@ -45123,13 +45123,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.062255859375,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.0006264423170202773,
"loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 7658664.0,
"repeat_count": 0.0,
- "routers_loss": 0.0044899932108819485,
+ "routers_loss": 0.004144361708313227,
"skip_count": 0.0,
"step": 4750,
"text_loss": 0.2786032557487488
@@ -45142,13 +45142,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0269775390625,
+ "grad_norm": 0.0267333984375,
"learning_rate": 0.0006261428366839685,
- "loss": 0.0047,
+ "loss": 0.0046,
"macro_f1": 0.3333333432674408,
"num_tokens": 7661471.0,
"repeat_count": 0.0,
- "routers_loss": 0.0002782076771836728,
+ "routers_loss": 0.00035335420398041606,
"skip_count": 0.0,
"step": 4752,
"text_loss": 0.4838487505912781
@@ -45161,32 +45161,32 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02783203125,
+ "grad_norm": 0.030517578125,
"learning_rate": 0.0006258433080094868,
- "loss": 0.0044,
+ "loss": 0.0045,
"macro_f1": 0.6666666865348816,
"num_tokens": 7664593.0,
"repeat_count": 0.0,
- "routers_loss": 0.010121302679181099,
+ "routers_loss": 0.0103341368958354,
"skip_count": 2.0,
"step": 4754,
"text_loss": 0.24325360357761383
},
{
"acc_repeat": 0.0,
- "acc_skip": 0.5,
- "avg_layers": 27.0,
+ "acc_skip": 1.0,
+ "avg_layers": 26.0,
"epoch": 22.328734957440563,
- "f1_execute": 0.9811320900917053,
+ "f1_execute": 1.0,
"f1_repeat": 0.0,
- "f1_skip": 0.6666666865348816,
- "grad_norm": 0.035888671875,
+ "f1_skip": 1.0,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.0006255437311116119,
- "loss": 0.0082,
- "macro_f1": 0.5492662787437439,
+ "loss": 0.0079,
+ "macro_f1": 0.6666666865348816,
"num_tokens": 7667573.0,
"repeat_count": 0.0,
- "routers_loss": 0.015182681381702423,
+ "routers_loss": 0.014633853919804096,
"skip_count": 2.0,
"step": 4756,
"text_loss": 0.21569855511188507
@@ -45199,13 +45199,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.029052734375,
+ "grad_norm": 0.0284423828125,
"learning_rate": 0.0006252441061051426,
- "loss": 0.0056,
+ "loss": 0.0052,
"macro_f1": 0.3333333432674408,
"num_tokens": 7671171.0,
"repeat_count": 0.0,
- "routers_loss": 0.005404457915574312,
+ "routers_loss": 0.004900569561868906,
"skip_count": 0.0,
"step": 4758,
"text_loss": 0.12832018733024597
@@ -45218,13 +45218,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.030517578125,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0006249444331048955,
"loss": 0.0055,
"macro_f1": 0.3333333432674408,
"num_tokens": 7673932.0,
"repeat_count": 0.0,
- "routers_loss": 0.002476566471159458,
+ "routers_loss": 0.0020371589343994856,
"skip_count": 0.0,
"step": 4760,
"text_loss": 0.38652482628822327
@@ -45237,13 +45237,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0281982421875,
+ "grad_norm": 0.025634765625,
"learning_rate": 0.000624644712225706,
"loss": 0.0054,
"macro_f1": 0.6666666865348816,
"num_tokens": 7677396.0,
"repeat_count": 0.0,
- "routers_loss": 0.003040580777451396,
+ "routers_loss": 0.0028059002943336964,
"skip_count": 2.0,
"step": 4762,
"text_loss": 0.7937633395195007
@@ -45256,13 +45256,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0238037109375,
+ "grad_norm": 0.02587890625,
"learning_rate": 0.0006243449435824276,
- "loss": 0.005,
+ "loss": 0.0051,
"macro_f1": 0.3333333432674408,
"num_tokens": 7680392.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007072070729918778,
+ "routers_loss": 0.0007225095760077238,
"skip_count": 0.0,
"step": 4764,
"text_loss": 0.5690395832061768
@@ -45275,13 +45275,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02783203125,
+ "grad_norm": 0.0281982421875,
"learning_rate": 0.0006240451272899321,
- "loss": 0.0063,
+ "loss": 0.0062,
"macro_f1": 0.6666666865348816,
"num_tokens": 7684121.0,
"repeat_count": 0.0,
- "routers_loss": 0.0024044427555054426,
+ "routers_loss": 0.002052050782367587,
"skip_count": 1.0,
"step": 4766,
"text_loss": 0.5321336984634399
@@ -45294,13 +45294,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.03515625,
"learning_rate": 0.0006237452634631099,
- "loss": 0.0071,
+ "loss": 0.007,
"macro_f1": 0.6666666865348816,
"num_tokens": 7687236.0,
"repeat_count": 1.0,
- "routers_loss": 0.003375594737008214,
+ "routers_loss": 0.0039039517287164927,
"skip_count": 0.0,
"step": 4768,
"text_loss": 0.30823320150375366
@@ -45313,13 +45313,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0299072265625,
+ "grad_norm": 0.0303955078125,
"learning_rate": 0.0006234453522168694,
- "loss": 0.0087,
+ "loss": 0.0084,
"macro_f1": 0.5492662787437439,
"num_tokens": 7690355.0,
"repeat_count": 0.0,
- "routers_loss": 0.016256459057331085,
+ "routers_loss": 0.014570238068699837,
"skip_count": 2.0,
"step": 4770,
"text_loss": 0.21501587331295013
@@ -45332,13 +45332,13 @@
"f1_execute": 0.949999988079071,
"f1_repeat": 0.800000011920929,
"f1_skip": 0.9090909361839294,
- "grad_norm": 0.048583984375,
+ "grad_norm": 0.04541015625,
"learning_rate": 0.000623145393666137,
- "loss": 0.0071,
+ "loss": 0.0069,
"macro_f1": 0.886363685131073,
"num_tokens": 7693559.0,
"repeat_count": 3.0,
- "routers_loss": 0.06640318781137466,
+ "routers_loss": 0.061707716435194016,
"skip_count": 6.0,
"step": 4772,
"text_loss": 0.24371100962162018
@@ -45351,13 +45351,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0255126953125,
+ "grad_norm": 0.0281982421875,
"learning_rate": 0.0006228453879258576,
"loss": 0.0037,
"macro_f1": 0.6666666865348816,
"num_tokens": 7696422.0,
"repeat_count": 0.0,
- "routers_loss": 0.004930639173835516,
+ "routers_loss": 0.005053870379924774,
"skip_count": 2.0,
"step": 4774,
"text_loss": 0.237778440117836
@@ -45370,13 +45370,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05224609375,
+ "grad_norm": 0.060302734375,
"learning_rate": 0.0006225453351109934,
- "loss": 0.0088,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 7700460.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018267944687977433,
+ "routers_loss": 0.0017990898340940475,
"skip_count": 0.0,
"step": 4776,
"text_loss": 0.612456738948822
@@ -45389,13 +45389,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.033203125,
+ "grad_norm": 0.03466796875,
"learning_rate": 0.000622245235336526,
"loss": 0.0054,
"macro_f1": 0.6666666865348816,
"num_tokens": 7703330.0,
"repeat_count": 0.0,
- "routers_loss": 0.004756844602525234,
+ "routers_loss": 0.004507021512836218,
"skip_count": 2.0,
"step": 4778,
"text_loss": 0.36898812651634216
@@ -45408,13 +45408,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.031005859375,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0006219450887174537,
- "loss": 0.0065,
+ "loss": 0.0064,
"macro_f1": 0.6666666865348816,
"num_tokens": 7707243.0,
"repeat_count": 0.0,
- "routers_loss": 0.00667968625202775,
+ "routers_loss": 0.006295828148722649,
"skip_count": 1.0,
"step": 4780,
"text_loss": 0.14474599063396454
@@ -45427,13 +45427,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0294189453125,
+ "grad_norm": 0.03515625,
"learning_rate": 0.0006216448953687932,
- "loss": 0.0071,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 7711121.0,
"repeat_count": 0.0,
- "routers_loss": 0.004780827090144157,
+ "routers_loss": 0.005049831233918667,
"skip_count": 0.0,
"step": 4782,
"text_loss": 0.4696790277957916
@@ -45446,13 +45446,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02783203125,
+ "grad_norm": 0.028076171875,
"learning_rate": 0.0006213446554055795,
"loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 7714889.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006851314683444798,
+ "routers_loss": 0.0006010758224874735,
"skip_count": 0.0,
"step": 4784,
"text_loss": 0.46253830194473267
@@ -45465,13 +45465,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0277099609375,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0006210443689428649,
- "loss": 0.0062,
+ "loss": 0.0063,
"macro_f1": 1.0,
"num_tokens": 7718420.0,
"repeat_count": 3.0,
- "routers_loss": 0.00759447505697608,
+ "routers_loss": 0.006691234186291695,
"skip_count": 1.0,
"step": 4786,
"text_loss": 0.579987645149231
@@ -45484,13 +45484,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.033447265625,
+ "grad_norm": 0.035400390625,
"learning_rate": 0.00062074403609572,
- "loss": 0.0076,
+ "loss": 0.0074,
"macro_f1": 0.3333333432674408,
"num_tokens": 7721720.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019895671866834164,
+ "routers_loss": 0.001864895923063159,
"skip_count": 0.0,
"step": 4788,
"text_loss": 0.325242817401886
@@ -45503,13 +45503,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0284423828125,
+ "grad_norm": 0.02880859375,
"learning_rate": 0.0006204436569792324,
- "loss": 0.009,
+ "loss": 0.0089,
"macro_f1": 0.3333333432674408,
"num_tokens": 7724916.0,
"repeat_count": 0.0,
- "routers_loss": 0.0020269565284252167,
+ "routers_loss": 0.00202955212444067,
"skip_count": 0.0,
"step": 4790,
"text_loss": 0.49637556076049805
@@ -45522,13 +45522,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.0006201432317085083,
- "loss": 0.0086,
+ "loss": 0.0085,
"macro_f1": 0.6666666865348816,
"num_tokens": 7728081.0,
"repeat_count": 1.0,
- "routers_loss": 0.004511707462370396,
+ "routers_loss": 0.0037843603640794754,
"skip_count": 0.0,
"step": 4792,
"text_loss": 0.38812628388404846
@@ -45541,13 +45541,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.02880859375,
+ "grad_norm": 0.0301513671875,
"learning_rate": 0.0006198427603986711,
- "loss": 0.0065,
+ "loss": 0.0066,
"macro_f1": 0.6666666865348816,
"num_tokens": 7731457.0,
"repeat_count": 0.0,
- "routers_loss": 0.011496705003082752,
+ "routers_loss": 0.012036679312586784,
"skip_count": 3.0,
"step": 4794,
"text_loss": 0.2996312379837036
@@ -45560,13 +45560,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.022705078125,
+ "grad_norm": 0.0247802734375,
"learning_rate": 0.0006195422431648623,
- "loss": 0.0058,
+ "loss": 0.006,
"macro_f1": 0.6666666865348816,
"num_tokens": 7734595.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009816563688218594,
+ "routers_loss": 0.0008874868508428335,
"skip_count": 1.0,
"step": 4796,
"text_loss": 0.3203189969062805
@@ -45579,13 +45579,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0281982421875,
+ "grad_norm": 0.0283203125,
"learning_rate": 0.0006192416801222403,
"loss": 0.0051,
"macro_f1": 1.0,
"num_tokens": 7737565.0,
"repeat_count": 1.0,
- "routers_loss": 0.0031518745236098766,
+ "routers_loss": 0.0032894534524530172,
"skip_count": 1.0,
"step": 4798,
"text_loss": 0.3283322751522064
@@ -45598,13 +45598,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.056640625,
+ "grad_norm": 0.053955078125,
"learning_rate": 0.0006189410713859815,
"loss": 0.0076,
"macro_f1": 0.6666666865348816,
"num_tokens": 7740439.0,
"repeat_count": 0.0,
- "routers_loss": 0.009768245741724968,
+ "routers_loss": 0.009667043574154377,
"skip_count": 2.0,
"step": 4800,
"text_loss": 0.25219282507896423
@@ -45617,13 +45617,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03662109375,
+ "grad_norm": 0.03857421875,
"learning_rate": 0.0006186404170712797,
- "loss": 0.0094,
+ "loss": 0.0093,
"macro_f1": 0.6666666865348816,
"num_tokens": 7743813.0,
"repeat_count": 0.0,
- "routers_loss": 0.012967129237949848,
+ "routers_loss": 0.012643060646951199,
"skip_count": 4.0,
"step": 4802,
"text_loss": 0.22567439079284668
@@ -45636,13 +45636,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.031982421875,
+ "grad_norm": 0.03125,
"learning_rate": 0.0006183397172933462,
- "loss": 0.006,
+ "loss": 0.0058,
"macro_f1": 0.3333333432674408,
"num_tokens": 7747182.0,
"repeat_count": 0.0,
- "routers_loss": 0.002813612576574087,
+ "routers_loss": 0.002678517485037446,
"skip_count": 0.0,
"step": 4804,
"text_loss": 0.19188879430294037
@@ -45655,13 +45655,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0228271484375,
+ "grad_norm": 0.0233154296875,
"learning_rate": 0.0006180389721674101,
"loss": 0.0049,
"macro_f1": 0.3333333432674408,
"num_tokens": 7750735.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013491564895957708,
+ "routers_loss": 0.0013385121710598469,
"skip_count": 0.0,
"step": 4806,
"text_loss": 0.5860441327095032
@@ -45674,13 +45674,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.031005859375,
+ "grad_norm": 0.0284423828125,
"learning_rate": 0.000617738181808717,
- "loss": 0.0063,
+ "loss": 0.0064,
"macro_f1": 0.6666666865348816,
"num_tokens": 7753843.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035517180804163218,
+ "routers_loss": 0.0034869094379246235,
"skip_count": 1.0,
"step": 4808,
"text_loss": 0.4366260766983032
@@ -45693,13 +45693,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.044677734375,
+ "grad_norm": 0.0478515625,
"learning_rate": 0.0006174373463325306,
"loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 7757039.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014680681051686406,
+ "routers_loss": 0.0013648992171511054,
"skip_count": 0.0,
"step": 4810,
"text_loss": 0.5217258334159851
@@ -45712,13 +45712,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.025390625,
+ "grad_norm": 0.0274658203125,
"learning_rate": 0.0006171364658541314,
"loss": 0.0044,
"macro_f1": 1.0,
"num_tokens": 7760016.0,
"repeat_count": 1.0,
- "routers_loss": 0.004398355260491371,
+ "routers_loss": 0.0038017008919268847,
"skip_count": 2.0,
"step": 4812,
"text_loss": 0.8130963444709778
@@ -45731,13 +45731,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.039794921875,
+ "grad_norm": 0.03466796875,
"learning_rate": 0.0006168355404888177,
- "loss": 0.008,
+ "loss": 0.0078,
"macro_f1": 0.6666666865348816,
"num_tokens": 7762961.0,
"repeat_count": 0.0,
- "routers_loss": 0.006870325654745102,
+ "routers_loss": 0.006867518648505211,
"skip_count": 2.0,
"step": 4814,
"text_loss": 0.17822521924972534
@@ -45750,13 +45750,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0299072265625,
+ "grad_norm": 0.03076171875,
"learning_rate": 0.0006165345703519043,
- "loss": 0.0059,
+ "loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 7766399.0,
"repeat_count": 0.0,
- "routers_loss": 0.0004937525955028832,
+ "routers_loss": 0.0004653502255678177,
"skip_count": 0.0,
"step": 4816,
"text_loss": 0.5316070914268494
@@ -45769,13 +45769,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.036376953125,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.0006162335555587238,
- "loss": 0.0081,
+ "loss": 0.008,
"macro_f1": 1.0,
"num_tokens": 7769039.0,
"repeat_count": 1.0,
- "routers_loss": 0.0014112245989963412,
+ "routers_loss": 0.0016906452365219593,
"skip_count": 1.0,
"step": 4818,
"text_loss": 0.5680997967720032
@@ -45788,13 +45788,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.051025390625,
+ "grad_norm": 0.05615234375,
"learning_rate": 0.0006159324962246257,
"loss": 0.0066,
"macro_f1": 0.3333333432674408,
"num_tokens": 7772768.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026105360593646765,
+ "routers_loss": 0.002541248919442296,
"skip_count": 0.0,
"step": 4820,
"text_loss": 0.6169226169586182
@@ -45807,13 +45807,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.029541015625,
+ "grad_norm": 0.031494140625,
"learning_rate": 0.0006156313924649762,
- "loss": 0.0057,
+ "loss": 0.0056,
"macro_f1": 0.6666666865348816,
"num_tokens": 7775545.0,
"repeat_count": 0.0,
- "routers_loss": 0.008672980591654778,
+ "routers_loss": 0.008644679561257362,
"skip_count": 2.0,
"step": 4822,
"text_loss": 0.2211475968360901
@@ -45826,13 +45826,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0294189453125,
+ "grad_norm": 0.02880859375,
"learning_rate": 0.0006153302443951589,
- "loss": 0.006,
+ "loss": 0.0059,
"macro_f1": 0.6666666865348816,
"num_tokens": 7778837.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035932520404458046,
+ "routers_loss": 0.0041346061043441296,
"skip_count": 2.0,
"step": 4824,
"text_loss": 0.5369775891304016
@@ -45851,7 +45851,7 @@
"macro_f1": 0.3333333432674408,
"num_tokens": 7782309.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012299016816541553,
+ "routers_loss": 0.0012756052892655134,
"skip_count": 0.0,
"step": 4826,
"text_loss": 0.5294989943504333
@@ -45864,13 +45864,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.026123046875,
+ "grad_norm": 0.02734375,
"learning_rate": 0.0006147278157866403,
- "loss": 0.0047,
+ "loss": 0.0046,
"macro_f1": 0.3272727429866791,
"num_tokens": 7785565.0,
"repeat_count": 0.0,
- "routers_loss": 0.02901158109307289,
+ "routers_loss": 0.029718991369009018,
"skip_count": 1.0,
"step": 4828,
"text_loss": 0.6920449733734131
@@ -45883,13 +45883,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03466796875,
+ "grad_norm": 0.032470703125,
"learning_rate": 0.0006144265354787906,
"loss": 0.0052,
"macro_f1": 0.3333333432674408,
"num_tokens": 7788218.0,
"repeat_count": 0.0,
- "routers_loss": 0.00484448904171586,
+ "routers_loss": 0.004829924553632736,
"skip_count": 0.0,
"step": 4830,
"text_loss": 0.17072243988513947
@@ -45902,13 +45902,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07421875,
+ "grad_norm": 0.06689453125,
"learning_rate": 0.0006141252113224767,
- "loss": 0.0045,
+ "loss": 0.0043,
"macro_f1": 0.3333333432674408,
"num_tokens": 7790788.0,
"repeat_count": 0.0,
- "routers_loss": 0.002483877120539546,
+ "routers_loss": 0.00254037044942379,
"skip_count": 0.0,
"step": 4832,
"text_loss": 0.20075996220111847
@@ -45921,13 +45921,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0157470703125,
+ "grad_norm": 0.01519775390625,
"learning_rate": 0.0006138238434331666,
- "loss": 0.0046,
+ "loss": 0.0044,
"macro_f1": 0.3333333432674408,
"num_tokens": 7793913.0,
"repeat_count": 0.0,
- "routers_loss": 0.0004437893512658775,
+ "routers_loss": 0.0004426188243087381,
"skip_count": 0.0,
"step": 4834,
"text_loss": 0.695742130279541
@@ -45940,13 +45940,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0240478515625,
+ "grad_norm": 0.022216796875,
"learning_rate": 0.000613522431926345,
- "loss": 0.0037,
+ "loss": 0.0036,
"macro_f1": 1.0,
"num_tokens": 7796932.0,
"repeat_count": 1.0,
- "routers_loss": 0.005339824128895998,
+ "routers_loss": 0.005176798906177282,
"skip_count": 3.0,
"step": 4836,
"text_loss": 0.4910822808742523
@@ -45959,13 +45959,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025634765625,
+ "grad_norm": 0.0262451171875,
"learning_rate": 0.0006132209769175132,
- "loss": 0.0047,
+ "loss": 0.0045,
"macro_f1": 0.3333333432674408,
"num_tokens": 7800686.0,
"repeat_count": 0.0,
- "routers_loss": 0.004220465198159218,
+ "routers_loss": 0.004120545461773872,
"skip_count": 0.0,
"step": 4838,
"text_loss": 0.3701378405094147
@@ -45978,13 +45978,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.022216796875,
+ "grad_norm": 0.0218505859375,
"learning_rate": 0.0006129194785221894,
"loss": 0.005,
"macro_f1": 0.3333333432674408,
"num_tokens": 7804765.0,
"repeat_count": 0.0,
- "routers_loss": 0.00431162491440773,
+ "routers_loss": 0.0043835826218128204,
"skip_count": 0.0,
"step": 4840,
"text_loss": 0.343635618686676
@@ -45997,13 +45997,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.035400390625,
+ "grad_norm": 0.035888671875,
"learning_rate": 0.0006126179368559086,
- "loss": 0.0059,
+ "loss": 0.0055,
"macro_f1": 0.6666666865348816,
"num_tokens": 7807498.0,
"repeat_count": 0.0,
- "routers_loss": 0.0013186183059588075,
+ "routers_loss": 0.001394893741235137,
"skip_count": 1.0,
"step": 4842,
"text_loss": 0.47756674885749817
@@ -46016,13 +46016,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.041259765625,
+ "grad_norm": 0.048828125,
"learning_rate": 0.000612316352034222,
- "loss": 0.0053,
+ "loss": 0.0054,
"macro_f1": 0.6666666865348816,
"num_tokens": 7810784.0,
"repeat_count": 0.0,
- "routers_loss": 0.0030679802875965834,
+ "routers_loss": 0.0031262130942195654,
"skip_count": 2.0,
"step": 4844,
"text_loss": 0.13077901303768158
@@ -46035,13 +46035,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.0546875,
+ "grad_norm": 0.058349609375,
"learning_rate": 0.0006120147241726972,
- "loss": 0.0083,
+ "loss": 0.0081,
"macro_f1": 0.8823530077934265,
"num_tokens": 7814754.0,
"repeat_count": 2.0,
- "routers_loss": 0.02003045752644539,
+ "routers_loss": 0.016139274463057518,
"skip_count": 1.0,
"step": 4846,
"text_loss": 0.18850074708461761
@@ -46054,13 +46054,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04638671875,
+ "grad_norm": 0.041015625,
"learning_rate": 0.0006117130533869189,
"loss": 0.0057,
"macro_f1": 0.3333333432674408,
"num_tokens": 7818245.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010096770711243153,
+ "routers_loss": 0.0009124451316893101,
"skip_count": 0.0,
"step": 4848,
"text_loss": 0.42503559589385986
@@ -46073,13 +46073,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.022705078125,
+ "grad_norm": 0.0224609375,
"learning_rate": 0.0006114113397924878,
"loss": 0.0062,
"macro_f1": 0.3333333432674408,
"num_tokens": 7822214.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014919894747436047,
+ "routers_loss": 0.0015132242115214467,
"skip_count": 0.0,
"step": 4850,
"text_loss": 0.16767354309558868
@@ -46092,13 +46092,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.040283203125,
+ "grad_norm": 0.04150390625,
"learning_rate": 0.0006111095835050212,
"loss": 0.0062,
"macro_f1": 1.0,
"num_tokens": 7825019.0,
"repeat_count": 2.0,
- "routers_loss": 0.0065781730227172375,
+ "routers_loss": 0.006253300234675407,
"skip_count": 2.0,
"step": 4852,
"text_loss": 0.44826745986938477
@@ -46111,13 +46111,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.025146484375,
+ "grad_norm": 0.024169921875,
"learning_rate": 0.0006108077846401524,
- "loss": 0.0037,
+ "loss": 0.0038,
"macro_f1": 0.3333333432674408,
"num_tokens": 7828113.0,
"repeat_count": 0.0,
- "routers_loss": 0.00244692200794816,
+ "routers_loss": 0.0024391328915953636,
"skip_count": 0.0,
"step": 4854,
"text_loss": 0.2009880244731903
@@ -46130,13 +46130,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.028564453125,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0006105059433135317,
- "loss": 0.0079,
+ "loss": 0.0078,
"macro_f1": 1.0,
"num_tokens": 7831177.0,
"repeat_count": 1.0,
- "routers_loss": 0.002367270179092884,
+ "routers_loss": 0.0020866121631115675,
"skip_count": 1.0,
"step": 4856,
"text_loss": 0.7082528471946716
@@ -46149,13 +46149,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0272216796875,
+ "grad_norm": 0.025390625,
"learning_rate": 0.0006102040596408251,
- "loss": 0.0072,
+ "loss": 0.007,
"macro_f1": 0.6666666865348816,
"num_tokens": 7834485.0,
"repeat_count": 0.0,
- "routers_loss": 0.005648438818752766,
+ "routers_loss": 0.004373365081846714,
"skip_count": 1.0,
"step": 4858,
"text_loss": 0.2541539669036865
@@ -46168,13 +46168,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0291748046875,
+ "grad_norm": 0.02734375,
"learning_rate": 0.0006099021337377148,
- "loss": 0.0049,
+ "loss": 0.0048,
"macro_f1": 0.3333333432674408,
"num_tokens": 7837749.0,
"repeat_count": 0.0,
- "routers_loss": 0.0042733135633170605,
+ "routers_loss": 0.004309024661779404,
"skip_count": 0.0,
"step": 4860,
"text_loss": 0.3163885176181793
@@ -46187,13 +46187,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.8571428656578064,
- "grad_norm": 0.04296875,
+ "grad_norm": 0.049072265625,
"learning_rate": 0.0006096001657198995,
- "loss": 0.0066,
+ "loss": 0.0065,
"macro_f1": 0.6122449040412903,
"num_tokens": 7840979.0,
"repeat_count": 0.0,
- "routers_loss": 0.023403044790029526,
+ "routers_loss": 0.023044804111123085,
"skip_count": 4.0,
"step": 4862,
"text_loss": 0.49609798192977905
@@ -46206,13 +46206,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.02490234375,
+ "grad_norm": 0.0250244140625,
"learning_rate": 0.0006092981557030941,
- "loss": 0.0058,
+ "loss": 0.0056,
"macro_f1": 1.0,
"num_tokens": 7844905.0,
"repeat_count": 1.0,
- "routers_loss": 0.011750902980566025,
+ "routers_loss": 0.010683654807507992,
"skip_count": 3.0,
"step": 4864,
"text_loss": 0.16866883635520935
@@ -46225,13 +46225,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02099609375,
+ "grad_norm": 0.0224609375,
"learning_rate": 0.0006089961038030291,
- "loss": 0.006,
+ "loss": 0.0061,
"macro_f1": 0.3333333432674408,
"num_tokens": 7847800.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012288664001971483,
+ "routers_loss": 0.0011224723421037197,
"skip_count": 0.0,
"step": 4866,
"text_loss": 0.5093055367469788
@@ -46244,13 +46244,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.037353515625,
"learning_rate": 0.0006086940101354515,
- "loss": 0.0051,
+ "loss": 0.0052,
"macro_f1": 0.6666666865348816,
"num_tokens": 7850983.0,
"repeat_count": 0.0,
- "routers_loss": 0.004861745983362198,
+ "routers_loss": 0.003944621421396732,
"skip_count": 1.0,
"step": 4868,
"text_loss": 0.5753747224807739
@@ -46263,13 +46263,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.0311279296875,
+ "grad_norm": 0.0289306640625,
"learning_rate": 0.0006083918748161244,
"loss": 0.0069,
"macro_f1": 0.5492662787437439,
"num_tokens": 7855041.0,
"repeat_count": 0.0,
- "routers_loss": 0.025008518248796463,
+ "routers_loss": 0.02532145567238331,
"skip_count": 2.0,
"step": 4870,
"text_loss": 0.8082366585731506
@@ -46282,13 +46282,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0286865234375,
+ "grad_norm": 0.029052734375,
"learning_rate": 0.0006080896979608262,
- "loss": 0.0054,
+ "loss": 0.0052,
"macro_f1": 0.3333333432674408,
"num_tokens": 7858058.0,
"repeat_count": 0.0,
- "routers_loss": 0.0007896953029558063,
+ "routers_loss": 0.0007558314246125519,
"skip_count": 0.0,
"step": 4872,
"text_loss": 0.6476574540138245
@@ -46301,13 +46301,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05029296875,
+ "grad_norm": 0.0615234375,
"learning_rate": 0.000607787479685352,
"loss": 0.0073,
"macro_f1": 0.3333333432674408,
"num_tokens": 7861223.0,
"repeat_count": 0.0,
- "routers_loss": 0.0008885554852895439,
+ "routers_loss": 0.0009224560926668346,
"skip_count": 0.0,
"step": 4874,
"text_loss": 0.5012133717536926
@@ -46320,13 +46320,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03662109375,
+ "grad_norm": 0.03515625,
"learning_rate": 0.0006074852201055121,
- "loss": 0.0084,
+ "loss": 0.0082,
"macro_f1": 0.3333333432674408,
"num_tokens": 7864180.0,
"repeat_count": 0.0,
- "routers_loss": 0.0029017175547778606,
+ "routers_loss": 0.0028308273758739233,
"skip_count": 0.0,
"step": 4876,
"text_loss": 0.7447214722633362
@@ -46339,13 +46339,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.04541015625,
+ "grad_norm": 0.052734375,
"learning_rate": 0.0006071829193371331,
- "loss": 0.0058,
+ "loss": 0.0059,
"macro_f1": 0.3333333432674408,
"num_tokens": 7866726.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021245202515274286,
+ "routers_loss": 0.0021505290642380714,
"skip_count": 0.0,
"step": 4878,
"text_loss": 0.5444929599761963
@@ -46358,13 +46358,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0751953125,
+ "grad_norm": 0.11376953125,
"learning_rate": 0.0006068805774960573,
"loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 7870166.0,
"repeat_count": 0.0,
- "routers_loss": 0.0021692372392863035,
+ "routers_loss": 0.0021109723020344973,
"skip_count": 0.0,
"step": 4880,
"text_loss": 0.3577263355255127
@@ -46377,13 +46377,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02880859375,
+ "grad_norm": 0.0308837890625,
"learning_rate": 0.0006065781946981425,
- "loss": 0.0057,
+ "loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 7873028.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026705453637987375,
+ "routers_loss": 0.0027144821360707283,
"skip_count": 0.0,
"step": 4882,
"text_loss": 0.28464797139167786
@@ -46396,13 +46396,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.05517578125,
+ "grad_norm": 0.05224609375,
"learning_rate": 0.0006062757710592624,
- "loss": 0.0058,
+ "loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 7876747.0,
"repeat_count": 0.0,
- "routers_loss": 0.0004111099406145513,
+ "routers_loss": 0.0004638207610696554,
"skip_count": 0.0,
"step": 4884,
"text_loss": 0.381534606218338
@@ -46415,13 +46415,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0299072265625,
+ "grad_norm": 0.02685546875,
"learning_rate": 0.0006059733066953066,
- "loss": 0.0045,
+ "loss": 0.0043,
"macro_f1": 1.0,
"num_tokens": 7879524.0,
"repeat_count": 1.0,
- "routers_loss": 0.0019129335414618254,
+ "routers_loss": 0.002225410658866167,
"skip_count": 2.0,
"step": 4886,
"text_loss": 0.5167883634567261
@@ -46434,13 +46434,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0279541015625,
+ "grad_norm": 0.028564453125,
"learning_rate": 0.0006056708017221796,
- "loss": 0.0043,
+ "loss": 0.0042,
"macro_f1": 0.6666666865348816,
"num_tokens": 7882809.0,
"repeat_count": 0.0,
- "routers_loss": 0.0046940455213189125,
+ "routers_loss": 0.00419368501752615,
"skip_count": 1.0,
"step": 4888,
"text_loss": 0.22688335180282593
@@ -46453,13 +46453,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.030517578125,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.000605368256255802,
- "loss": 0.0054,
+ "loss": 0.0053,
"macro_f1": 0.6666666865348816,
"num_tokens": 7886310.0,
"repeat_count": 0.0,
- "routers_loss": 0.0017953033093363047,
+ "routers_loss": 0.0017340193735435605,
"skip_count": 1.0,
"step": 4890,
"text_loss": 1.0128135681152344
@@ -46472,13 +46472,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.07373046875,
+ "grad_norm": 0.0712890625,
"learning_rate": 0.0006050656704121098,
- "loss": 0.0098,
+ "loss": 0.0096,
"macro_f1": 0.3333333432674408,
"num_tokens": 7889483.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018971457611769438,
+ "routers_loss": 0.0016647159354761243,
"skip_count": 0.0,
"step": 4892,
"text_loss": 0.2213262915611267
@@ -46491,13 +46491,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.03271484375,
"learning_rate": 0.0006047630443070547,
- "loss": 0.0063,
+ "loss": 0.0064,
"macro_f1": 0.6666666865348816,
"num_tokens": 7892615.0,
"repeat_count": 0.0,
- "routers_loss": 0.0040974365547299385,
+ "routers_loss": 0.0038971947506070137,
"skip_count": 3.0,
"step": 4894,
"text_loss": 0.45751357078552246
@@ -46510,13 +46510,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.034912109375,
+ "grad_norm": 0.0341796875,
"learning_rate": 0.0006044603780566032,
- "loss": 0.0053,
+ "loss": 0.0052,
"macro_f1": 1.0,
"num_tokens": 7895747.0,
"repeat_count": 1.0,
- "routers_loss": 0.0035370320547372103,
+ "routers_loss": 0.0036852145567536354,
"skip_count": 1.0,
"step": 4896,
"text_loss": 0.13489919900894165
@@ -46529,13 +46529,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.024658203125,
+ "grad_norm": 0.0235595703125,
"learning_rate": 0.0006041576717767379,
- "loss": 0.0058,
+ "loss": 0.0057,
"macro_f1": 0.6666666865348816,
"num_tokens": 7899155.0,
"repeat_count": 0.0,
- "routers_loss": 0.008930077776312828,
+ "routers_loss": 0.007661987561732531,
"skip_count": 1.0,
"step": 4898,
"text_loss": 0.281853586435318
@@ -46548,13 +46548,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.029541015625,
+ "grad_norm": 0.03125,
"learning_rate": 0.0006038549255834563,
"loss": 0.007,
"macro_f1": 1.0,
"num_tokens": 7901667.0,
"repeat_count": 2.0,
- "routers_loss": 0.014533254317939281,
+ "routers_loss": 0.01836695335805416,
"skip_count": 5.0,
"step": 4900,
"text_loss": 0.24879895150661469
@@ -46567,13 +46567,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0262451171875,
+ "grad_norm": 0.02880859375,
"learning_rate": 0.000603552139592771,
- "loss": 0.0058,
+ "loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 7904506.0,
"repeat_count": 0.0,
- "routers_loss": 0.0012445948086678982,
+ "routers_loss": 0.0011829182039946318,
"skip_count": 0.0,
"step": 4902,
"text_loss": 0.7550268769264221
@@ -46586,13 +46586,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.037109375,
+ "grad_norm": 0.03662109375,
"learning_rate": 0.0006032493139207106,
"loss": 0.0049,
"macro_f1": 0.6666666865348816,
"num_tokens": 7907316.0,
"repeat_count": 1.0,
- "routers_loss": 0.0022323536686599255,
+ "routers_loss": 0.0022891140542924404,
"skip_count": 0.0,
"step": 4904,
"text_loss": 0.37596020102500916
@@ -46605,13 +46605,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0286865234375,
+ "grad_norm": 0.0289306640625,
"learning_rate": 0.0006029464486833186,
"loss": 0.007,
"macro_f1": 0.3333333432674408,
"num_tokens": 7911283.0,
"repeat_count": 0.0,
- "routers_loss": 0.00221167947165668,
+ "routers_loss": 0.001990227960050106,
"skip_count": 0.0,
"step": 4906,
"text_loss": 0.5879577994346619
@@ -46624,13 +46624,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0220947265625,
+ "grad_norm": 0.0211181640625,
"learning_rate": 0.0006026435439966531,
"loss": 0.0042,
"macro_f1": 0.6666666865348816,
"num_tokens": 7913907.0,
"repeat_count": 0.0,
- "routers_loss": 0.0025787551421672106,
+ "routers_loss": 0.0026039890944957733,
"skip_count": 1.0,
"step": 4908,
"text_loss": 0.41484713554382324
@@ -46643,13 +46643,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.027099609375,
+ "grad_norm": 0.0272216796875,
"learning_rate": 0.0006023405999767879,
"loss": 0.0059,
"macro_f1": 0.6666666865348816,
"num_tokens": 7916772.0,
"repeat_count": 0.0,
- "routers_loss": 0.00866663083434105,
+ "routers_loss": 0.009183229878544807,
"skip_count": 1.0,
"step": 4910,
"text_loss": 0.20732562243938446
@@ -46662,13 +46662,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0274658203125,
+ "grad_norm": 0.0302734375,
"learning_rate": 0.0006020376167398116,
"loss": 0.0054,
"macro_f1": 0.6666666865348816,
"num_tokens": 7919346.0,
"repeat_count": 0.0,
- "routers_loss": 0.005565170664340258,
+ "routers_loss": 0.005508727394044399,
"skip_count": 1.0,
"step": 4912,
"text_loss": 0.41416165232658386
@@ -46681,13 +46681,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03271484375,
+ "grad_norm": 0.033203125,
"learning_rate": 0.0006017345944018284,
- "loss": 0.0052,
+ "loss": 0.0051,
"macro_f1": 0.3272727429866791,
"num_tokens": 7922404.0,
"repeat_count": 0.0,
- "routers_loss": 0.008860527537763119,
+ "routers_loss": 0.008651934564113617,
"skip_count": 0.0,
"step": 4914,
"text_loss": 0.4290519952774048
@@ -46700,13 +46700,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.034423828125,
+ "grad_norm": 0.0299072265625,
"learning_rate": 0.0006014315330789563,
- "loss": 0.0078,
+ "loss": 0.0077,
"macro_f1": 0.6666666865348816,
"num_tokens": 7925165.0,
"repeat_count": 0.0,
- "routers_loss": 0.003383385483175516,
+ "routers_loss": 0.003601635340601206,
"skip_count": 1.0,
"step": 4916,
"text_loss": 0.8447931408882141
@@ -46719,13 +46719,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0341796875,
+ "grad_norm": 0.034912109375,
"learning_rate": 0.0006011284328873296,
- "loss": 0.004,
+ "loss": 0.0041,
"macro_f1": 1.0,
"num_tokens": 7928146.0,
"repeat_count": 1.0,
- "routers_loss": 0.0047926693223416805,
+ "routers_loss": 0.0049415635876357555,
"skip_count": 2.0,
"step": 4918,
"text_loss": 0.32237401604652405
@@ -46738,13 +46738,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.027099609375,
+ "grad_norm": 0.0291748046875,
"learning_rate": 0.0006008252939430967,
"loss": 0.0045,
"macro_f1": 0.3333333432674408,
"num_tokens": 7931163.0,
"repeat_count": 0.0,
- "routers_loss": 0.002505079610273242,
+ "routers_loss": 0.0024150956887751818,
"skip_count": 0.0,
"step": 4920,
"text_loss": 0.2251713126897812
@@ -46757,13 +46757,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03125,
+ "grad_norm": 0.04150390625,
"learning_rate": 0.0006005221163624209,
- "loss": 0.0056,
+ "loss": 0.0057,
"macro_f1": 0.3272727429866791,
"num_tokens": 7934084.0,
"repeat_count": 1.0,
- "routers_loss": 0.0305723175406456,
+ "routers_loss": 0.03181030973792076,
"skip_count": 0.0,
"step": 4922,
"text_loss": 0.4962928593158722
@@ -46776,13 +46776,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.046630859375,
+ "grad_norm": 0.054931640625,
"learning_rate": 0.0006002189002614806,
- "loss": 0.0091,
+ "loss": 0.0089,
"macro_f1": 0.6666666865348816,
"num_tokens": 7937021.0,
"repeat_count": 0.0,
- "routers_loss": 0.002330876188352704,
+ "routers_loss": 0.00227518193423748,
"skip_count": 2.0,
"step": 4924,
"text_loss": 0.34440335631370544
@@ -46795,13 +46795,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0283203125,
+ "grad_norm": 0.0277099609375,
"learning_rate": 0.0005999156457564685,
- "loss": 0.0064,
+ "loss": 0.0065,
"macro_f1": 0.6666666865348816,
"num_tokens": 7940205.0,
"repeat_count": 0.0,
- "routers_loss": 0.0043139951303601265,
+ "routers_loss": 0.004331593867391348,
"skip_count": 1.0,
"step": 4926,
"text_loss": 0.14114083349704742
@@ -46814,13 +46814,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0296630859375,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0005996123529635925,
- "loss": 0.0066,
+ "loss": 0.0067,
"macro_f1": 0.3333333432674408,
"num_tokens": 7945174.0,
"repeat_count": 0.0,
- "routers_loss": 0.0005922197597101331,
+ "routers_loss": 0.000612895586527884,
"skip_count": 0.0,
"step": 4928,
"text_loss": 0.3895469009876251
@@ -46833,13 +46833,13 @@
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03369140625,
+ "grad_norm": 0.036376953125,
"learning_rate": 0.000599309021999075,
- "loss": 0.0061,
+ "loss": 0.006,
"macro_f1": 0.3272727429866791,
"num_tokens": 7948716.0,
"repeat_count": 0.0,
- "routers_loss": 0.022591346874833107,
+ "routers_loss": 0.02319233864545822,
"skip_count": 1.0,
"step": 4930,
"text_loss": 0.38103172183036804
@@ -46852,13 +46852,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0230712890625,
+ "grad_norm": 0.0247802734375,
"learning_rate": 0.0005990056529791528,
"loss": 0.0056,
"macro_f1": 0.3333333432674408,
"num_tokens": 7952497.0,
"repeat_count": 0.0,
- "routers_loss": 0.003156521590426564,
+ "routers_loss": 0.003423231653869152,
"skip_count": 0.0,
"step": 4932,
"text_loss": 0.30447322130203247
@@ -46871,13 +46871,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.017333984375,
+ "grad_norm": 0.017822265625,
"learning_rate": 0.0005987022460200778,
- "loss": 0.0061,
+ "loss": 0.006,
"macro_f1": 0.3333333432674408,
"num_tokens": 7955578.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006762169650755823,
+ "routers_loss": 0.0007005351362749934,
"skip_count": 0.0,
"step": 4934,
"text_loss": 0.49621838331222534
@@ -46890,13 +46890,13 @@
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.024169921875,
+ "grad_norm": 0.0234375,
"learning_rate": 0.0005983988012381159,
- "loss": 0.006,
+ "loss": 0.0061,
"macro_f1": 0.8823530077934265,
"num_tokens": 7958741.0,
"repeat_count": 2.0,
- "routers_loss": 0.03957916796207428,
+ "routers_loss": 0.03962617367506027,
"skip_count": 1.0,
"step": 4936,
"text_loss": 0.1920493096113205
@@ -46909,13 +46909,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0211181640625,
+ "grad_norm": 0.022216796875,
"learning_rate": 0.0005980953187495476,
- "loss": 0.0074,
+ "loss": 0.0072,
"macro_f1": 0.6666666865348816,
"num_tokens": 7962236.0,
"repeat_count": 0.0,
- "routers_loss": 0.0026140862610191107,
+ "routers_loss": 0.0026006060652434826,
"skip_count": 3.0,
"step": 4938,
"text_loss": 0.5286803841590881
@@ -46928,13 +46928,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0218505859375,
+ "grad_norm": 0.0224609375,
"learning_rate": 0.0005977917986706681,
"loss": 0.0063,
"macro_f1": 0.3333333432674408,
"num_tokens": 7965631.0,
"repeat_count": 0.0,
- "routers_loss": 0.004825619049370289,
+ "routers_loss": 0.005010952707380056,
"skip_count": 0.0,
"step": 4940,
"text_loss": 0.3507745563983917
@@ -46947,13 +46947,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0269775390625,
+ "grad_norm": 0.0291748046875,
"learning_rate": 0.0005974882411177871,
- "loss": 0.0054,
+ "loss": 0.0053,
"macro_f1": 0.3333333432674408,
"num_tokens": 7968516.0,
"repeat_count": 0.0,
- "routers_loss": 0.002404073951765895,
+ "routers_loss": 0.0023964287247508764,
"skip_count": 0.0,
"step": 4942,
"text_loss": 0.9110504388809204
@@ -46966,13 +46966,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03173828125,
+ "grad_norm": 0.0322265625,
"learning_rate": 0.000597184646207228,
- "loss": 0.0065,
+ "loss": 0.0063,
"macro_f1": 0.6666666865348816,
"num_tokens": 7971310.0,
"repeat_count": 0.0,
- "routers_loss": 0.0035465885885059834,
+ "routers_loss": 0.0026230409275740385,
"skip_count": 1.0,
"step": 4944,
"text_loss": 0.4131232798099518
@@ -46985,13 +46985,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.03857421875,
+ "grad_norm": 0.0390625,
"learning_rate": 0.0005968810140553292,
- "loss": 0.0105,
+ "loss": 0.0102,
"macro_f1": 0.3333333432674408,
"num_tokens": 7974809.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006932367105036974,
+ "routers_loss": 0.0007397596491500735,
"skip_count": 0.0,
"step": 4946,
"text_loss": 0.5130466222763062
@@ -47004,13 +47004,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0257568359375,
+ "grad_norm": 0.0267333984375,
"learning_rate": 0.0005965773447784431,
"loss": 0.0048,
"macro_f1": 0.3333333432674408,
"num_tokens": 7977800.0,
"repeat_count": 0.0,
- "routers_loss": 0.0009562313207425177,
+ "routers_loss": 0.0009955473942682147,
"skip_count": 0.0,
"step": 4948,
"text_loss": 0.5366153717041016
@@ -47023,13 +47023,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.01318359375,
+ "grad_norm": 0.01373291015625,
"learning_rate": 0.0005962736384929362,
"loss": 0.0026,
"macro_f1": 0.3333333432674408,
"num_tokens": 7981027.0,
"repeat_count": 0.0,
- "routers_loss": 0.004678000696003437,
+ "routers_loss": 0.0049227322451770306,
"skip_count": 0.0,
"step": 4950,
"text_loss": 0.17266370356082916
@@ -47042,13 +47042,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0546875,
+ "grad_norm": 0.06201171875,
"learning_rate": 0.0005959698953151895,
"loss": 0.005,
"macro_f1": 0.3333333432674408,
"num_tokens": 7983580.0,
"repeat_count": 0.0,
- "routers_loss": 0.0010157873621210456,
+ "routers_loss": 0.0009975163266062737,
"skip_count": 0.0,
"step": 4952,
"text_loss": 0.2474549114704132
@@ -47061,13 +47061,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0252685546875,
+ "grad_norm": 0.0255126953125,
"learning_rate": 0.0005956661153615979,
"loss": 0.0053,
"macro_f1": 0.3333333432674408,
"num_tokens": 7986711.0,
"repeat_count": 0.0,
- "routers_loss": 0.0006747227744199336,
+ "routers_loss": 0.0006475782720372081,
"skip_count": 0.0,
"step": 4954,
"text_loss": 0.5748327970504761
@@ -47080,13 +47080,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0240478515625,
+ "grad_norm": 0.02294921875,
"learning_rate": 0.0005953622987485703,
- "loss": 0.0067,
+ "loss": 0.0063,
"macro_f1": 0.3333333432674408,
"num_tokens": 7990194.0,
"repeat_count": 0.0,
- "routers_loss": 0.0014360204804688692,
+ "routers_loss": 0.001449751085601747,
"skip_count": 0.0,
"step": 4956,
"text_loss": 0.5163559317588806
@@ -47099,13 +47099,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0322265625,
+ "grad_norm": 0.0400390625,
"learning_rate": 0.0005950584455925301,
- "loss": 0.0045,
+ "loss": 0.0043,
"macro_f1": 0.3333333432674408,
"num_tokens": 7993050.0,
"repeat_count": 0.0,
- "routers_loss": 0.001549707492813468,
+ "routers_loss": 0.0017087773885577917,
"skip_count": 0.0,
"step": 4958,
"text_loss": 0.15892620384693146
@@ -47118,13 +47118,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0283203125,
+ "grad_norm": 0.0289306640625,
"learning_rate": 0.0005947545560099142,
"loss": 0.0061,
"macro_f1": 0.3333333432674408,
"num_tokens": 7996383.0,
"repeat_count": 0.0,
- "routers_loss": 0.0047186254523694515,
+ "routers_loss": 0.0044417232275009155,
"skip_count": 0.0,
"step": 4960,
"text_loss": 0.48022928833961487
@@ -47137,13 +47137,13 @@
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
- "grad_norm": 0.033935546875,
+ "grad_norm": 0.031982421875,
"learning_rate": 0.0005944506301171734,
- "loss": 0.0068,
+ "loss": 0.0066,
"macro_f1": 0.5492662787437439,
"num_tokens": 7999843.0,
"repeat_count": 0.0,
- "routers_loss": 0.010887560434639454,
+ "routers_loss": 0.010093312710523605,
"skip_count": 2.0,
"step": 4962,
"text_loss": 0.5050316452980042
@@ -47156,13 +47156,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.029541015625,
+ "grad_norm": 0.03369140625,
"learning_rate": 0.0005941466680307732,
- "loss": 0.0051,
+ "loss": 0.005,
"macro_f1": 0.3333333432674408,
"num_tokens": 8003504.0,
"repeat_count": 0.0,
- "routers_loss": 0.009678485803306103,
+ "routers_loss": 0.009699694812297821,
"skip_count": 0.0,
"step": 4964,
"text_loss": 0.30474427342414856
@@ -47175,13 +47175,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
- "grad_norm": 0.038330078125,
+ "grad_norm": 0.040771484375,
"learning_rate": 0.0005938426698671922,
- "loss": 0.0099,
+ "loss": 0.0097,
"macro_f1": 0.6666666865348816,
"num_tokens": 8007427.0,
"repeat_count": 1.0,
- "routers_loss": 0.0018421853892505169,
+ "routers_loss": 0.0016759657301008701,
"skip_count": 0.0,
"step": 4966,
"text_loss": 0.25060293078422546
@@ -47194,13 +47194,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.03759765625,
+ "grad_norm": 0.04443359375,
"learning_rate": 0.0005935386357429232,
- "loss": 0.0069,
+ "loss": 0.0067,
"macro_f1": 1.0,
"num_tokens": 8010265.0,
"repeat_count": 2.0,
- "routers_loss": 0.006872798316180706,
+ "routers_loss": 0.006916914135217667,
"skip_count": 3.0,
"step": 4968,
"text_loss": 0.49084481596946716
@@ -47213,13 +47213,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0245361328125,
+ "grad_norm": 0.024658203125,
"learning_rate": 0.0005932345657744723,
- "loss": 0.0051,
+ "loss": 0.0052,
"macro_f1": 1.0,
"num_tokens": 8013733.0,
"repeat_count": 1.0,
- "routers_loss": 0.017219332978129387,
+ "routers_loss": 0.017182426527142525,
"skip_count": 5.0,
"step": 4970,
"text_loss": 0.2705717980861664
@@ -47232,13 +47232,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.026611328125,
+ "grad_norm": 0.0272216796875,
"learning_rate": 0.00059293046007836,
- "loss": 0.0064,
+ "loss": 0.0062,
"macro_f1": 0.6666666865348816,
"num_tokens": 8017068.0,
"repeat_count": 0.0,
- "routers_loss": 0.008250568993389606,
+ "routers_loss": 0.008485594764351845,
"skip_count": 2.0,
"step": 4972,
"text_loss": 0.18570218980312347
@@ -47251,13 +47251,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.03662109375,
+ "grad_norm": 0.03515625,
"learning_rate": 0.0005926263187711201,
- "loss": 0.008,
+ "loss": 0.0078,
"macro_f1": 0.6666666865348816,
"num_tokens": 8020185.0,
"repeat_count": 0.0,
- "routers_loss": 0.0022686906158924103,
+ "routers_loss": 0.0021750847809016705,
"skip_count": 2.0,
"step": 4974,
"text_loss": 0.4457069933414459
@@ -47270,13 +47270,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0306396484375,
+ "grad_norm": 0.031005859375,
"learning_rate": 0.0005923221419693001,
- "loss": 0.0087,
+ "loss": 0.0086,
"macro_f1": 0.3333333432674408,
"num_tokens": 8023038.0,
"repeat_count": 0.0,
- "routers_loss": 0.00217001186683774,
+ "routers_loss": 0.0020193420350551605,
"skip_count": 0.0,
"step": 4976,
"text_loss": 0.7394505143165588
@@ -47289,13 +47289,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.0537109375,
+ "grad_norm": 0.054931640625,
"learning_rate": 0.0005920179297894613,
- "loss": 0.0067,
+ "loss": 0.0064,
"macro_f1": 0.6666666865348816,
"num_tokens": 8026236.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015752838226035237,
+ "routers_loss": 0.001450369250960648,
"skip_count": 1.0,
"step": 4978,
"text_loss": 0.5914503335952759
@@ -47308,13 +47308,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0361328125,
+ "grad_norm": 0.0380859375,
"learning_rate": 0.000591713682348178,
- "loss": 0.0053,
+ "loss": 0.0052,
"macro_f1": 0.3333333432674408,
"num_tokens": 8028765.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018897822592407465,
+ "routers_loss": 0.0017808573320508003,
"skip_count": 0.0,
"step": 4980,
"text_loss": 0.19231407344341278
@@ -47327,13 +47327,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02587890625,
+ "grad_norm": 0.03173828125,
"learning_rate": 0.0005914093997620388,
- "loss": 0.0049,
+ "loss": 0.0051,
"macro_f1": 0.3333333432674408,
"num_tokens": 8032043.0,
"repeat_count": 0.0,
- "routers_loss": 0.0018230826826766133,
+ "routers_loss": 0.0018225493840873241,
"skip_count": 0.0,
"step": 4982,
"text_loss": 0.3567875325679779
@@ -47346,13 +47346,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.020751953125,
+ "grad_norm": 0.02197265625,
"learning_rate": 0.0005911050821476449,
"loss": 0.005,
"macro_f1": 0.3333333432674408,
"num_tokens": 8035086.0,
"repeat_count": 0.0,
- "routers_loss": 0.001746289781294763,
+ "routers_loss": 0.0016285666497424245,
"skip_count": 0.0,
"step": 4984,
"text_loss": 0.34609633684158325
@@ -47365,13 +47365,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0213623046875,
+ "grad_norm": 0.0220947265625,
"learning_rate": 0.0005908007296216119,
"loss": 0.0049,
"macro_f1": 0.3333333432674408,
"num_tokens": 8038193.0,
"repeat_count": 0.0,
- "routers_loss": 0.001723633729852736,
+ "routers_loss": 0.0014699801104143262,
"skip_count": 0.0,
"step": 4986,
"text_loss": 0.4492359757423401
@@ -47384,13 +47384,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0250244140625,
+ "grad_norm": 0.0245361328125,
"learning_rate": 0.000590496342300568,
"loss": 0.0069,
"macro_f1": 0.3333333432674408,
"num_tokens": 8041099.0,
"repeat_count": 0.0,
- "routers_loss": 0.002329434733837843,
+ "routers_loss": 0.002442725468426943,
"skip_count": 0.0,
"step": 4988,
"text_loss": 0.5162975788116455
@@ -47403,13 +47403,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.031982421875,
+ "grad_norm": 0.03125,
"learning_rate": 0.0005901919203011548,
"loss": 0.0052,
"macro_f1": 0.6666666865348816,
"num_tokens": 8044350.0,
"repeat_count": 0.0,
- "routers_loss": 0.00884273648262024,
+ "routers_loss": 0.008624207228422165,
"skip_count": 2.0,
"step": 4990,
"text_loss": 0.2533033490180969
@@ -47422,13 +47422,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.02001953125,
+ "grad_norm": 0.021728515625,
"learning_rate": 0.0005898874637400279,
- "loss": 0.0041,
+ "loss": 0.0042,
"macro_f1": 0.3333333432674408,
"num_tokens": 8047467.0,
"repeat_count": 0.0,
- "routers_loss": 0.0015820686239749193,
+ "routers_loss": 0.0015421364223584533,
"skip_count": 0.0,
"step": 4992,
"text_loss": 0.4890289306640625
@@ -47441,13 +47441,13 @@
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
- "grad_norm": 0.0291748046875,
+ "grad_norm": 0.0279541015625,
"learning_rate": 0.0005895829727338552,
- "loss": 0.0066,
+ "loss": 0.0065,
"macro_f1": 1.0,
"num_tokens": 8050626.0,
"repeat_count": 1.0,
- "routers_loss": 0.0024422749411314726,
+ "routers_loss": 0.0024516626726835966,
"skip_count": 2.0,
"step": 4994,
"text_loss": 0.50797039270401
@@ -47460,13 +47460,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
- "grad_norm": 0.029541015625,
+ "grad_norm": 0.0262451171875,
"learning_rate": 0.0005892784473993184,
- "loss": 0.0073,
+ "loss": 0.0071,
"macro_f1": 0.6666666865348816,
"num_tokens": 8053386.0,
"repeat_count": 0.0,
- "routers_loss": 0.0019333140226081014,
+ "routers_loss": 0.0018553845584392548,
"skip_count": 2.0,
"step": 4996,
"text_loss": 0.628828763961792
@@ -47479,13 +47479,13 @@
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
- "grad_norm": 0.0244140625,
+ "grad_norm": 0.0286865234375,
"learning_rate": 0.000588973887853112,
- "loss": 0.0052,
+ "loss": 0.005,
"macro_f1": 0.3333333432674408,
"num_tokens": 8055941.0,
"repeat_count": 0.0,
- "routers_loss": 0.004120452329516411,
+ "routers_loss": 0.004258487373590469,
"skip_count": 0.0,
"step": 4998,
"text_loss": 0.2643229067325592
@@ -47498,13 +47498,13 @@
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
- "grad_norm": 0.028564453125,
+ "grad_norm": 0.02783203125,
"learning_rate": 0.0005886692942119441,
- "loss": 0.0064,
+ "loss": 0.0062,
"macro_f1": 0.8820862174034119,
"num_tokens": 8058638.0,
"repeat_count": 2.0,
- "routers_loss": 0.018097922205924988,
+ "routers_loss": 0.019064312800765038,
"skip_count": 2.0,
"step": 5000,
"text_loss": 0.4925006031990051
diff --git a/checkpoint-5000/training_args.bin b/checkpoint-5000/training_args.bin
index deeea733277b4031781a5b299881dd8e675e7606..a3d3ae372faf14539639f54454aa52b6ee730c4a 100644
--- a/checkpoint-5000/training_args.bin
+++ b/checkpoint-5000/training_args.bin
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:0b3f5975f57762b552c7ee29776bf32a4dbb125781a0658488d3884fb25c5296
+oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8
size 5880
diff --git a/checkpoint-6000/chat_template.jinja b/checkpoint-6000/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0
--- /dev/null
+++ b/checkpoint-6000/chat_template.jinja
@@ -0,0 +1,93 @@
+{{- bos_token }}
+{%- if custom_tools is defined %}
+ {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+ {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+ {%- if strftime_now is defined %}
+ {%- set date_string = strftime_now("%d %b %Y") %}
+ {%- else %}
+ {%- set date_string = "26 Jul 2024" %}
+ {%- endif %}
+{%- endif %}
+{%- if not tools is defined %}
+ {%- set tools = none %}
+{%- endif %}
+
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+ {%- set system_message = messages[0]['content']|trim %}
+ {%- set messages = messages[1:] %}
+{%- else %}
+ {%- set system_message = "" %}
+{%- endif %}
+
+{#- System message #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if tools is not none %}
+ {{- "Environment: ipython\n" }}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+ {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+ {{- "Do not use variables.\n\n" }}
+ {%- for t in tools %}
+ {{- t | tojson(indent=4) }}
+ {{- "\n\n" }}
+ {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+ {#- Extract the first user message so we can plug it in here #}
+ {%- if messages | length != 0 %}
+ {%- set first_user_message = messages[0]['content']|trim %}
+ {%- set messages = messages[1:] %}
+ {%- else %}
+ {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+ {{- "Given the following functions, please respond with a JSON for a function call " }}
+ {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+ {{- "Do not use variables.\n\n" }}
+ {%- for t in tools %}
+ {{- t | tojson(indent=4) }}
+ {{- "\n\n" }}
+ {%- endfor %}
+ {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+
+{%- for message in messages %}
+ {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+ {%- elif 'tool_calls' in message %}
+ {%- if not message.tool_calls|length == 1 %}
+ {{- raise_exception("This model only supports single tool-calls at once!") }}
+ {%- endif %}
+ {%- set tool_call = message.tool_calls[0].function %}
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+ {{- '{"name": "' + tool_call.name + '", ' }}
+ {{- '"parameters": ' }}
+ {{- tool_call.arguments | tojson }}
+ {{- "}" }}
+ {{- "<|eot_id|>" }}
+ {%- elif message.role == "tool" or message.role == "ipython" %}
+ {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+ {%- if message.content is mapping or message.content is iterable %}
+ {{- message.content | tojson }}
+ {%- else %}
+ {{- message.content }}
+ {%- endif %}
+ {{- "<|eot_id|>" }}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}
diff --git a/checkpoint-6000/model-00001-of-00002.safetensors b/checkpoint-6000/model-00001-of-00002.safetensors
index 1d33a92b59b4c6310c176588b68e5c1a23416ebc..08a01e1ba553cdcb2222f034a209861d7b54e284 100644
--- a/checkpoint-6000/model-00001-of-00002.safetensors
+++ b/checkpoint-6000/model-00001-of-00002.safetensors
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:5ddf8aeede6bdf2e78f716cd051b152f13d75d6629866637f946e8c7b6fd5dfe
-size 2101248
+oid sha256:13cbd6d16e927a0c5bad54102514e6e18b4a47b3a6eb911e39d678d328d19f55
+size 4965799096
diff --git a/checkpoint-6000/model-00002-of-00002.safetensors b/checkpoint-6000/model-00002-of-00002.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7d4b08d62e5a1c45cf9fc4ce734bc52d2f508a3f
--- /dev/null
+++ b/checkpoint-6000/model-00002-of-00002.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89f427de602cf2d49899f4d59a40675907bebc713187d70c900bc708c907b434
+size 1481790520
diff --git a/checkpoint-6000/model.safetensors.index.json b/checkpoint-6000/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..21bb567761d75ade0c0eef6495c450697dd3ff18
--- /dev/null
+++ b/checkpoint-6000/model.safetensors.index.json
@@ -0,0 +1,374 @@
+{
+ "metadata": {
+ "total_parameters": 3223774292,
+ "total_size": 6447548584
+ },
+ "weight_map": {
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.norm.weight": "model-00002-of-00002.safetensors",
+ "model.routers.0.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.0.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.0.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.0.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.1.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.1.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.1.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.1.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.10.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.10.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.10.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.10.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.11.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.11.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.11.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.11.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.12.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.12.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.12.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.12.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.13.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.13.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.13.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.13.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.14.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.14.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.14.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.14.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.15.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.15.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.15.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.15.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.16.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.16.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.16.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.16.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.17.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.17.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.17.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.17.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.18.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.18.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.18.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.18.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.19.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.19.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.19.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.19.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.2.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.2.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.2.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.2.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.20.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.20.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.20.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.20.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.21.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.21.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.21.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.21.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.22.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.22.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.22.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.22.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.23.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.23.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.23.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.23.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.24.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.24.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.24.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.24.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.25.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.25.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.25.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.25.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.26.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.26.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.26.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.26.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.27.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.27.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.27.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.27.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.3.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.3.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.3.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.3.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.4.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.4.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.4.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.4.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.5.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.5.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.5.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.5.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.6.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.6.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.6.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.6.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.7.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.7.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.7.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.7.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.8.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.8.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.8.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.8.linear2.weight": "model-00002-of-00002.safetensors",
+ "model.routers.9.linear1.bias": "model-00002-of-00002.safetensors",
+ "model.routers.9.linear1.weight": "model-00002-of-00002.safetensors",
+ "model.routers.9.linear2.bias": "model-00002-of-00002.safetensors",
+ "model.routers.9.linear2.weight": "model-00002-of-00002.safetensors"
+ }
+}
diff --git a/checkpoint-6000/optimizer.pt b/checkpoint-6000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8e4ab382c8497388b7c1be77ced6bbc77cdecba0
--- /dev/null
+++ b/checkpoint-6000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:14927b2932f622295e2116b67605be95e37a7bb5d8c7ba9c3cc4cfe8a8904d9a
+size 44191162
diff --git a/checkpoint-6000/rng_state.pth b/checkpoint-6000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..39733b89e5c76956e6e2c15090922858a3da8be6
--- /dev/null
+++ b/checkpoint-6000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52d4940820055e4d79b82c43038fd1599197353607649f88dcebf26376772128
+size 14244
diff --git a/checkpoint-6000/scheduler.pt b/checkpoint-6000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cb293ba2b6427f12e737d50fc0ce36432853fafc
--- /dev/null
+++ b/checkpoint-6000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e30f9ec924312828673433a0b5a4fb7c915f61d146b7694043a9f729a7b67b18
+size 1064
diff --git a/checkpoint-6000/special_tokens_map.json b/checkpoint-6000/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..165b36bc2293dda9a2fb3c0daf6577d9eba9df7a
--- /dev/null
+++ b/checkpoint-6000/special_tokens_map.json
@@ -0,0 +1,17 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<|finetune_right_pad_id|>"
+}
diff --git a/checkpoint-6000/tokenizer.json b/checkpoint-6000/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-6000/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-6000/tokenizer_config.json b/checkpoint-6000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c68051fe3c4d23234a59316bc52d21f6e3a4182c
--- /dev/null
+++ b/checkpoint-6000/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<